From ee1e749a4132218672fcd4b8deb2b816f2c1c257 Mon Sep 17 00:00:00 2001 From: AlongWY Date: Tue, 28 Nov 2023 05:22:46 +0000 Subject: [PATCH] deploy: 72066be21ad467c8ffc76b74c152b38decf3f0ac --- .nojekyll | 0 cache.json | 1 + favicon.ico | Bin 0 -> 15086 bytes index.css | 355 + index.html | 72510 ++++++++++++++++++++++++++++++++++++++++++++++++++ index.js | 39 + 6 files changed, 72905 insertions(+) create mode 100644 .nojekyll create mode 100644 cache.json create mode 100644 favicon.ico create mode 100644 index.css create mode 100644 index.html create mode 100644 index.js diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..5b82c6f6 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-11-20T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.12023v1","updated":"2023-11-20T18:57:41Z","published":"2023-11-20T18:57:41Z","title":"LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient\n Language Model Finetuning","summary":" We propose a simple approach for memory-efficient adaptation of pretrained\nlanguage models. Our approach uses an iterative algorithm to decompose each\npretrained matrix into a high-precision low-rank component and a\nmemory-efficient quantized component. During finetuning, the quantized\ncomponent remains fixed and only the low-rank component is updated. We present\nan integer linear programming formulation of the quantization component which\nenables dynamic configuration of quantization parameters (e.g., bit-width,\nblock size) for each matrix given an overall target memory budget. We further\nexplore a data-aware version of the algorithm which uses an approximation of\nthe Fisher information matrix to weight the reconstruction objective during\nmatrix decomposition. Experiments on adapting RoBERTa and LLaMA-2 (7B and 70B)\ndemonstrate that our low-rank plus quantized matrix decomposition approach\n(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and moreover enables\nmore aggressive quantization. For example, on the OpenAssistant benchmark\nLQ-LoRA is able to learn a 2.5-bit LLaMA-2 model that is competitive with a\nmodel finetuned with 4-bit QLoRA. When finetuned on a language modeling\ncalibration dataset, LQ-LoRA can also be used for model compression; in this\nsetting our 2.75-bit LLaMA-2-70B model (which has 2.85 bits on average when\nincluding the low-rank components and requires 27GB of GPU memory) is\ncompetitive with the original model in full precision.\n","authors":["Han Guo","Philip Greengard","Eric P. Xing","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2311.12023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12022v1","updated":"2023-11-20T18:57:34Z","published":"2023-11-20T18:57:34Z","title":"GPQA: A Graduate-Level Google-Proof Q&A Benchmark","summary":" We present GPQA, a challenging dataset of 448 multiple-choice questions\nwritten by domain experts in biology, physics, and chemistry. We ensure that\nthe questions are high-quality and extremely difficult: experts who have or are\npursuing PhDs in the corresponding domains reach 65% accuracy (74% when\ndiscounting clear mistakes the experts identified in retrospect), while highly\nskilled non-expert validators only reach 34% accuracy, despite spending on\naverage over 30 minutes with unrestricted access to the web (i.e., the\nquestions are \"Google-proof\"). 
The questions are also difficult for\nstate-of-the-art AI systems, with our strongest GPT-4 based baseline achieving\n39% accuracy. If we are to use future AI systems to help us answer very hard\nquestions, for example, when developing new scientific knowledge, we need to\ndevelop scalable oversight methods that enable humans to supervise their\noutputs, which may be difficult even if the supervisors are themselves skilled\nand knowledgeable. The difficulty of GPQA both for skilled non-experts and\nfrontier AI systems should enable realistic scalable oversight experiments,\nwhich we hope can help devise ways for human experts to reliably get truthful\ninformation from AI systems that surpass human capabilities.\n","authors":["David Rein","Betty Li Hou","Asa Cooper Stickland","Jackson Petty","Richard Yuanzhe Pang","Julien Dirani","Julian Michael","Samuel R. Bowman"],"pdf_url":"https://arxiv.org/pdf/2311.12022v1.pdf","comment":"28 pages, 5 figures, 7 tables"},{"id":"http://arxiv.org/abs/2311.12015v1","updated":"2023-11-20T18:54:39Z","published":"2023-11-20T18:54:39Z","title":"GPT-4V(ision) for Robotics: Multimodal Task Planning from Human\n Demonstration","summary":" We introduce a pipeline that enhances a general-purpose Vision Language\nModel, GPT-4V(ision), by integrating observations of human actions to\nfacilitate robotic manipulation. This system analyzes videos of humans\nperforming tasks and creates executable robot programs that incorporate\naffordance insights. The computation starts by analyzing the videos with GPT-4V\nto convert environmental and action details into text, followed by a\nGPT-4-empowered task planner. In the following analyses, vision systems\nreanalyze the video with the task plan. Object names are grounded using an\nopen-vocabulary object detector, while focus on the hand-object relation helps\nto detect the moment of grasping and releasing. This spatiotemporal grounding\nallows the vision systems to further gather affordance data (e.g., grasp type,\nway points, and body postures). Experiments across various scenarios\ndemonstrate this method's efficacy in achieving real robots' operations from\nhuman demonstrations in a zero-shot manner. The prompts of GPT-4V/GPT-4 are\navailable at this project page:\nhttps://microsoft.github.io/GPT4Vision-Robot-Manipulation-Prompts/\n","authors":["Naoki Wake","Atsushi Kanehira","Kazuhiro Sasabuchi","Jun Takamatsu","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2311.12015v1.pdf","comment":"8 pages, 10 figures, 1 table. Last updated on November 20th, 2023"},{"id":"http://arxiv.org/abs/2310.15127v2","updated":"2023-11-20T18:51:29Z","published":"2023-10-23T17:31:55Z","title":"Open-Ended Instructable Embodied Agents with Memory-Augmented Large\n Language Models","summary":" Pre-trained and frozen large language models (LLMs) can effectively map\nsimple scene rearrangement instructions to programs over a robot's visuomotor\nfunctions through appropriate few-shot example prompting. To parse open-domain\nnatural language and adapt to a user's idiosyncratic procedures, not known\nduring prompt engineering time, fixed prompts fall short. In this paper, we\nintroduce HELPER, an embodied agent equipped with an external memory of\nlanguage-program pairs that parses free-form human-robot dialogue into action\nprograms through retrieval-augmented LLM prompting: relevant memories are\nretrieved based on the current dialogue, instruction, correction, or VLM\ndescription, and used as in-context prompt examples for LLM querying. 
The\nmemory is expanded during deployment to include pairs of user's language and\naction plans, to assist future inferences and personalize them to the user's\nlanguage and routines. HELPER sets a new state-of-the-art in the TEACh\nbenchmark in both Execution from Dialog History (EDH) and Trajectory from\nDialogue (TfD), with a 1.7x improvement over the previous state-of-the-art for\nTfD. Our models, code, and video results can be found in our project's website:\nhttps://helper-agent-llm.github.io.\n","authors":["Gabriel Sarch","Yue Wu","Michael J. Tarr","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2310.15127v2.pdf","comment":"Project page with code & videos: https://helper-agent-llm.github.io"},{"id":"http://arxiv.org/abs/2311.11981v1","updated":"2023-11-20T18:16:27Z","published":"2023-11-20T18:16:27Z","title":"H-COAL: Human Correction of AI-Generated Labels for Biomedical Named\n Entity Recognition","summary":" With the rapid advancement of machine learning models for NLP tasks,\ncollecting high-fidelity labels from AI models is a realistic possibility.\nFirms now make AI available to customers via predictions as a service (PaaS).\nThis includes PaaS products for healthcare. It is unclear whether these labels\ncan be used for training a local model without expensive annotation checking by\nin-house experts. In this work, we propose a new framework for Human Correction\nof AI-Generated Labels (H-COAL). By ranking AI-generated outputs, one can\nselectively correct labels and approach gold standard performance (100% human\nlabeling) with significantly less human effort. We show that correcting 5% of\nlabels can close the AI-human performance gap by up to 64% relative\nimprovement, and correcting 20% of labels can close the performance gap by up\nto 86% relative improvement.\n","authors":["Xiaojing Duan","John P. Lalor"],"pdf_url":"https://arxiv.org/pdf/2311.11981v1.pdf","comment":"Presented at Conference on Information Systems and Technology (CIST)\n 2023"},{"id":"http://arxiv.org/abs/2311.11979v1","updated":"2023-11-20T18:12:28Z","published":"2023-11-20T18:12:28Z","title":"On the Potential and Limitations of Few-Shot In-Context Learning to\n Generate Metamorphic Specifications for Tax Preparation Software","summary":" Due to the ever-increasing complexity of income tax laws in the United\nStates, the number of US taxpayers filing their taxes using tax preparation\nsoftware (henceforth, tax software) continues to increase. According to the\nU.S. Internal Revenue Service (IRS), in FY22, nearly 50% of taxpayers filed\ntheir individual income taxes using tax software. Given the legal consequences\nof incorrectly filing taxes for the taxpayer, ensuring the correctness of tax\nsoftware is of paramount importance. Metamorphic testing has emerged as a\nleading solution to test and debug legal-critical tax software due to the\nabsence of correctness requirements and trustworthy datasets. The key idea\nbehind metamorphic testing is to express the properties of a system in terms of\nthe relationship between one input and its slightly metamorphosed twinned\ninput. Extracting metamorphic properties from IRS tax publications is a tedious\nand time-consuming process. As a response, this paper formulates the task of\ngenerating metamorphic specifications as a translation task between properties\nextracted from tax documents - expressed in natural language - to a contrastive\nfirst-order logic form. 
We perform a systematic analysis on the potential and\nlimitations of in-context learning with Large Language Models(LLMs) for this\ntask, and outline a research agenda towards automating the generation of\nmetamorphic specifications for tax preparation software.\n","authors":["Dananjay Srinivas","Rohan Das","Saeid Tizpaz-Niari","Ashutosh Trivedi","Maria Leonor Pacheco"],"pdf_url":"https://arxiv.org/pdf/2311.11979v1.pdf","comment":"Accepted to the Proceedings of the Natural Legal Language Processing\n Workshop, EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.11976v1","updated":"2023-11-20T18:06:03Z","published":"2023-11-20T18:06:03Z","title":"Context-aware Neural Machine Translation for English-Japanese Business\n Scene Dialogues","summary":" Despite the remarkable advancements in machine translation, the current\nsentence-level paradigm faces challenges when dealing with highly-contextual\nlanguages like Japanese. In this paper, we explore how context-awareness can\nimprove the performance of the current Neural Machine Translation (NMT) models\nfor English-Japanese business dialogues translation, and what kind of context\nprovides meaningful information to improve translation. As business dialogue\ninvolves complex discourse phenomena but offers scarce training resources, we\nadapted a pretrained mBART model, finetuning on multi-sentence dialogue data,\nwhich allows us to experiment with different contexts. We investigate the\nimpact of larger context sizes and propose novel context tokens encoding\nextra-sentential information, such as speaker turn and scene type. We make use\nof Conditional Cross-Mutual Information (CXMI) to explore how much of the\ncontext the model uses and generalise CXMI to study the impact of the\nextra-sentential context. Overall, we find that models leverage both preceding\nsentences and extra-sentential context (with CXMI increasing with context size)\nand we provide a more focused analysis on honorifics translation. Regarding\ntranslation quality, increased source-side context paired with scene and\nspeaker information improves the model performance compared to previous work\nand our context-agnostic baselines, measured in BLEU and COMET metrics.\n","authors":["Sumire Honda","Patrick Fernandes","Chrysoula Zerva"],"pdf_url":"https://arxiv.org/pdf/2311.11976v1.pdf","comment":"MT Summit 2023, research track, link to paper in proceedings:\n https://aclanthology.org/2023.mtsummit-research.23/"},{"id":"http://arxiv.org/abs/2311.11973v1","updated":"2023-11-20T18:01:29Z","published":"2023-11-20T18:01:29Z","title":"Adaptive Training Distributions with Scalable Online Bilevel\n Optimization","summary":" Large neural networks pretrained on web-scale corpora are central to modern\nmachine learning. In this paradigm, the distribution of the large,\nheterogeneous pretraining data rarely matches that of the application domain.\nThis work considers modifying the pretraining distribution in the case where\none has a small sample of data reflecting the targeted test conditions. We\npropose an algorithm motivated by a recent formulation of this setting as an\nonline, bilevel optimization problem. With scalability in mind, our algorithm\nprioritizes computing gradients at training points which are likely to most\nimprove the loss on the targeted distribution. Empirically, we show that in\nsome cases this approach is beneficial over existing strategies from the domain\nadaptation literature but may not succeed in other cases. 
We propose a simple\ntest to evaluate when our approach can be expected to work well and point\ntowards further research to address current limitations.\n","authors":["David Grangier","Pierre Ablin","Awni Hannun"],"pdf_url":"https://arxiv.org/pdf/2311.11973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11967v1","updated":"2023-11-20T17:47:37Z","published":"2023-11-20T17:47:37Z","title":"Automatic Analysis of Substantiation in Scientific Peer Reviews","summary":" With the increasing amount of problematic peer reviews in top AI conferences,\nthe community is urgently in need of automatic quality control measures. In\nthis paper, we restrict our attention to substantiation -- one popular quality\naspect indicating whether the claims in a review are sufficiently supported by\nevidence -- and provide a solution automatizing this evaluation process. To\nachieve this goal, we first formulate the problem as claim-evidence pair\nextraction in scientific peer reviews, and collect SubstanReview, the first\nannotated dataset for this task. SubstanReview consists of 550 reviews from NLP\nconferences annotated by domain experts. On the basis of this dataset, we train\nan argument mining system to automatically analyze the level of substantiation\nin peer reviews. We also perform data analysis on the SubstanReview dataset to\nobtain meaningful insights on peer reviewing quality in NLP conferences over\nrecent years.\n","authors":["Yanzhu Guo","Guokan Shang","Virgile Rennard","Michalis Vazirgiannis","Chloé Clavel"],"pdf_url":"https://arxiv.org/pdf/2311.11967v1.pdf","comment":"Accepted to EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.11944v1","updated":"2023-11-20T17:28:02Z","published":"2023-11-20T17:28:02Z","title":"FinanceBench: A New Benchmark for Financial Question Answering","summary":" FinanceBench is a first-of-its-kind test suite for evaluating the performance\nof LLMs on open book financial question answering (QA). It comprises 10,231\nquestions about publicly traded companies, with corresponding answers and\nevidence strings. The questions in FinanceBench are ecologically valid and\ncover a diverse set of scenarios. They are intended to be clear-cut and\nstraightforward to answer to serve as a minimum performance standard. We test\n16 state of the art model configurations (including GPT-4-Turbo, Llama2 and\nClaude2, with vector stores and long context prompts) on a sample of 150 cases\nfrom FinanceBench, and manually review their answers (n=2,400). The cases are\navailable open-source. We show that existing LLMs have clear limitations for\nfinancial QA. Notably, GPT-4-Turbo used with a retrieval system incorrectly\nanswered or refused to answer 81% of questions. While augmentation techniques\nsuch as using longer context window to feed in relevant evidence improve\nperformance, they are unrealistic for enterprise settings due to increased\nlatency and cannot support larger financial documents. 
We find that all models\nexamined exhibit weaknesses, such as hallucinations, that limit their\nsuitability for use by enterprises.\n","authors":["Pranab Islam","Anand Kannappan","Douwe Kiela","Rebecca Qian","Nino Scherrer","Bertie Vidgen"],"pdf_url":"https://arxiv.org/pdf/2311.11944v1.pdf","comment":"Dataset is available at:\n https://huggingface.co/datasets/PatronusAI/financebench"},{"id":"http://arxiv.org/abs/2311.10217v2","updated":"2023-11-20T17:08:11Z","published":"2023-11-16T22:15:15Z","title":"A Language and Its Dimensions: Intrinsic Dimensions of Language Fractal\n Structures","summary":" The present paper introduces a novel object of study - a language fractal\nstructure. We hypothesize that a set of embeddings of all $n$-grams of a\nnatural language constitutes a representative sample of this fractal set. (We\nuse the term Hailonakea to refer to the sum total of all language fractal\nstructures, over all $n$). The paper estimates intrinsic (genuine) dimensions\nof language fractal structures for the Russian and English languages. To this\nend, we employ methods based on (1) topological data analysis and (2) a minimum\nspanning tree of a data graph for a cloud of points considered (Steele\ntheorem). For both languages, for all $n$, the intrinsic dimensions appear to\nbe non-integer values (typical for fractal sets), close to 9 for both of the\nRussian and English language.\n","authors":["Vasilii A. Gromov","Nikita S. Borodin","Asel S. Yerbolova"],"pdf_url":"https://arxiv.org/pdf/2311.10217v2.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2209.09757v2","updated":"2023-11-20T16:54:55Z","published":"2022-09-20T14:39:12Z","title":"Language Varieties of Italy: Technology Challenges and Opportunities","summary":" Italy is characterized by a one-of-a-kind linguistic diversity landscape in\nEurope, which implicitly encodes local knowledge, cultural traditions, artistic\nexpressions and history of its speakers. However, most local languages and\ndialects in Italy are at risk of disappearing within few generations. The NLP\ncommunity has recently begun to engage with endangered languages, including\nthose of Italy. Yet, most efforts assume that these varieties are\nunder-resourced language monoliths with an established written form and\nhomogeneous functions and needs, and thus highly interchangeable with each\nother and with high-resource, standardized languages. In this paper, we\nintroduce the linguistic context of Italy and challenge the default\nmachine-centric assumptions of NLP for Italy's language varieties. We advocate\nfor a shift in the paradigm from machine-centric to speaker-centric NLP, and\nprovide recommendations and opportunities for work that prioritizes languages\nand their speakers over technological advances. To facilitate the process, we\nfinally propose building a local community towards responsible, participatory\nefforts aimed at supporting vitality of languages and dialects of Italy.\n","authors":["Alan Ramponi"],"pdf_url":"https://arxiv.org/pdf/2209.09757v2.pdf","comment":"Accepted to TACL. 
This arXiv version is a pre-MIT Press publication\n version"},{"id":"http://arxiv.org/abs/2311.11904v1","updated":"2023-11-20T16:37:45Z","published":"2023-11-20T16:37:45Z","title":"LLMs as Visual Explainers: Advancing Image Classification with Evolving\n Visual Descriptions","summary":" Vision-language models (VLMs) offer a promising paradigm for image\nclassification by comparing the similarity between images and class embeddings.\nA critical challenge lies in crafting precise textual representations for class\nnames. While previous studies have leveraged recent advancements in large\nlanguage models (LLMs) to enhance these descriptors, their outputs often suffer\nfrom ambiguity and inaccuracy. We identify two primary causes: 1) The prevalent\nreliance on textual interactions with LLMs, leading to a mismatch between the\ngenerated text and the visual content in VLMs' latent space - a phenomenon we\nterm the \"explain without seeing\" dilemma. 2) The oversight of the inter-class\nrelationships, resulting in descriptors that fail to differentiate similar\nclasses effectively. To address these issues, we propose a novel image\nclassification framework combining VLMs with LLMs, named Iterative Optimization\nwith Visual Feedback. In particular, our method develops an LLM-based agent,\nemploying an evolutionary optimization strategy to refine class descriptors.\nCrucially, we incorporate visual feedback from VLM classification metrics,\nthereby guiding the optimization process with concrete visual data. Our method\nleads to improving accuracy on a wide range of image classification benchmarks,\nwith 3.47\\% average gains over state-of-the-art methods. We also highlight the\nresulting descriptions serve as explainable and robust features that can\nconsistently improve the performance across various backbone models.\n","authors":["Songhao Han","Le Zhuo","Yue Liao","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2311.11904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.07496v3","updated":"2023-11-20T15:57:43Z","published":"2022-09-15T17:37:08Z","title":"Unsupervised Opinion Summarization Using Approximate Geodesics","summary":" Opinion summarization is the task of creating summaries capturing popular\nopinions from user reviews. In this paper, we introduce Geodesic Summarizer\n(GeoSumm), a novel system to perform unsupervised extractive opinion\nsummarization. GeoSumm involves an encoder-decoder based representation\nlearning model, that generates representations of text as a distribution over\nlatent semantic units. GeoSumm generates these representations by performing\ndictionary learning over pre-trained text representations at multiple decoder\nlayers. We then use these representations to quantify the relevance of review\nsentences using a novel approximate geodesic distance based scoring mechanism.\nWe use the relevance scores to identify popular opinions in order to compose\ngeneral and aspect-specific summaries. Our proposed model, GeoSumm, achieves\nstate-of-the-art performance on three opinion summarization datasets. 
We\nperform additional experiments to analyze the functioning of our model and\nshowcase the generalization ability of {\\X} across different domains.\n","authors":["Somnath Basu Roy Chowdhury","Nicholas Monath","Avinava Dubey","Amr Ahmed","Snigdha Chaturvedi"],"pdf_url":"https://arxiv.org/pdf/2209.07496v3.pdf","comment":"Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.11861v1","updated":"2023-11-20T15:57:04Z","published":"2023-11-20T15:57:04Z","title":"Generating Valid and Natural Adversarial Examples with Large Language\n Models","summary":" Deep learning-based natural language processing (NLP) models, particularly\npre-trained language models (PLMs), have been revealed to be vulnerable to\nadversarial attacks. However, the adversarial examples generated by many\nmainstream word-level adversarial attack models are neither valid nor natural,\nleading to the loss of semantic maintenance, grammaticality, and human\nimperceptibility. Based on the exceptional capacity of language understanding\nand generation of large language models (LLMs), we propose LLM-Attack, which\naims at generating both valid and natural adversarial examples with LLMs. The\nmethod consists of two stages: word importance ranking (which searches for the\nmost vulnerable words) and word synonym replacement (which substitutes them\nwith their synonyms obtained from LLMs). Experimental results on the Movie\nReview (MR), IMDB, and Yelp Review Polarity datasets against the baseline\nadversarial attack models illustrate the effectiveness of LLM-Attack, and it\noutperforms the baselines in human and GPT-4 evaluation by a significant\nmargin. The model can generate adversarial examples that are typically valid\nand natural, with the preservation of semantic meaning, grammaticality, and\nhuman imperceptibility.\n","authors":["Zimu Wang","Wei Wang","Qi Chen","Qiufeng Wang","Anh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2311.11861v1.pdf","comment":"Submitted to the IEEE for possible publication"},{"id":"http://arxiv.org/abs/2311.11855v1","updated":"2023-11-20T15:50:09Z","published":"2023-11-20T15:50:09Z","title":"Evil Geniuses: Delving into the Safety of LLM-based Agents","summary":" The rapid advancements in large language models (LLMs) have led to a\nresurgence in LLM-based agents, which demonstrate impressive human-like\nbehaviors and cooperative capabilities in various interactions and strategy\nformulations. However, evaluating the safety of LLM-based agents remains a\ncomplex challenge. This paper elaborately conducts a series of manual jailbreak\nprompts along with a virtual chat-powered evil plan development team, dubbed\nEvil Geniuses, to thoroughly probe the safety aspects of these agents. Our\ninvestigation reveals three notable phenomena: 1) LLM-based agents exhibit\nreduced robustness against malicious attacks. 2) the attacked agents could\nprovide more nuanced responses. 3) the detection of the produced improper\nresponses is more challenging. These insights prompt us to question the\neffectiveness of LLM-based attacks on agents, highlighting vulnerabilities at\nvarious levels and within different role specializations within the\nsystem/agent of LLM-based agents. Extensive evaluation and discussion reveal\nthat LLM-based agents face significant challenges in safety and yield insights\nfor future research. 
Our code is available at\nhttps://github.com/T1aNS1R/Evil-Geniuses.\n","authors":["Yu Tian","Xiao Yang","Jingyuan Zhang","Yinpeng Dong","Hang Su"],"pdf_url":"https://arxiv.org/pdf/2311.11855v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2311.11846v1","updated":"2023-11-20T15:37:33Z","published":"2023-11-20T15:37:33Z","title":"Deepparse : An Extendable, and Fine-Tunable State-Of-The-Art Library for\n Parsing Multinational Street Addresses","summary":" Segmenting an address into meaningful components, also known as address\nparsing, is an essential step in many applications from record linkage to\ngeocoding and package delivery. Consequently, a lot of work has been dedicated\nto develop accurate address parsing techniques, with machine learning and\nneural network methods leading the state-of-the-art scoreboard. However, most\nof the work on address parsing has been confined to academic endeavours with\nlittle availability of free and easy-to-use open-source solutions.\n This paper presents Deepparse, a Python open-source, extendable, fine-tunable\naddress parsing solution under LGPL-3.0 licence to parse multinational\naddresses using state-of-the-art deep learning algorithms and evaluated on over\n60 countries. It can parse addresses written in any language and use any\naddress standard. The pre-trained model achieves average $99~\\%$ parsing\naccuracies on the countries used for training with no pre-processing nor\npost-processing needed. Moreover, the library supports fine-tuning with new\ndata to generate a custom address parser.\n","authors":["David Beauchemin","Marouane Yassine"],"pdf_url":"https://arxiv.org/pdf/2311.11846v1.pdf","comment":"Accepted in EMNLP 2024 NLP-OSS workshop. arXiv admin note: text\n overlap with arXiv:2006.16152, arXiv:2112.04008"},{"id":"http://arxiv.org/abs/2311.11844v1","updated":"2023-11-20T15:34:45Z","published":"2023-11-20T15:34:45Z","title":"How to Use Large Language Models for Text Coding: The Case of Fatherhood\n Roles in Public Policy Documents","summary":" Recent advances in large language models (LLMs) like GPT-3 and GPT-4 have\nopened up new opportunities for text analysis in political science. They\npromise automation with better results and less programming. In this study, we\nevaluate LLMs on three original coding tasks of non-English political science\ntexts, and we provide a detailed description of a general workflow for using\nLLMs for text coding in political science research. Our use case offers a\npractical guide for researchers looking to incorporate LLMs into their research\non text analysis. We find that, when provided with detailed label definitions\nand coding examples, an LLM can be as good as or even better than a human\nannotator while being much faster (up to hundreds of times), considerably\ncheaper (costing up to 60% less than human coding), and much easier to scale to\nlarge amounts of text. Overall, LLMs present a viable option for most text\ncoding projects.\n","authors":["Lorenzo Lupo","Oscar Magnusson","Dirk Hovy","Elin Naurin","Lena Wängnerud"],"pdf_url":"https://arxiv.org/pdf/2311.11844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11829v1","updated":"2023-11-20T15:04:50Z","published":"2023-11-20T15:04:50Z","title":"System 2 Attention (is something you might need too)","summary":" Soft attention in Transformer-based Large Language Models (LLMs) is\nsusceptible to incorporating irrelevant information from the context into its\nlatent representations, which adversely affects next token generations. 
To help\nrectify these issues, we introduce System 2 Attention (S2A), which leverages\nthe ability of LLMs to reason in natural language and follow instructions in\norder to decide what to attend to. S2A regenerates the input context to only\ninclude the relevant portions, before attending to the regenerated context to\nelicit the final response. In experiments, S2A outperforms standard\nattention-based LLMs on three tasks containing opinion or irrelevant\ninformation, QA, math word problems and longform generation, where S2A\nincreases factuality and objectivity, and decreases sycophancy.\n","authors":["Jason Weston","Sainbayar Sukhbaatar"],"pdf_url":"https://arxiv.org/pdf/2311.11829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08358v3","updated":"2023-11-20T14:52:43Z","published":"2022-11-15T18:06:53Z","title":"MEAL: Stable and Active Learning for Few-Shot Prompting","summary":" Few-shot classification has made great strides due to foundation models that,\nthrough priming and prompting, are highly effective few-shot learners. However,\nthis approach has high variance both across different sets of few shots (data\nselection) and across different finetuning runs (run variability). This is\nproblematic not only because it impedes the fair comparison of different\napproaches, but especially because it makes few-shot learning too unreliable\nfor many real-world applications. To alleviate these issues, we make two\ncontributions for more stable and effective few-shot learning: First, we\npropose novel ensembling methods and show that they substantially reduce run\nvariability. Second, we introduce a new active learning (AL) criterion for data\nselection and present the first AL-based approach specifically tailored towards\nprompt-based learning. In our experiments, we show that our combined method,\nMEAL (Multiprompt finetuning and prediction Ensembling with Active Learning),\nimproves overall performance of prompt-based finetuning by 2.3 points on five\ndiverse tasks. We publicly share our code and data splits in\nhttps://github.com/akoksal/MEAL.\n","authors":["Abdullatif Köksal","Timo Schick","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2211.08358v3.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.11813v1","updated":"2023-11-20T14:50:12Z","published":"2023-11-20T14:50:12Z","title":"Efficient Grammatical Error Correction Via Multi-Task Training and\n Optimized Training Schedule","summary":" Progress in neural grammatical error correction (GEC) is hindered by the lack\nof annotated training data. Sufficient amounts of high-quality manually\nannotated data are not available, so recent research has relied on generating\nsynthetic data, pretraining on it, and then fine-tuning on real datasets;\nperformance gains have been achieved either by ensembling or by using huge\npretrained models such as XXL-T5 as the backbone. In this work, we explore an\northogonal direction: how to use available data more efficiently. First, we\npropose auxiliary tasks that exploit the alignment between the original and\ncorrected sentences, such as predicting a sequence of corrections. We formulate\neach task as a sequence-to-sequence problem and perform multi-task training.\nSecond, we discover that the order of datasets used for training and even\nindividual instances within a dataset may have important effects on the final\nperformance, so we set out to find the best training schedule. 
Together, these\ntwo ideas lead to significant improvements, producing results that improve\nstate of the art with much smaller models; in particular, we outperform the\nbest models based on T5-XXL (11B parameters) with a BART-based model (400M\nparameters).\n","authors":["Andrey Bout","Alexander Podolskiy","Sergey Nikolenko","Irina Piontkovskaya"],"pdf_url":"https://arxiv.org/pdf/2311.11813v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2305.13302v2","updated":"2023-11-20T14:31:26Z","published":"2023-05-22T17:58:01Z","title":"Language-Agnostic Bias Detection in Language Models with Bias Probing","summary":" Pretrained language models (PLMs) are key components in NLP, but they contain\nstrong social biases. Quantifying these biases is challenging because current\nmethods focusing on fill-the-mask objectives are sensitive to slight changes in\ninput. To address this, we propose a bias probing technique called LABDet, for\nevaluating social bias in PLMs with a robust and language-agnostic method. For\nnationality as a case study, we show that LABDet `surfaces' nationality bias by\ntraining a classifier on top of a frozen PLM on non-nationality sentiment\ndetection. We find consistent patterns of nationality bias across monolingual\nPLMs in six languages that align with historical and political context. We also\nshow for English BERT that bias surfaced by LABDet correlates well with bias in\nthe pretraining data; thus, our work is one of the few studies that directly\nlinks pretraining data to PLM behavior. Finally, we verify LABDet's reliability\nand applicability to different templates and languages through an extensive set\nof robustness checks. We publicly share our code and dataset in\nhttps://github.com/akoksal/LABDet.\n","authors":["Abdullatif Köksal","Omer Faruk Yalcin","Ahmet Akbiyik","M. Tahir Kilavuz","Anna Korhonen","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2305.13302v2.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.11797v1","updated":"2023-11-20T14:30:55Z","published":"2023-11-20T14:30:55Z","title":"Igniting Language Intelligence: The Hitchhiker's Guide From\n Chain-of-Thought Reasoning to Language Agents","summary":" Large language models (LLMs) have dramatically enhanced the field of language\nintelligence, as demonstrably evidenced by their formidable empirical\nperformance across a spectrum of complex reasoning tasks. Additionally,\ntheoretical proofs have illuminated their emergent reasoning capabilities,\nproviding a compelling showcase of their advanced cognitive abilities in\nlinguistic contexts. Critical to their remarkable efficacy in handling complex\nreasoning tasks, LLMs leverage the intriguing chain-of-thought (CoT) reasoning\ntechniques, obliging them to formulate intermediate steps en route to deriving\nan answer. The CoT reasoning approach has not only exhibited proficiency in\namplifying reasoning performance but also in enhancing interpretability,\ncontrollability, and flexibility. In light of these merits, recent research\nendeavors have extended CoT reasoning methodologies to nurture the development\nof autonomous language agents, which adeptly adhere to language instructions\nand execute actions within varied environments. 
This survey paper orchestrates\na thorough discourse, penetrating vital research dimensions, encompassing: (i)\nthe foundational mechanics of CoT techniques, with a focus on elucidating the\ncircumstances and justification behind its efficacy; (ii) the paradigm shift in\nCoT; and (iii) the burgeoning of language agents fortified by CoT approaches.\nProspective research avenues envelop explorations into generalization,\nefficiency, customization, scaling, and safety. This paper caters to a wide\naudience, including beginners seeking comprehensive knowledge of CoT reasoning\nand language agents, as well as experienced researchers interested in\nfoundational mechanics and engaging in cutting-edge discussions on these\ntopics. A repository for the related papers is available at\nhttps://github.com/Zoeyyao27/CoT-Igniting-Agent.\n","authors":["Zhuosheng Zhang","Yao Yao","Aston Zhang","Xiangru Tang","Xinbei Ma","Zhiwei He","Yiming Wang","Mark Gerstein","Rui Wang","Gongshen Liu","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.11797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11796v1","updated":"2023-11-20T14:29:45Z","published":"2023-11-20T14:29:45Z","title":"Beyond Boundaries: A Comprehensive Survey of Transferable Attacks on AI\n Systems","summary":" Artificial Intelligence (AI) systems such as autonomous vehicles, facial\nrecognition, and speech recognition systems are increasingly integrated into\nour daily lives. However, despite their utility, these AI systems are\nvulnerable to a wide range of attacks such as adversarial, backdoor, data\npoisoning, membership inference, model inversion, and model stealing attacks.\nIn particular, numerous attacks are designed to target a particular model or\nsystem, yet their effects can spread to additional targets, referred to as\ntransferable attacks. Although considerable efforts have been directed toward\ndeveloping transferable attacks, a holistic understanding of the advancements\nin transferable attacks remains elusive. In this paper, we comprehensively\nexplore learning-based attacks from the perspective of transferability,\nparticularly within the context of cyber-physical security. We delve into\ndifferent domains -- the image, text, graph, audio, and video domains -- to\nhighlight the ubiquitous and pervasive nature of transferable attacks. This\npaper categorizes and reviews the architecture of existing attacks from various\nviewpoints: data, process, model, and system. We further examine the\nimplications of transferable attacks in practical scenarios such as autonomous\ndriving, speech recognition, and large language models (LLMs). Additionally, we\noutline the potential research directions to encourage efforts in exploring the\nlandscape of transferable attacks. This survey offers a holistic understanding\nof the prevailing transferable attacks and their impacts across different\ndomains.\n","authors":["Guangjing Wang","Ce Zhou","Yuanda Wang","Bocheng Chen","Hanqing Guo","Qiben Yan"],"pdf_url":"https://arxiv.org/pdf/2311.11796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15363v4","updated":"2023-11-20T13:59:16Z","published":"2023-08-29T14:59:54Z","title":"Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation","summary":" Large language models (LLMs) have emerged as a new paradigm for Text-to-SQL\ntask. 
However, the absence of a systematical benchmark inhibits the development\nof designing effective, efficient and economic LLM-based Text-to-SQL solutions.\nTo address this challenge, in this paper, we first conduct a systematical and\nextensive comparison over existing prompt engineering methods, including\nquestion representation, example selection and example organization, and with\nthese experimental results, we elaborate their pros and cons. Based on these\nfindings, we propose a new integrated solution, named DAIL-SQL, which refreshes\nthe Spider leaderboard with 86.6% execution accuracy and sets a new bar. To\nexplore the potential of open-source LLM, we investigate them in various\nscenarios, and further enhance their performance with supervised fine-tuning.\nOur explorations highlight open-source LLMs' potential in Text-to-SQL, as well\nas the advantages and disadvantages of the supervised fine-tuning.\nAdditionally, towards an efficient and economic LLM-based Text-to-SQL solution,\nwe emphasize the token efficiency in prompt engineering and compare the prior\nstudies under this metric. We hope that our work provides a deeper\nunderstanding of Text-to-SQL with LLMs, and inspires further investigations and\nbroad applications.\n","authors":["Dawei Gao","Haibin Wang","Yaliang Li","Xiuyu Sun","Yichen Qian","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15363v4.pdf","comment":"We have released code on https://github.com/BeachWang/DAIL-SQL"},{"id":"http://arxiv.org/abs/2309.10003v3","updated":"2023-11-20T13:31:47Z","published":"2023-09-17T16:50:07Z","title":"A novel approach to measuring patent claim scope based on probabilities\n obtained from (large) language models","summary":" This work proposes to measure the scope of a patent claim as the reciprocal\nof the self-information contained in this claim. A probability of occurrence of\nthe claim is obtained from a language model and this probability is used to\ncompute the self-information. Grounded in information theory, this approach is\nbased on the assumption that an unlikely concept is more informative than a\nusual concept, insofar as it is more surprising. In turn, the more surprising\nthe information required to defined the claim, the narrower its scope. Five\nlanguage models are considered, ranging from simplest models (each word or\ncharacter is assigned an identical probability) to intermediate models (using\naverage word or character frequencies), to a large language model (GPT2).\nInterestingly, the scope resulting from the simplest language models is\nproportional to the reciprocal of the number of words or characters involved in\nthe claim, a metric already used in previous works. Application is made to\nmultiple series of patent claims directed to distinct inventions, where each\nseries consists of claims devised to have a gradually decreasing scope. The\nperformance of the language models is assessed with respect to several ad hoc\ntests. The more sophisticated the model, the better the results. I.e., the GPT2\nprobability model outperforms models based on word and character frequencies,\nwhich themselves outdo the simplest models based on word or character counts.\nStill, the character count appears to be a more reliable indicator than the\nword count.\n","authors":["Sébastien Ragot"],"pdf_url":"https://arxiv.org/pdf/2309.10003v3.pdf","comment":"58 pages, 8 tables, 6 figures. 
Substantial changes made to version 2:\n New section 4.1 added (including a new table); Minor normalization issue\n corrected in values listed in Appendix B; Content of former appendix C now\n moved to Section 3; and new Appendix C added. Minor changes made to version 3\n (style, typos, language)"},{"id":"http://arxiv.org/abs/2311.11745v1","updated":"2023-11-20T13:13:24Z","published":"2023-11-20T13:13:24Z","title":"Encoding Speaker-Specific Latent Speech Feature for Speech Synthesis","summary":" In this work, we propose a novel method for modeling numerous speakers, which\nenables expressing the overall characteristics of speakers in detail like a\ntrained multi-speaker model without additional training on the target speaker's\ndataset. Although various works with similar purposes have been actively\nstudied, their performance has not yet reached that of trained multi-speaker\nmodels due to their fundamental limitations. To overcome previous limitations,\nwe propose effective methods for feature learning and representing target\nspeakers' speech characteristics by discretizing the features and conditioning\nthem to a speech synthesis model. Our method obtained a significantly higher\nsimilarity mean opinion score (SMOS) in subjective similarity evaluation than\nseen speakers of a best-performing multi-speaker model, even with unseen\nspeakers. The proposed method also outperforms a zero-shot method by\nsignificant margins. Furthermore, our method shows remarkable performance in\ngenerating new artificial speakers. In addition, we demonstrate that the\nencoded latent features are sufficiently informative to reconstruct an original\nspeaker's speech completely. It implies that our method can be used as a\ngeneral methodology to encode and reconstruct speakers' characteristics in\nvarious tasks.\n","authors":["Jungil Kong","Junmo Lee","Jeongmin Kim","Beomjeong Kim","Jihoon Park","Dohee Kong","Changheon Lee","Sangjin Kim"],"pdf_url":"https://arxiv.org/pdf/2311.11745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19130v2","updated":"2023-11-20T12:15:19Z","published":"2023-10-29T19:39:03Z","title":"Women Wearing Lipstick: Measuring the Bias Between an Object and Its\n Related Gender","summary":" In this paper, we investigate the impact of objects on gender bias in image\ncaptioning systems. Our results show that only gender-specific objects have a\nstrong gender bias (e.g., women-lipstick). In addition, we propose a visual\nsemantic-based gender score that measures the degree of bias and can be used as\na plug-in for any image captioning system. Our experiments demonstrate the\nutility of the gender score, since we observe that our score can measure the\nbias relation between a caption and its related gender; therefore, our score\ncan be used as an additional metric to the existing Object Gender Co-Occ\napproach. 
Code and data are publicly available at\n\\url{https://github.com/ahmedssabir/GenderScore}.\n","authors":["Ahmed Sabir","Lluís Padró"],"pdf_url":"https://arxiv.org/pdf/2310.19130v2.pdf","comment":"EMNLP Findings 2023"},{"id":"http://arxiv.org/abs/2311.11701v1","updated":"2023-11-20T12:08:32Z","published":"2023-11-20T12:08:32Z","title":"Control in Hybrid Chatbots","summary":" Customer data typically is held in database systems, which can be seen as\nrule-based knowledge base, whereas businesses increasingly want to benefit from\nthe capabilities of large, pre-trained language models.\n In this technical report, we describe a case study of how a commercial rule\nengine and an integrated neural chatbot may be integrated, and what level of\ncontrol that particular integration mode leads to. We also discuss alternative\nways (including past ways realized in other systems) how researchers strive to\nmaintain control and avoid what has recently been called model \"hallucination\".\n","authors":["Thomas Rüdel","Jochen L. Leidner"],"pdf_url":"https://arxiv.org/pdf/2311.11701v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.11696v1","updated":"2023-11-20T11:56:25Z","published":"2023-11-20T11:56:25Z","title":"Sparse Low-rank Adaptation of Pre-trained Language Models","summary":" Fine-tuning pre-trained large language models in a parameter-efficient manner\nis widely studied for its effectiveness and efficiency. The popular method of\nlow-rank adaptation (LoRA) offers a notable approach, hypothesizing that the\nadaptation process is intrinsically low-dimensional. Although LoRA has\ndemonstrated commendable performance, it is implemented with a fixed and\nunalterable intrinsic rank that might not always be the ideal choice.\nRecognizing the need for more flexible adaptation, we extend the methodology of\nLoRA to an innovative approach we call sparse low-rank adaptation (SoRA) that\nenables dynamic adjustments to the intrinsic rank during the adaptation\nprocess. We achieve this through the incorporation of a gate unit optimized\nwith proximal gradient method in the training stage, controlling the\ncardinality of rank under the sparsity of the gate. In the subsequent inference\nstage, we eliminate the parameter blocks corresponding to the zeroed-out ranks,\nto reduce each SoRA module back to a concise yet rank-optimal LoRA. Our\napproach strengthens the representation power of LoRA by initializing it with a\nhigher rank, while efficiently taming a temporarily increased number of\nparameters via updating in a sparse way. We further introduce a sparsifying\nscheduler for SoRA, aiming to examine the impact of the number of non-zero\nparameters on the model's memorization and generalization. Our experimental\nresults demonstrate that SoRA can outperform other baselines even with 70%\nretained parameters and 70% training time.\n","authors":["Ning Ding","Xingtai Lv","Qiaosen Wang","Yulin Chen","Bowen Zhou","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2311.11696v1.pdf","comment":"Accepted to EMNLP 2023 (Main Conference)"},{"id":"http://arxiv.org/abs/2311.11690v1","updated":"2023-11-20T11:43:45Z","published":"2023-11-20T11:43:45Z","title":"Refactoring Programs Using Large Language Models with Few-Shot Examples","summary":" A less complex and more straightforward program is a crucial factor that\nenhances its maintainability and makes writing secure and bug-free programs\neasier. 
However, due to its heavy workload and the risks of breaking the\nworking programs, programmers are reluctant to do code refactoring, and thus,\nit also causes the loss of potential learning experiences. To mitigate this, we\ndemonstrate the application of using a large language model (LLM), GPT-3.5, to\nsuggest less complex versions of the user-written Python program, aiming to\nencourage users to learn how to write better programs. We propose a method to\nleverage the prompting with few-shot examples of the LLM by selecting the\nbest-suited code refactoring examples for each target programming problem based\non the prior evaluation of prompting with the one-shot example. The\nquantitative evaluation shows that 95.68% of programs can be refactored by\ngenerating 10 candidates each, resulting in a 17.35% reduction in the average\ncyclomatic complexity and a 25.84% decrease in the average number of lines\nafter filtering only generated programs that are semantically correct.\nFurthermore, the qualitative evaluation shows outstanding capability in code\nformatting, while unnecessary behaviors such as deleting or translating\ncomments are also observed.\n","authors":["Atsushi Shirafuji","Yusuke Oda","Jun Suzuki","Makoto Morishita","Yutaka Watanobe"],"pdf_url":"https://arxiv.org/pdf/2311.11690v1.pdf","comment":"10 pages, 10 figures, accepted to the 30th Asia-Pacific Software\n Engineering Conference (APSEC 2023)"},{"id":"http://arxiv.org/abs/2310.10348v2","updated":"2023-11-20T11:31:16Z","published":"2023-10-16T12:34:43Z","title":"Attribution Patching Outperforms Automated Circuit Discovery","summary":" Automated interpretability research has recently attracted attention as a\npotential research direction that could scale explanations of neural network\nbehavior to large models. Existing automated circuit discovery work applies\nactivation patching to identify subnetworks responsible for solving specific\ntasks (circuits). In this work, we show that a simple method based on\nattribution patching outperforms all existing methods while requiring just two\nforward passes and a backward pass. We apply a linear approximation to\nactivation patching to estimate the importance of each edge in the\ncomputational subgraph. Using this approximation, we prune the least important\nedges of the network. We survey the performance and limitations of this method,\nfinding that averaged over all tasks our method has greater AUC from circuit\nrecovery than other methods.\n","authors":["Aaquib Syed","Can Rager","Arthur Conmy"],"pdf_url":"https://arxiv.org/pdf/2310.10348v2.pdf","comment":"6 main paper pages, 6 additional pages. NeurIPS 2023 ATTRIB Workshop"},{"id":"http://arxiv.org/abs/2311.11608v1","updated":"2023-11-20T08:51:30Z","published":"2023-11-20T08:51:30Z","title":"Taiyi: A Bilingual Fine-Tuned Large Language Model for Diverse\n Biomedical Tasks","summary":" Recent advancements in large language models (LLMs) have shown promising\nresults across a variety of natural language processing (NLP) tasks. The\napplication of LLMs to specific domains, such as biomedicine, has achieved\nincreased attention. However, most biomedical LLMs focus on enhancing\nperformance in monolingual biomedical question answering and conversation\ntasks. To further investigate the effectiveness of the LLMs on diverse\nbiomedical NLP tasks in different languages, we present Taiyi, a bilingual\n(English and Chinese) fine-tuned LLM for diverse biomedical tasks. 
In this\nwork, we first curated a comprehensive collection of 140 existing biomedical\ntext mining datasets across over 10 task types. Subsequently, a two-stage\nstrategy is proposed for supervised fine-tuning to optimize the model\nperformance across varied tasks. Experimental results on 13 test sets covering\nnamed entity recognition, relation extraction, text classification, question\nanswering tasks demonstrate Taiyi achieves superior performance compared to\ngeneral LLMs. The case study involving additional biomedical NLP tasks further\nshows Taiyi's considerable potential for bilingual biomedical multi-tasking.\nThe source code, datasets, and model for Taiyi are freely available at\nhttps://github.com/DUTIR-BioNLP/Taiyi-LLM.\n","authors":["Ling Luo","Jinzhong Ning","Yingwen Zhao","Zhijun Wang","Zeyuan Ding","Peng Chen","Weiru Fu","Qinyu Han","Guangtao Xu","Yunzhi Qiu","Dinghao Pan","Jiru Li","Hao Li","Wenduo Feng","Senbo Tu","Yuqi Liu","Zhihao Yang","Jian Wang","Yuanyuan Sun","Hongfei Lin"],"pdf_url":"https://arxiv.org/pdf/2311.11608v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11601v1","updated":"2023-11-20T08:29:52Z","published":"2023-11-20T08:29:52Z","title":"Addressing the Length Bias Problem in Document-Level Neural Machine\n Translation","summary":" Document-level neural machine translation (DNMT) has shown promising results\nby incorporating more context information. However, this approach also\nintroduces a length bias problem, whereby DNMT suffers from significant\ntranslation quality degradation when decoding documents that are much shorter\nor longer than the maximum sequence length during training. %i.e., the length\nbias problem. To solve the length bias problem, we propose to improve the DNMT\nmodel in training method, attention mechanism, and decoding strategy. Firstly,\nwe propose to sample the training data dynamically to ensure a more uniform\ndistribution across different sequence lengths. Then, we introduce a\nlength-normalized attention mechanism to aid the model in focusing on target\ninformation, mitigating the issue of attention divergence when processing\nlonger sequences. Lastly, we propose a sliding window strategy during decoding\nthat integrates as much context information as possible without exceeding the\nmaximum sequence length. The experimental results indicate that our method can\nbring significant improvements on several open datasets, and further analysis\nshows that our method can significantly alleviate the length bias problem.\n","authors":["Zhuocheng Zhang","Shuhao Gu","Min Zhang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2311.11601v1.pdf","comment":"Accepted by EMNLP2023 Findings"},{"id":"http://arxiv.org/abs/2311.11598v1","updated":"2023-11-20T08:23:39Z","published":"2023-11-20T08:23:39Z","title":"Filling the Image Information Gap for VQA: Prompting Large Language\n Models to Proactively Ask Questions","summary":" Large Language Models (LLMs) demonstrate impressive reasoning ability and the\nmaintenance of world knowledge not only in natural language tasks, but also in\nsome vision-language tasks such as open-domain knowledge-based visual question\nanswering (OK-VQA). As images are invisible to LLMs, researchers convert images\nto text to engage LLMs into the visual question reasoning procedure. This leads\nto discrepancies between images and their textual representations presented to\nLLMs, which consequently impedes final reasoning performance. 
To fill the\ninformation gap and better leverage the reasoning capability, we design a\nframework that enables LLMs to proactively ask relevant questions to unveil\nmore details in the image, along with filters for refining the generated\ninformation. We validate our idea on OK-VQA and A-OKVQA. Our method\ncontinuously boosts the performance of baselines methods by an average gain of\n2.15% on OK-VQA, and achieves consistent improvements across different LLMs.\n","authors":["Ziyue Wang","Chi Chen","Peng Li","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.11598v1.pdf","comment":"Accepted to EMNLP2023 Findings"},{"id":"http://arxiv.org/abs/2311.11583v1","updated":"2023-11-20T07:41:30Z","published":"2023-11-20T07:41:30Z","title":"How well ChatGPT understand Malaysian English? An Evaluation on Named\n Entity Recognition and Relation Extraction","summary":" Recently, ChatGPT has attracted a lot of interest from both researchers and\nthe general public. While the performance of ChatGPT in named entity\nrecognition and relation extraction from Standard English texts is\nsatisfactory, it remains to be seen if it can perform similarly for Malaysian\nEnglish. Malaysian English is unique as it exhibits morphosyntactic and\nsemantical adaptation from local contexts. In this study, we assess ChatGPT's\ncapability in extracting entities and relations from the Malaysian English News\n(MEN) dataset. We propose a three-step methodology referred to as\n\\textbf{\\textit{educate-predict-evaluate}}. The performance of ChatGPT is\nassessed using F1-Score across 18 unique prompt settings, which were carefully\nengineered for a comprehensive review. From our evaluation, we found that\nChatGPT does not perform well in extracting entities from Malaysian English\nnews articles, with the highest F1-Score of 0.497. Further analysis shows that\nthe morphosyntactic adaptation in Malaysian English caused the limitation.\nHowever, interestingly, this morphosyntactic adaptation does not impact the\nperformance of ChatGPT for relation extraction.\n","authors":["Mohan Raj Chanthran","Lay-Ki Soon","Huey Fang Ong","Bhawani Selvaretnam"],"pdf_url":"https://arxiv.org/pdf/2311.11583v1.pdf","comment":"Accepted in Generation, Evaluation & Metrics (GEM) Workshop at EMNLP\n 2023"},{"id":"http://arxiv.org/abs/2311.11564v1","updated":"2023-11-20T07:02:35Z","published":"2023-11-20T07:02:35Z","title":"KBioXLM: A Knowledge-anchored Biomedical Multilingual Pretrained\n Language Model","summary":" Most biomedical pretrained language models are monolingual and cannot handle\nthe growing cross-lingual requirements. The scarcity of non-English domain\ncorpora, not to mention parallel data, poses a significant hurdle in training\nmultilingual biomedical models. Since knowledge forms the core of\ndomain-specific corpora and can be translated into various languages\naccurately, we propose a model called KBioXLM, which transforms the\nmultilingual pretrained model XLM-R into the biomedical domain using a\nknowledge-anchored approach. We achieve a biomedical multilingual corpus by\nincorporating three granularity knowledge alignments (entity, fact, and passage\nlevels) into monolingual corpora. Then we design three corresponding training\ntasks (entity masking, relation masking, and passage relation prediction) and\ncontinue training on top of the XLM-R model to enhance its domain cross-lingual\nability. To validate the effectiveness of our model, we translate the English\nbenchmarks of multiple tasks into Chinese. 
Experimental results demonstrate\nthat our model significantly outperforms monolingual and multilingual\npretrained models in cross-lingual zero-shot and few-shot scenarios, achieving\nimprovements of up to 10+ points. Our code is publicly available at\nhttps://github.com/ngwlh-gl/KBioXLM.\n","authors":["Lei Geng","Xu Yan","Ziqiang Cao","Juntao Li","Wenjie Li","Sujian Li","Xinjie Zhou","Yang Yang","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.11564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11552v1","updated":"2023-11-20T06:06:22Z","published":"2023-11-20T06:06:22Z","title":"Exploring Prompting Large Language Models as Explainable Metrics","summary":" This paper describes the IUST NLP Lab submission to the Prompting Large\nLanguage Models as Explainable Metrics Shared Task at the Eval4NLP 2023\nWorkshop on Evaluation & Comparison of NLP Systems. We have proposed a\nzero-shot prompt-based strategy for explainable evaluation of the summarization\ntask using Large Language Models (LLMs). The conducted experiments demonstrate\nthe promising potential of LLMs as evaluation metrics in Natural Language\nProcessing (NLP), particularly in the field of summarization. Both few-shot and\nzero-shot approaches are employed in these experiments. The performance of our\nbest provided prompts achieved a Kendall correlation of 0.477 with human\nevaluations in the text summarization task on the test data. Code and results\nare publicly available on GitHub.\n","authors":["Ghazaleh Mahmoudi"],"pdf_url":"https://arxiv.org/pdf/2311.11552v1.pdf","comment":"9 pages, Eval4NLP 2023"},{"id":"http://arxiv.org/abs/2311.11551v1","updated":"2023-11-20T06:06:20Z","published":"2023-11-20T06:06:20Z","title":"Adapt in Contexts: Retrieval-Augmented Domain Adaptation via In-Context\n Learning","summary":" Large language models (LLMs) have showcased their capability with few-shot\ninference known as in-context learning. However, in-domain demonstrations are\nnot always readily available in real scenarios, leading to cross-domain\nin-context learning. Besides, LLMs are still facing challenges in long-tail\nknowledge in unseen and unfamiliar domains. The above limitations demonstrate\nthe necessity of Unsupervised Domain Adaptation (UDA). In this paper, we study\nthe UDA problem under an in-context learning setting to adapt language models\nfrom the source domain to the target domain without any target labels. The core\nidea is to retrieve a subset of cross-domain elements that are the most similar\nto the query, and elicit language model to adapt in an in-context manner by\nlearning both target domain distribution and the discriminative task signal\nsimultaneously with the augmented cross-domain in-context examples. We devise\ndifferent prompting and training strategies, accounting for different LM\narchitectures to learn the target distribution via language modeling. 
With\nextensive experiments on Sentiment Analysis (SA) and Named Entity Recognition\n(NER) tasks, we thoroughly study the effectiveness of ICL for domain transfer\nand demonstrate significant improvements over baseline models.\n","authors":["Quanyu Long","Wenya Wang","Sinno Jialin Pan"],"pdf_url":"https://arxiv.org/pdf/2311.11551v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2205.15439v2","updated":"2023-11-20T04:31:13Z","published":"2022-05-30T21:34:40Z","title":"StyleTTS: A Style-Based Generative Model for Natural and Diverse\n Text-to-Speech Synthesis","summary":" Text-to-Speech (TTS) has recently seen great progress in synthesizing\nhigh-quality speech owing to the rapid development of parallel TTS systems, but\nproducing speech with naturalistic prosodic variations, speaking styles and\nemotional tones remains challenging. Moreover, since duration and speech are\ngenerated separately, parallel TTS models still have problems finding the best\nmonotonic alignments that are crucial for naturalistic speech synthesis. Here,\nwe propose StyleTTS, a style-based generative model for parallel TTS that can\nsynthesize diverse speech with natural prosody from a reference speech\nutterance. With novel Transferable Monotonic Aligner (TMA) and\nduration-invariant data augmentation schemes, our method significantly\noutperforms state-of-the-art models on both single and multi-speaker datasets\nin subjective tests of speech naturalness and speaker similarity. Through\nself-supervised learning of the speaking styles, our model can synthesize\nspeech with the same prosodic and emotional tone as any given reference speech\nwithout the need for explicitly labeling these categories.\n","authors":["Yinghao Aaron Li","Cong Han","Nima Mesgarani"],"pdf_url":"https://arxiv.org/pdf/2205.15439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07691v2","updated":"2023-11-20T04:23:08Z","published":"2023-06-13T11:04:43Z","title":"StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion\n and Adversarial Training with Large Speech Language Models","summary":" In this paper, we present StyleTTS 2, a text-to-speech (TTS) model that\nleverages style diffusion and adversarial training with large speech language\nmodels (SLMs) to achieve human-level TTS synthesis. StyleTTS 2 differs from its\npredecessor by modeling styles as a latent random variable through diffusion\nmodels to generate the most suitable style for the text without requiring\nreference speech, achieving efficient latent diffusion while benefiting from\nthe diverse speech synthesis offered by diffusion models. Furthermore, we\nemploy large pre-trained SLMs, such as WavLM, as discriminators with our novel\ndifferentiable duration modeling for end-to-end training, resulting in improved\nspeech naturalness. StyleTTS 2 surpasses human recordings on the single-speaker\nLJSpeech dataset and matches it on the multispeaker VCTK dataset as judged by\nnative English speakers. Moreover, when trained on the LibriTTS dataset, our\nmodel outperforms previous publicly available models for zero-shot speaker\nadaptation. This work achieves the first human-level TTS on both single and\nmultispeaker datasets, showcasing the potential of style diffusion and\nadversarial training with large SLMs. The audio demos and source code are\navailable at https://styletts2.github.io/.\n","authors":["Yinghao Aaron Li","Cong Han","Vinay S. 
Raghavan","Gavin Mischler","Nima Mesgarani"],"pdf_url":"https://arxiv.org/pdf/2306.07691v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2304.13867v3","updated":"2023-11-20T04:09:25Z","published":"2023-04-26T23:24:50Z","title":"Transferring Procedural Knowledge across Commonsense Tasks","summary":" Stories about everyday situations are an essential part of human\ncommunication, motivating the need to develop AI agents that can reliably\nunderstand these stories. Despite the long list of supervised methods for story\ncompletion and procedural understanding, current AI has no mechanisms to\nautomatically track and explain procedures in unseen stories. To bridge this\ngap, we study the ability of AI models to transfer procedural knowledge to\nnovel narrative tasks in a transparent manner. We design LEAP: a comprehensive\nframework that integrates state-of-the-art modeling architectures, training\nregimes, and augmentation strategies based on both natural and synthetic\nstories. To address the lack of densely annotated training data, we devise a\nrobust automatic labeler based on few-shot prompting to enhance the augmented\ndata. Our experiments with in- and out-of-domain tasks reveal insights into the\ninterplay of different architectures, training regimes, and augmentation\nstrategies. LEAP's labeler has a clear positive impact on out-of-domain\ndatasets, while the resulting dense annotation provides native explainability.\n","authors":["Yifan Jiang","Filip Ilievski","Kaixin Ma"],"pdf_url":"https://arxiv.org/pdf/2304.13867v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11518v1","updated":"2023-11-20T03:44:32Z","published":"2023-11-20T03:44:32Z","title":"Multi-teacher Distillation for Multilingual Spelling Correction","summary":" Accurate spelling correction is a critical step in modern search interfaces,\nespecially in an era of mobile devices and speech-to-text interfaces. For\nservices that are deployed around the world, this poses a significant challenge\nfor multilingual NLP: spelling errors need to be caught and corrected in all\nlanguages, and even in queries that use multiple languages. In this paper, we\ntackle this challenge using multi-teacher distillation. On our approach, a\nmonolingual teacher model is trained for each language/locale, and these\nindividual models are distilled into a single multilingual student model\nintended to serve all languages/locales. In experiments using open-source data\nas well as user data from a worldwide search service, we show that this leads\nto highly effective spelling correction models that can meet the tight latency\nrequirements of deployed services.\n","authors":["Jingfen Zhang","Xuan Guo","Sravan Bodapati","Christopher Potts"],"pdf_url":"https://arxiv.org/pdf/2311.11518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11516v1","updated":"2023-11-20T03:42:24Z","published":"2023-11-20T03:42:24Z","title":"GPT in Data Science: A Practical Exploration of Model Selection","summary":" There is an increasing interest in leveraging Large Language Models (LLMs)\nfor managing structured data and enhancing data science processes. Despite the\npotential benefits, this integration poses significant questions regarding\ntheir reliability and decision-making methodologies. 
It highlights the\nimportance of various factors in the model selection process, including the\nnature of the data, problem type, performance metrics, computational resources,\ninterpretability vs accuracy, assumptions about data, and ethical\nconsiderations. Our objective is to elucidate and express the factors and\nassumptions guiding GPT-4's model selection recommendations. We employ a\nvariability model to depict these factors and use toy datasets to evaluate both\nthe model and the implementation of the identified heuristics. By contrasting\nthese outcomes with heuristics from other platforms, our aim is to determine\nthe effectiveness and distinctiveness of GPT-4's methodology. This research is\ncommitted to advancing our comprehension of AI decision-making processes,\nespecially in the realm of model selection within data science. Our efforts are\ndirected towards creating AI systems that are more transparent and\ncomprehensible, contributing to a more responsible and efficient practice in\ndata science.\n","authors":["Nathalia Nascimento","Cristina Tavares","Paulo Alencar","Donald Cowan"],"pdf_url":"https://arxiv.org/pdf/2311.11516v1.pdf","comment":"11 pages. To appear in IEEE BigData 2023"},{"id":"http://arxiv.org/abs/2310.09590v2","updated":"2023-11-20T03:29:23Z","published":"2023-10-14T14:23:44Z","title":"Solving Math Word Problems with Reexamination","summary":" Math word problem (MWP) solving aims to understand the descriptive math\nproblem and calculate the result, for which previous efforts are mostly devoted\nto upgrade different technical modules. This paper brings a different\nperspective of \\textit{reexamination process} during training by introducing a\npseudo-dual task to enhance the MWP solving. We propose a pseudo-dual (PseDual)\nlearning scheme to model such process, which is model-agnostic thus can be\nadapted to any existing MWP solvers. The pseudo-dual task is specifically\ndefined as filling the numbers in the expression back into the original word\nproblem with numbers masked. To facilitate the effective joint learning of the\ntwo tasks, we further design a scheduled fusion strategy for the number\ninfilling task, which smoothly switches the input from the ground-truth math\nexpressions to the predicted ones. Our pseudo-dual learning scheme has been\ntested and proven effective when being equipped in several representative MWP\nsolvers through empirical studies. \\textit{The codes and trained models are\navailable at:} \\url{https://github.com/steven640pixel/PsedualMWP}.\n\\end{abstract}\n","authors":["Yi Bin","Wenhao Shi","Yujuan Ding","Yang Yang","See-Kiong Ng"],"pdf_url":"https://arxiv.org/pdf/2310.09590v2.pdf","comment":"To be appeared at NeurIPS2023 Workshop on MATH-AI"},{"id":"http://arxiv.org/abs/2311.11509v1","updated":"2023-11-20T03:17:21Z","published":"2023-11-20T03:17:21Z","title":"Token-Level Adversarial Prompt Detection Based on Perplexity Measures\n and Contextual Information","summary":" In recent years, Large Language Models (LLM) have emerged as pivotal tools in\nvarious applications. However, these models are susceptible to adversarial\nprompt attacks, where attackers can carefully curate input strings that lead to\nundesirable outputs. The inherent vulnerability of LLMs stems from their\ninput-output mechanisms, especially when presented with intensely\nout-of-distribution (OOD) inputs. 
This paper proposes a token-level detection\nmethod to identify adversarial prompts, leveraging the LLM's capability to\npredict the next token's probability. We measure the degree of the model's\nperplexity and incorporate neighboring token information to encourage the\ndetection of contiguous adversarial prompt sequences. As a result, we propose\ntwo methods: one that identifies each token as either being part of an\nadversarial prompt or not, and another that estimates the probability of each\ntoken being part of an adversarial prompt.\n","authors":["Zhengmian Hu","Gang Wu","Saayan Mitra","Ruiyi Zhang","Tong Sun","Heng Huang","Vishy Swaminathan"],"pdf_url":"https://arxiv.org/pdf/2311.11509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10702v2","updated":"2023-11-20T02:01:33Z","published":"2023-11-17T18:45:45Z","title":"Camels in a Changing Climate: Enhancing LM Adaptation with Tulu 2","summary":" Since the release of T\\\"ULU [Wang et al., 2023b], open resources for\ninstruction tuning have developed quickly, from better base models to new\nfinetuning techniques. We test and incorporate a number of these advances into\nT\\\"ULU, resulting in T\\\"ULU 2, a suite of improved T\\\"ULU models for advancing\nthe understanding and best practices of adapting pretrained language models to\ndownstream tasks and user preferences. Concretely, we release: (1)\nT\\\"ULU-V2-mix, an improved collection of high-quality instruction datasets; (2)\nT\\\"ULU 2, LLAMA-2 models finetuned on the V2 mixture; (3) T\\\"ULU 2+DPO, T\\\"ULU\n2 models trained with direct preference optimization (DPO), including the\nlargest DPO-trained model to date (T\\\"ULU 2+DPO 70B); (4) CODE T\\\"ULU 2, CODE\nLLAMA models finetuned on our V2 mix that outperform CODE LLAMA and its\ninstruction-tuned variant, CODE LLAMA-Instruct. Our evaluation from multiple\nperspectives shows that the T\\\"ULU 2 suite achieves state-of-the-art\nperformance among open models and matches or exceeds the performance of\nGPT-3.5-turbo-0301 on several benchmarks. We release all the checkpoints, data,\ntraining and evaluation code to facilitate future open efforts on adapting\nlarge language models.\n","authors":["Hamish Ivison","Yizhong Wang","Valentina Pyatkin","Nathan Lambert","Matthew Peters","Pradeep Dasigi","Joel Jang","David Wadden","Noah A. Smith","Iz Beltagy","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2311.10702v2.pdf","comment":"technical report; fixed zephyr numbers"},{"id":"http://arxiv.org/abs/2311.11482v1","updated":"2023-11-20T01:51:13Z","published":"2023-11-20T01:51:13Z","title":"Meta Prompting for AGI Systems","summary":" This paper presents an in-depth exploration of Meta Prompting, a novel\ntechnique that revolutionizes the way large language models (LLMs), multi-modal\nfoundation models, and AI systems approach problem-solving and data\ninterpretation. Meta Prompting, rooted in type theory and category theory,\nprioritizes the structure and syntax of information, providing a unique\nframework that transcends traditional content-focused methods. We delve into\nthe formal definitions of Meta Prompting, contrasting it with Few-Shot\nPrompting, and highlight its applicability and superiority in various AI\napplications.\n Key to this exploration is the expansion of Meta Prompting into the realm of\ncomplex reasoning. Here, we demonstrate how this technique adeptly breaks down\nintricate problems into manageable sub-problems, facilitating a step-by-step,\ndetailed approach to problem-solving. 
This method proves especially\nadvantageous in terms of token efficiency and offering a fair comparison in\nproblem-solving scenarios, standing out against few-shot example approaches.\n Furthermore, the paper breaks new ground by extending Meta Prompting into\nmulti-modal foundation model settings. This extension addresses the integration\nof diverse data types, such as images, audio, and video, within the structured\nframework of Meta Prompting, highlighting both the challenges and the vast\npotential of this approach in handling complex, multi-faceted data (The code is\navailable at https://github.com/meta-prompting/meta-prompting).\n","authors":["Yifan Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.11482v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05619v2","updated":"2023-11-20T01:24:40Z","published":"2023-09-11T17:07:01Z","title":"Effective Proxy for Human Labeling: Ensemble Disagreement Scores in\n Large Language Models for Industrial NLP","summary":" Large language models (LLMs) have demonstrated significant capability to\ngeneralize across a large number of NLP tasks. For industry applications, it is\nimperative to assess the performance of the LLM on unlabeled production data\nfrom time to time to validate for a real-world setting. Human labeling to\nassess model error requires considerable expense and time delay. Here we\ndemonstrate that ensemble disagreement scores work well as a proxy for human\nlabeling for language models in zero-shot, few-shot, and fine-tuned settings,\nper our evaluation on keyphrase extraction (KPE) task. We measure fidelity of\nthe results by comparing to true error measured from human labeled ground\ntruth. We contrast with the alternative of using another LLM as a source of\nmachine labels, or silver labels. Results across various languages and domains\nshow disagreement scores provide a better estimation of model performance with\nmean average error (MAE) as low as 0.4% and on average 13.8% better than using\nsilver labels.\n","authors":["Wei Du","Laksh Advani","Yashmeet Gambhir","Daniel J Perry","Prashant Shiralkar","Zhengzheng Xing","Aaron Colak"],"pdf_url":"https://arxiv.org/pdf/2309.05619v2.pdf","comment":"Camera ready version for 2023 EMNLP (The Third Workshop on Natural\n Language Generation, Evaluation, and Metrics (GEM))"},{"id":"http://arxiv.org/abs/2305.16300v2","updated":"2023-11-20T01:16:17Z","published":"2023-05-25T17:53:42Z","title":"Landmark Attention: Random-Access Infinite Context Length for\n Transformers","summary":" While Transformers have shown remarkable success in natural language\nprocessing, their attention mechanism's large memory requirements have limited\ntheir ability to handle longer contexts. Prior approaches, such as recurrent\nmemory or retrieval-based augmentation, have either compromised the\nrandom-access flexibility of attention (i.e., the capability to select any\ntoken in the entire context) or relied on separate mechanisms for relevant\ncontext retrieval, which may not be compatible with the model's attention. In\nthis paper, we present a novel approach that allows access to the complete\ncontext while retaining random-access flexibility, closely resembling running\nattention on the entire context. Our method uses a landmark token to represent\neach block of the input and trains the attention to use it for selecting\nrelevant blocks, enabling retrieval of blocks directly through the attention\nmechanism instead of by relying on a separate mechanism. 
Our approach\nseamlessly integrates with specialized data structures and the system's memory\nhierarchy, enabling processing of arbitrarily long context lengths. We\ndemonstrate that our method can obtain comparable performance with\nTransformer-XL while significantly reducing the number of retrieved tokens in\neach step. Finally, we show that fine-tuning LLaMA 7B with our method\nsuccessfully extends its context length capacity to over 32k tokens, allowing\nfor inference at the context lengths of GPT-4. We release the implementation of\nlandmark attention and the code to reproduce our experiments at\nhttps://github.com/epfml/landmark-attention/.\n","authors":["Amirkeivan Mohtashami","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2305.16300v2.pdf","comment":"Published as a conference paper at NeurIPS 2023 - 37th Conference on\n Neural Information Processing Systems"},{"id":"http://arxiv.org/abs/2311.11477v1","updated":"2023-11-20T01:07:30Z","published":"2023-11-20T01:07:30Z","title":"What's left can't be right -- The remaining positional incompetence of\n contrastive vision-language models","summary":" Contrastive vision-language models like CLIP have been found to lack spatial\nunderstanding capabilities. In this paper we discuss the possible causes of\nthis phenomenon by analysing both datasets and embedding space. By focusing on\nsimple left-right positional relations, we show that this behaviour is entirely\npredictable, even with large-scale datasets, demonstrate that these relations\ncan be taught using synthetic data and show that this approach can generalise\nwell to natural images - improving the performance on left-right relations on\nVisual Genome Relations.\n","authors":["Nils Hoehing","Ellen Rushe","Anthony Ventresque"],"pdf_url":"https://arxiv.org/pdf/2311.11477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12233v1","updated":"2023-11-20T23:17:20Z","published":"2023-11-20T23:17:20Z","title":"Unifying Corroborative and Contributive Attributions in Large Language\n Models","summary":" As businesses, products, and services spring up around large language models,\nthe trustworthiness of these models hinges on the verifiability of their\noutputs. However, methods for explaining language model outputs largely fall\nacross two distinct fields of study which both use the term \"attribution\" to\nrefer to entirely separate techniques: citation generation and training data\nattribution. In many modern applications, such as legal document generation and\nmedical question answering, both types of attributions are important. In this\nwork, we argue for and present a unified framework of large language model\nattributions. We show how existing methods of different types of attribution\nfall under the unified framework. We also use the framework to discuss\nreal-world use cases where one or both types of attributions are required. 
We\nbelieve that this unified framework will guide the use case driven development\nof systems that leverage both types of attribution, as well as the\nstandardization of their evaluation.\n","authors":["Theodora Worledge","Judy Hanwen Shen","Nicole Meister","Caleb Winston","Carlos Guestrin"],"pdf_url":"https://arxiv.org/pdf/2311.12233v1.pdf","comment":"NeurIPS ATTRIB Workshop 2023"},{"id":"http://arxiv.org/abs/2307.03172v3","updated":"2023-11-20T23:09:34Z","published":"2023-07-06T17:54:11Z","title":"Lost in the Middle: How Language Models Use Long Contexts","summary":" While recent language models have the ability to take long contexts as input,\nrelatively little is known about how well they use longer context. We analyze\nthe performance of language models on two tasks that require identifying\nrelevant information in their input contexts: multi-document question answering\nand key-value retrieval. We find that performance can degrade significantly\nwhen changing the position of relevant information, indicating that current\nlanguage models do not robustly make use of information in long input contexts.\nIn particular, we observe that performance is often highest when relevant\ninformation occurs at the beginning or end of the input context, and\nsignificantly degrades when models must access relevant information in the\nmiddle of long contexts, even for explicitly long-context models. Our analysis\nprovides a better understanding of how language models use their input context\nand provides new evaluation protocols for future long-context language models.\n","authors":["Nelson F. Liu","Kevin Lin","John Hewitt","Ashwin Paranjape","Michele Bevilacqua","Fabio Petroni","Percy Liang"],"pdf_url":"https://arxiv.org/pdf/2307.03172v3.pdf","comment":"18 pages, 16 figures. Accepted for publication in Transactions of the\n Association for Computational Linguistics (TACL), 2023"},{"id":"http://arxiv.org/abs/2311.12179v1","updated":"2023-11-20T20:48:25Z","published":"2023-11-20T20:48:25Z","title":"Leveraging Closed-Access Multilingual Embedding for Automatic Sentence\n Alignment in Low Resource Languages","summary":" The importance of qualitative parallel data in machine translation has long\nbeen determined but it has always been very difficult to obtain such in\nsufficient quantity for the majority of world languages, mainly because of the\nassociated cost and also the lack of accessibility to these languages. Despite\nthe potential for obtaining parallel datasets from online articles using\nautomatic approaches, forensic investigations have found a lot of\nquality-related issues such as misalignment, and wrong language codes. In this\nwork, we present a simple but qualitative parallel sentence aligner that\ncarefully leveraged the closed-access Cohere multilingual embedding, a solution\nthat ranked second in the just concluded #CoHereAIHack 2023 Challenge (see\nhttps://ai6lagos.devpost.com). The proposed approach achieved $94.96$ and\n$54.83$ f1 scores on FLORES and MAFAND-MT, compared to $3.64$ and $0.64$ of\nLASER respectively. Our method also achieved an improvement of more than 5 BLEU\nscores over LASER, when the resulting datasets were used with MAFAND-MT dataset\nto train translation models. 
Our code and data are available for research\npurposes here (https://github.com/abumafrim/Cohere-Align).\n","authors":["Idris Abdulmumin","Auwal Abubakar Khalid","Shamsuddeen Hassan Muhammad","Ibrahim Said Ahmad","Lukman Jibril Aliyu","Babangida Sani","Bala Mairiga Abduljalil","Sani Ahmad Hassan"],"pdf_url":"https://arxiv.org/pdf/2311.12179v1.pdf","comment":"To appear in the proceedings of ICCAIT 2023. 6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2311.06233v3","updated":"2023-11-20T20:43:10Z","published":"2023-11-10T18:48:58Z","title":"Data Contamination Quiz: A Tool to Detect and Estimate Contamination in\n Large Language Models","summary":" We propose the Data Contamination Quiz, a simple and effective approach to\ndetect data contamination in large language models (LLMs) and estimate the\namount of it. Specifically, we frame data contamination detection as a series\nof multiple-choice questions. We devise a quiz format wherein three perturbed\nversions of each dataset instance are created. These changes only include\nword-level perturbations, replacing words with their contextual synonyms,\nensuring both the semantic and sentence structure remain exactly the same as\nthe original instance. Together with the original instance, these perturbed\nversions constitute the choices in the quiz. Given that the only distinguishing\nsignal among these choices is the exact wording, an LLM, when tasked with\nidentifying the original instance from the choices, opts for the original if it\nhas memorized it in its pre-training phase--a trait intrinsic to LLMs. A\ndataset partition is then marked as contaminated if the LLM's performance on\nthe quiz surpasses what random chance suggests. Our evaluation spans seven\ndatasets and their respective splits (train and test/validation) on two\nstate-of-the-art LLMs: GPT-4 and GPT-3.5. While lacking access to the\npre-training data, our results suggest that our approach not only enhances the\ndetection of data contamination but also provides an accurate estimation of its\nextent, even when the contamination signal is weak.\n","authors":["Shahriar Golchin","Mihai Surdeanu"],"pdf_url":"https://arxiv.org/pdf/2311.06233v3.pdf","comment":"v1.2 preprint"},{"id":"http://arxiv.org/abs/2301.02748v4","updated":"2023-11-20T19:45:49Z","published":"2023-01-06T23:34:52Z","title":"Generative Antibody Design for Complementary Chain Pairing Sequences\n through Encoder-Decoder Language Model","summary":" Current protein language models (pLMs) predominantly focus on single-chain\nprotein sequences and often have not accounted for constraints on generative\ndesign imposed by protein-protein interactions. To address this gap, we present\npaired Antibody T5 (pAbT5), an encoder-decoder model to generate complementary\nheavy or light chain from its pairing partner. We show that our model respects\nconservation in framework regions and variability in hypervariable domains,\ndemonstrated by agreement with sequence alignment and variable-length CDR\nloops. We also show that our model captures chain pairing preferences through\nthe recovery of ground-truth chain type and gene families. Our results showcase\nthe potential of pAbT5 in generative antibody design, incorporating biological\nconstraints from chain pairing preferences.\n","authors":["Simon K. S. Chu","Kathy Y. 
Wei"],"pdf_url":"https://arxiv.org/pdf/2301.02748v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12131v1","updated":"2023-11-20T19:28:52Z","published":"2023-11-20T19:28:52Z","title":"Human Learning by Model Feedback: The Dynamics of Iterative Prompting\n with Midjourney","summary":" Generating images with a Text-to-Image model often requires multiple trials,\nwhere human users iteratively update their prompt based on feedback, namely the\noutput image. Taking inspiration from cognitive work on reference games and\ndialogue alignment, this paper analyzes the dynamics of the user prompts along\nsuch iterations. We compile a dataset of iterative interactions of human users\nwith Midjourney. Our analysis then reveals that prompts predictably converge\ntoward specific traits along these iterations. We further study whether this\nconvergence is due to human users, realizing they missed important details, or\ndue to adaptation to the model's ``preferences'', producing better images for a\nspecific language style. We show initial evidence that both possibilities are\nat play. The possibility that users adapt to the model's preference raises\nconcerns about reusing user data for further training. The prompts may be\nbiased towards the preferences of a specific model, rather than align with\nhuman intentions and natural manner of expression.\n","authors":["Shachar Don-Yehiya","Leshem Choshen","Omri Abend"],"pdf_url":"https://arxiv.org/pdf/2311.12131v1.pdf","comment":"EMNLP23"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.12028v1","updated":"2023-11-20T18:59:51Z","published":"2023-11-20T18:59:51Z","title":"Hourglass Tokenizer for Efficient Transformer-Based 3D Human Pose\n Estimation","summary":" Transformers have been successfully applied in the field of video-based 3D\nhuman pose estimation. However, the high computational costs of these video\npose transformers (VPTs) make them impractical on resource-constrained devices.\nIn this paper, we present a plug-and-play pruning-and-recovering framework,\ncalled Hourglass Tokenizer (HoT), for efficient transformer-based 3D human pose\nestimation from videos. Our HoT begins with pruning pose tokens of redundant\nframes and ends with recovering full-length tokens, resulting in a few pose\ntokens in the intermediate transformer blocks and thus improving the model\nefficiency. To effectively achieve this, we propose a token pruning cluster\n(TPC) that dynamically selects a few representative tokens with high semantic\ndiversity while eliminating the redundancy of video frames. In addition, we\ndevelop a token recovering attention (TRA) to restore the detailed\nspatio-temporal information based on the selected tokens, thereby expanding the\nnetwork output to the original full-length temporal resolution for fast\ninference. Extensive experiments on two benchmark datasets (i.e., Human3.6M and\nMPI-INF-3DHP) demonstrate that our method can achieve both high efficiency and\nestimation accuracy compared to the original VPT models. For instance, applying\nto MotionBERT and MixSTE on Human3.6M, our HoT can save nearly 50% FLOPs\nwithout sacrificing accuracy and nearly 40% FLOPs with only 0.2% accuracy drop,\nrespectively. 
Our source code will be open-sourced.\n","authors":["Wenhao Li","Mengyuan Liu","Hong Liu","Pichao Wang","Jialun Cai","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2311.12028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12024v1","updated":"2023-11-20T18:57:55Z","published":"2023-11-20T18:57:55Z","title":"PF-LRM: Pose-Free Large Reconstruction Model for Joint Pose and Shape\n Prediction","summary":" We propose a Pose-Free Large Reconstruction Model (PF-LRM) for reconstructing\na 3D object from a few unposed images even with little visual overlap, while\nsimultaneously estimating the relative camera poses in ~1.3 seconds on a single\nA100 GPU. PF-LRM is a highly scalable method utilizing the self-attention\nblocks to exchange information between 3D object tokens and 2D image tokens; we\npredict a coarse point cloud for each view, and then use a differentiable\nPerspective-n-Point (PnP) solver to obtain camera poses. When trained on a huge\namount of multi-view posed data of ~1M objects, PF-LRM shows strong\ncross-dataset generalization ability, and outperforms baseline methods by a\nlarge margin in terms of pose prediction accuracy and 3D reconstruction quality\non various unseen evaluation datasets. We also demonstrate our model's\napplicability in downstream text/image-to-3D task with fast feed-forward\ninference. Our project website is at: https://totoro97.github.io/pf-lrm .\n","authors":["Peng Wang","Hao Tan","Sai Bi","Yinghao Xu","Fujun Luan","Kalyan Sunkavalli","Wenping Wang","Zexiang Xu","Kai Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12024v1.pdf","comment":"Project website: https://totoro97.github.io/pf-lrm"},{"id":"http://arxiv.org/abs/2311.12015v1","updated":"2023-11-20T18:54:39Z","published":"2023-11-20T18:54:39Z","title":"GPT-4V(ision) for Robotics: Multimodal Task Planning from Human\n Demonstration","summary":" We introduce a pipeline that enhances a general-purpose Vision Language\nModel, GPT-4V(ision), by integrating observations of human actions to\nfacilitate robotic manipulation. This system analyzes videos of humans\nperforming tasks and creates executable robot programs that incorporate\naffordance insights. The computation starts by analyzing the videos with GPT-4V\nto convert environmental and action details into text, followed by a\nGPT-4-empowered task planner. In the following analyses, vision systems\nreanalyze the video with the task plan. Object names are grounded using an\nopen-vocabulary object detector, while focus on the hand-object relation helps\nto detect the moment of grasping and releasing. This spatiotemporal grounding\nallows the vision systems to further gather affordance data (e.g., grasp type,\nway points, and body postures). Experiments across various scenarios\ndemonstrate this method's efficacy in achieving real robots' operations from\nhuman demonstrations in a zero-shot manner. The prompts of GPT-4V/GPT-4 are\navailable at this project page:\nhttps://microsoft.github.io/GPT4Vision-Robot-Manipulation-Prompts/\n","authors":["Naoki Wake","Atsushi Kanehira","Kazuhiro Sasabuchi","Jun Takamatsu","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2311.12015v1.pdf","comment":"8 pages, 10 figures, 1 table. 
Last updated on November 20th, 2023"},{"id":"http://arxiv.org/abs/2311.11992v1","updated":"2023-11-20T18:23:41Z","published":"2023-11-20T18:23:41Z","title":"Exploring Lip Segmentation Techniques in Computer Vision: A Comparative\n Analysis","summary":" Lip segmentation is crucial in computer vision, especially for lip reading.\nDespite extensive face segmentation research, lip segmentation has received\nlimited attention. The aim of this study is to compare state-of-the-art lip\nsegmentation models using a standardized setting and a publicly available\ndataset. Five techniques, namely EHANet, Mask2Former, BiSeNet V2, PIDNet, and\nSTDC1, are qualitatively selected based on their reported performance,\ninference time, code availability, recency, and popularity. The CelebAMask-HQ\ndataset, comprising manually annotated face images, is used to fairly assess\nthe lip segmentation performance of the selected models. Inference experiments\nare conducted on a Raspberry Pi4 to emulate limited computational resources.\nThe results show that Mask2Former and EHANet have the best performances in\nterms of mIoU score. BiSeNet V2 demonstrate competitive performance, while\nPIDNet excels in recall but has lower precision. Most models present inference\ntime ranging from 1000 to around 3000 milliseconds on a Raspberry Pi4, with\nPIDNet having the lowest mean inference time. This study provides a\ncomprehensive evaluation of lip segmentation models, highlighting their\nperformance and inference times. The findings contribute to the development of\nlightweight techniques and establish benchmarks for future advances in lip\nsegmentation, especially in IoT and edge computing scenarios.\n","authors":["Pietro B. S. Masur","Francisco Braulio Oliveira","Lucas Moreira Medino","Emanuel Huber","Milene Haraguchi Padilha","Cassio de Alcantara","Renata Sellaro"],"pdf_url":"https://arxiv.org/pdf/2311.11992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11988v1","updated":"2023-11-20T18:21:18Z","published":"2023-11-20T18:21:18Z","title":"Categorizing the Visual Environment and Analyzing the Visual Attention\n of Dogs","summary":" Dogs have a unique evolutionary relationship with humans and serve many\nimportant roles e.g. search and rescue, blind assistance, emotional support.\nHowever, few datasets exist to categorize visual features and objects available\nto dogs, as well as how dogs direct their visual attention within their\nenvironment. We collect and study a dataset with over 11,698 gazes to\ncategorize the objects available to be gazed at by 11 dogs in everyday outdoor\nenvironments i.e. a walk around a college campus and urban area. We explore the\navailability of these object categories and the visual attention of dogs over\nthese categories using a head mounted eye tracking apparatus. A small portion\n(approx. 600 images or < 20% of total dataset) of the collected data is used to\nfine tune a MaskRCNN for the novel image domain to segment objects present in\nthe scene, enabling further statistical analysis on the visual gaze tendencies\nof dogs. The MaskRCNN, with eye tracking apparatus, serves as an end to end\nmodel for automatically classifying the visual fixations of dogs. The fine\ntuned MaskRCNN performs far better than chance. There are few individual\ndifferences between the 11 dogs and we observe greater visual fixations on\nbuses, plants, pavement, and construction equipment. 
This work takes a step\ntowards understanding visual behavior of dogs and their interaction with the\nphysical world.\n","authors":["Shreyas Sundara Raman","Madeline H. Pelgrim","Daphna Buchsbaum","Thomas Serre"],"pdf_url":"https://arxiv.org/pdf/2311.11988v1.pdf","comment":"13 pages, 11 figures, 1 table, WACV CV4Smalls Workshop"},{"id":"http://arxiv.org/abs/2311.11980v1","updated":"2023-11-20T18:14:53Z","published":"2023-11-20T18:14:53Z","title":"Leveraging Previous Facial Action Units Knowledge for Emotion\n Recognition on Faces","summary":" People naturally understand emotions, thus permitting a machine to do the\nsame could open new paths for human-computer interaction. Facial expressions\ncan be very useful for emotion recognition techniques, as these are the biggest\ntransmitters of non-verbal cues capable of being correlated with emotions.\nSeveral techniques are based on Convolutional Neural Networks (CNNs) to extract\ninformation in a machine learning process. However, simple CNNs are not always\nsufficient to locate points of interest on the face that can be correlated with\nemotions. In this work, we intend to expand the capacity of emotion recognition\ntechniques by proposing the usage of Facial Action Units (AUs) recognition\ntechniques to recognize emotions. This recognition will be based on the Facial\nAction Coding System (FACS) and computed by a machine learning system. In\nparticular, our method expands over EmotiRAM, an approach for multi-cue emotion\nrecognition, in which we improve over their facial encoding module.\n","authors":["Pietro B. S. Masur","Willams Costa","Lucas S. Figueredo","Veronica Teichrieb"],"pdf_url":"https://arxiv.org/pdf/2311.11980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11974v1","updated":"2023-11-20T18:02:20Z","published":"2023-11-20T18:02:20Z","title":"Evaluating Supervision Levels Trade-Offs for Infrared-Based People\n Counting","summary":" Object detection models are commonly used for people counting (and\nlocalization) in many applications but require a dataset with costly bounding\nbox annotations for training. Given the importance of privacy in people\ncounting, these models rely more and more on infrared images, making the task\neven harder. In this paper, we explore how weaker levels of supervision can\naffect the performance of deep person counting architectures for image\nclassification and point-level localization. Our experiments indicate that\ncounting people using a CNN Image-Level model achieves competitive results with\nYOLO detectors and point-level models, yet provides a higher frame rate and a\nsimilar amount of model parameters.\n","authors":["David Latortue","Moetez Kdayem","Fidel A Guerrero Peña","Eric Granger","Marco Pedersoli"],"pdf_url":"https://arxiv.org/pdf/2311.11974v1.pdf","comment":"Accepted in IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2024"},{"id":"http://arxiv.org/abs/2311.11971v1","updated":"2023-11-20T17:59:28Z","published":"2023-11-20T17:59:28Z","title":"LiDAR-HMR: 3D Human Mesh Recovery from LiDAR","summary":" In recent years, point cloud perception tasks have been garnering increasing\nattention. This paper presents the first attempt to estimate 3D human body mesh\nfrom sparse LiDAR point clouds. We found that the major challenge in estimating\nhuman pose and mesh from point clouds lies in the sparsity, noise, and\nincompletion of LiDAR point clouds. 
Facing these challenges, we propose an\neffective sparse-to-dense reconstruction scheme to reconstruct 3D human mesh.\nThis involves estimating a sparse representation of a human (3D human pose) and\ngradually reconstructing the body mesh. To better leverage the 3D structural\ninformation of point clouds, we employ a cascaded graph transformer\n(graphormer) to introduce point cloud features during sparse-to-dense\nreconstruction. Experimental results on three publicly available databases\ndemonstrate the effectiveness of the proposed approach. Code:\nhttps://github.com/soullessrobot/LiDAR-HMR/\n","authors":["Bohao Fan","Wenzhao Zheng","Jianjiang Feng","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.11971v1.pdf","comment":"Code is available at: https://github.com/soullessrobot/LiDAR-HMR/"},{"id":"http://arxiv.org/abs/2311.11969v1","updated":"2023-11-20T17:59:03Z","published":"2023-11-20T17:59:03Z","title":"SA-Med2D-20M Dataset: Segment Anything in 2D Medical Imaging with 20\n Million masks","summary":" Segment Anything Model (SAM) has achieved impressive results for natural\nimage segmentation with input prompts such as points and bounding boxes. Its\nsuccess largely owes to massive labeled training data. However, directly\napplying SAM to medical image segmentation cannot perform well because SAM\nlacks medical knowledge -- it does not use medical images for training. To\nincorporate medical knowledge into SAM, we introduce SA-Med2D-20M, a\nlarge-scale segmentation dataset of 2D medical images built upon numerous\npublic and private datasets. It consists of 4.6 million 2D medical images and\n19.7 million corresponding masks, covering almost the whole body and showing\nsignificant diversity. This paper describes all the datasets collected in\nSA-Med2D-20M and details how to process these datasets. Furthermore,\ncomprehensive statistics of SA-Med2D-20M are presented to facilitate the better\nuse of our dataset, which can help the researchers build medical vision\nfoundation models or apply their models to downstream medical applications. We\nhope that the large scale and diversity of SA-Med2D-20M can be leveraged to\ndevelop medical artificial intelligence for enhancing diagnosis, medical image\nanalysis, knowledge sharing, and education. The data with the redistribution\nlicense is publicly available at https://github.com/OpenGVLab/SAM-Med2D.\n","authors":["Jin Ye","Junlong Cheng","Jianpin Chen","Zhongying Deng","Tianbin Li","Haoyu Wang","Yanzhou Su","Ziyan Huang","Jilong Chen","Lei Jiang","Hui Sun","Min Zhu","Shaoting Zhang","Junjun He","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2311.11969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11963v1","updated":"2023-11-20T17:43:09Z","published":"2023-11-20T17:43:09Z","title":"What Can AutoML Do For Continual Learning?","summary":" This position paper outlines the potential of AutoML for incremental\n(continual) learning to encourage more research in this direction. Incremental\nlearning involves incorporating new data from a stream of tasks and\ndistributions to learn enhanced deep representations and adapt better to new\ntasks. However, a significant limitation of incremental learners is that most\ncurrent techniques freeze the backbone architecture, hyperparameters, and the\norder & structure of the learning tasks throughout the learning and adaptation\nprocess. We strongly believe that AutoML offers promising solutions to address\nthese limitations, enabling incremental learning to adapt to more diverse\nreal-world tasks. 
Therefore, instead of directly proposing a new method, this\npaper takes a step back by posing the question: \"What can AutoML do for\nincremental learning?\" We outline three key areas of research that can\ncontribute to making incremental learners more dynamic, highlighting concrete\nopportunities to apply AutoML methods in novel ways as well as entirely new\nchallenges for AutoML research.\n","authors":["Mert Kilickaya","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2311.11963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11961v1","updated":"2023-11-20T17:38:35Z","published":"2023-11-20T17:38:35Z","title":"NNG-Mix: Improving Semi-supervised Anomaly Detection with Pseudo-anomaly\n Generation","summary":" Anomaly detection (AD) is essential in identifying rare and often critical\nevents in complex systems, finding applications in fields such as network\nintrusion detection, financial fraud detection, and fault detection in\ninfrastructure and industrial systems. While AD is typically treated as an\nunsupervised learning task due to the high cost of label annotation, it is more\npractical to assume access to a small set of labeled anomaly samples from\ndomain experts, as is the case for semi-supervised anomaly detection.\nSemi-supervised and supervised approaches can leverage such labeled data,\nresulting in improved performance. In this paper, rather than proposing a new\nsemi-supervised or supervised approach for AD, we introduce a novel algorithm\nfor generating additional pseudo-anomalies on the basis of the limited labeled\nanomalies and a large volume of unlabeled data. This serves as an augmentation\nto facilitate the detection of new anomalies. Our proposed algorithm, named\nNearest Neighbor Gaussian Mixup (NNG-Mix), efficiently integrates information\nfrom both labeled and unlabeled data to generate pseudo-anomalies. We compare\nthe performance of this novel algorithm with commonly applied augmentation\ntechniques, such as Mixup and Cutout. We evaluate NNG-Mix by training various\nexisting semi-supervised and supervised anomaly detection algorithms on the\noriginal training data along with the generated pseudo-anomalies. Through\nextensive experiments on 57 benchmark datasets in ADBench, reflecting different\ndata types, we demonstrate that NNG-Mix outperforms other data augmentation\nmethods. It yields significant performance improvements compared to the\nbaselines trained exclusively on the original training data. Notably, NNG-Mix\nyields up to 16.4%, 8.8%, and 8.0% improvements on Classical, CV, and NLP\ndatasets in ADBench. Our source code will be available at\nhttps://github.com/donghao51/NNG-Mix.\n","authors":["Hao Dong","Gaëtan Frusque","Yue Zhao","Eleni Chatzi","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2311.11961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13849v2","updated":"2023-11-20T17:23:17Z","published":"2023-10-20T22:47:40Z","title":"A Dual-Stream Neural Network Explains the Functional Segregation of\n Dorsal and Ventral Visual Pathways in Human Brains","summary":" The human visual system uses two parallel pathways for spatial processing and\nobject recognition. In contrast, computer vision systems tend to use a single\nfeedforward pathway, rendering them less robust, adaptive, or efficient than\nhuman vision. To bridge this gap, we developed a dual-stream vision model\ninspired by the human eyes and brain. 
At the input level, the model samples two\ncomplementary visual patterns to mimic how the human eyes use magnocellular and\nparvocellular retinal ganglion cells to separate retinal inputs to the brain.\nAt the backend, the model processes the separate input patterns through two\nbranches of convolutional neural networks (CNN) to mimic how the human brain\nuses the dorsal and ventral cortical pathways for parallel visual processing.\nThe first branch (WhereCNN) samples a global view to learn spatial attention\nand control eye movements. The second branch (WhatCNN) samples a local view to\nrepresent the object around the fixation. Over time, the two branches interact\nrecurrently to build a scene representation from moving fixations. We compared\nthis model with the human brains processing the same movie and evaluated their\nfunctional alignment by linear transformation. The WhereCNN and WhatCNN\nbranches were found to differentially match the dorsal and ventral pathways of\nthe visual cortex, respectively, primarily due to their different learning\nobjectives. These model-based results lead us to speculate that the distinct\nresponses and representations of the ventral and dorsal streams are more\ninfluenced by their distinct goals in visual attention and object recognition\nthan by their specific bias or selectivity in retinal inputs. This dual-stream\nmodel takes a further step in brain-inspired computer vision, enabling parallel\nneural networks to actively explore and understand the visual surroundings.\n","authors":["Minkyu Choi","Kuan Han","Xiaokai Wang","Yizhen Zhang","Zhongming Liu"],"pdf_url":"https://arxiv.org/pdf/2310.13849v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11919v1","updated":"2023-11-20T16:54:07Z","published":"2023-11-20T16:54:07Z","title":"An Image is Worth Multiple Words: Multi-attribute Inversion for\n Constrained Text-to-Image Synthesis","summary":" We consider the problem of constraining diffusion model outputs with a\nuser-supplied reference image. Our key objective is to extract multiple\nattributes (e.g., color, object, layout, style) from this single reference\nimage, and then generate new samples with them. One line of existing work\nproposes to invert the reference images into a single textual conditioning\nvector, enabling generation of new samples with this learned token. These\nmethods, however, do not learn multiple tokens that are necessary to condition\nmodel outputs on the multiple attributes noted above. Another line of\ntechniques expand the inversion space to learn multiple embeddings but they do\nthis only along the layer dimension (e.g., one per layer of the DDPM model) or\nthe timestep dimension (one for a set of timesteps in the denoising process),\nleading to suboptimal attribute disentanglement. To address the aforementioned\ngaps, the first contribution of this paper is an extensive analysis to\ndetermine which attributes are captured in which dimension of the denoising\nprocess. As noted above, we consider both the time-step dimension (in reverse\ndenoising) as well as the DDPM model layer dimension. We observe that often a\nsubset of these attributes are captured in the same set of model layers and/or\nacross same denoising timesteps. For instance, color and style are captured\nacross same U-Net layers, whereas layout and color are captured across same\ntimestep stages. Consequently, an inversion process that is designed only for\nthe time-step dimension or the layer dimension is insufficient to disentangle\nall attributes. 
This leads to our second contribution where we design a new\nmulti-attribute inversion algorithm, MATTE, with associated\ndisentanglement-enhancing regularization losses, that operates across both\ndimensions and explicitly leads to four disentangled tokens (color, style,\nlayout, and object).\n","authors":["Aishwarya Agarwal","Srikrishna Karanam","Tripti Shukla","Balaji Vasan Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2311.11919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11910v1","updated":"2023-11-20T16:40:48Z","published":"2023-11-20T16:40:48Z","title":"Generalization of Fitness Exercise Recognition from Doppler Measurements\n by Domain-adaption and Few-Shot Learning","summary":" In previous works, a mobile application was developed using an unmodified\ncommercial off-the-shelf smartphone to recognize whole-body exercises. The\nworking principle was based on the ultrasound Doppler sensing with the device\nbuilt-in hardware. Applying such a lab-environment trained model on realistic\napplication variations causes a significant drop in performance, and thus\ndecimate its applicability. The reason of the reduced performance can be\nmanifold. It could be induced by the user, environment, and device variations\nin realistic scenarios. Such scenarios are often more complex and diverse,\nwhich can be challenging to anticipate in the initial training data. To study\nand overcome this issue, this paper presents a database with controlled and\nuncontrolled subsets of fitness exercises. We propose two concepts to utilize\nsmall adaption data to successfully improve model generalization in an\nuncontrolled environment, increasing the recognition accuracy by two to six\nfolds compared to the baseline for different users.\n","authors":["Biying Fu","Naser Damer","Florian Kirchbuchner","Arjan Kuijper"],"pdf_url":"https://arxiv.org/pdf/2311.11910v1.pdf","comment":"accepted at International Conference on Pattern Recognition (ICPR)\n workshop 2021"},{"id":"http://arxiv.org/abs/2311.11908v1","updated":"2023-11-20T16:40:29Z","published":"2023-11-20T16:40:29Z","title":"Continual Learning: Applications and the Road Forward","summary":" Continual learning is a sub-field of machine learning, which aims to allow\nmachine learning models to continuously learn on new data, by accumulating\nknowledge without forgetting what was learned in the past. In this work, we\ntake a step back, and ask: \"Why should one care about continual learning in the\nfirst place?\". We set the stage by surveying recent continual learning papers\npublished at three major machine learning conferences, and show that\nmemory-constrained settings dominate the field. Then, we discuss five open\nproblems in machine learning, and even though they seem unrelated to continual\nlearning at first sight, we show that continual learning will inevitably be\npart of their solution. These problems are model-editing, personalization,\non-device learning, faster (re-)training and reinforcement learning. Finally,\nby comparing the desiderata from these unsolved problems and the current\nassumptions in continual learning, we highlight and discuss four future\ndirections for continual learning research. We hope that this work offers an\ninteresting perspective on the future of continual learning, while displaying\nits potential value and the paths we have to pursue in order to make it\nsuccessful. 
This work is the result of the many discussions the authors had at\nthe Dagstuhl seminar on Deep Continual Learning, in March 2023.\n","authors":["Eli Verwimp","Shai Ben-David","Matthias Bethge","Andrea Cossu","Alexander Gepperth","Tyler L. Hayes","Eyke Hüllermeier","Christopher Kanan","Dhireesha Kudithipudi","Christoph H. Lampert","Martin Mundt","Razvan Pascanu","Adrian Popescu","Andreas S. Tolias","Joost van de Weijer","Bing Liu","Vincenzo Lomonaco","Tinne Tuytelaars","Gido M. van de Ven"],"pdf_url":"https://arxiv.org/pdf/2311.11908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11904v1","updated":"2023-11-20T16:37:45Z","published":"2023-11-20T16:37:45Z","title":"LLMs as Visual Explainers: Advancing Image Classification with Evolving\n Visual Descriptions","summary":" Vision-language models (VLMs) offer a promising paradigm for image\nclassification by comparing the similarity between images and class embeddings.\nA critical challenge lies in crafting precise textual representations for class\nnames. While previous studies have leveraged recent advancements in large\nlanguage models (LLMs) to enhance these descriptors, their outputs often suffer\nfrom ambiguity and inaccuracy. We identify two primary causes: 1) The prevalent\nreliance on textual interactions with LLMs, leading to a mismatch between the\ngenerated text and the visual content in VLMs' latent space - a phenomenon we\nterm the \"explain without seeing\" dilemma. 2) The oversight of the inter-class\nrelationships, resulting in descriptors that fail to differentiate similar\nclasses effectively. To address these issues, we propose a novel image\nclassification framework combining VLMs with LLMs, named Iterative Optimization\nwith Visual Feedback. In particular, our method develops an LLM-based agent,\nemploying an evolutionary optimization strategy to refine class descriptors.\nCrucially, we incorporate visual feedback from VLM classification metrics,\nthereby guiding the optimization process with concrete visual data. Our method\nleads to improving accuracy on a wide range of image classification benchmarks,\nwith 3.47\\% average gains over state-of-the-art methods. We also highlight the\nresulting descriptions serve as explainable and robust features that can\nconsistently improve the performance across various backbone models.\n","authors":["Songhao Han","Le Zhuo","Yue Liao","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2311.11904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.13677v3","updated":"2023-11-20T16:37:05Z","published":"2021-11-26T18:59:38Z","title":"SWAT: Spatial Structure Within and Among Tokens","summary":" Modeling visual data as tokens (i.e., image patches) using attention\nmechanisms, feed-forward networks or convolutions has been highly effective in\nrecent years. Such methods usually have a common pipeline: a tokenization\nmethod, followed by a set of layers/blocks for information mixing, both within\nand among tokens. When image patches are converted into tokens, they are often\nflattened, discarding the spatial structure within each patch. As a result, any\nprocessing that follows (eg: multi-head self-attention) may fail to recover\nand/or benefit from such information. In this paper, we argue that models can\nhave significant gains when spatial structure is preserved during tokenization,\nand is explicitly used during the mixing stage. 
We propose two key\ncontributions: (1) Structure-aware Tokenization and (2) Structure-aware\nMixing, both of which can be combined with existing models with minimal effort.\nWe introduce a family of models (SWAT), showing improvements over the likes of\nDeiT, MLP-Mixer and Swin Transformer, across multiple benchmarks including\nImageNet classification and ADE20K segmentation. Our code is available at\nhttps://github.com/kkahatapitiya/SWAT.\n","authors":["Kumara Kahatapitiya","Michael S. Ryoo"],"pdf_url":"https://arxiv.org/pdf/2111.13677v3.pdf","comment":"Accepted to be published at IJCAI23"},{"id":"http://arxiv.org/abs/2311.11901v1","updated":"2023-11-20T16:35:16Z","published":"2023-11-20T16:35:16Z","title":"Identifying the Defective: Detecting Damaged Grains for Cereal\n Appearance Inspection","summary":" Cereal grain plays a crucial role in the human diet as a major source of\nessential nutrients. Grain Appearance Inspection (GAI) serves as an essential\nprocess to determine grain quality and facilitate grain circulation and\nprocessing. However, GAI is routinely performed manually by inspectors with\ncumbersome procedures, which poses a significant bottleneck in smart\nagriculture.\n In this paper, we endeavor to develop an automated GAI system: AI4GrainInsp.\nBy analyzing the distinctive characteristics of grain kernels, we formulate GAI\nas a ubiquitous problem: Anomaly Detection (AD), in which healthy and edible\nkernels are considered normal samples while damaged grains or unknown objects\nare regarded as anomalies. We further propose an AD model, called AD-GAI, which\nis trained using only normal samples yet can identify anomalies during\ninference. Moreover, we customize a prototype device for data acquisition and\ncreate a large-scale dataset including 220K high-quality images of wheat and\nmaize kernels. Through extensive experiments, AD-GAI achieves considerable\nperformance in comparison with advanced AD methods, and AI4GrainInsp is highly\nconsistent with human experts while delivering an over 20x speedup in inspection\nefficiency. The dataset, code and models will be released at\nhttps://github.com/hellodfan/AI4GrainInsp.\n","authors":["Lei Fan","Yiwen Ding","Dongdong Fan","Yong Wu","Maurice Pagnucco","Yang Song"],"pdf_url":"https://arxiv.org/pdf/2311.11901v1.pdf","comment":"Accepted by ECAI2023. https://github.com/hellodfan/AI4GrainInsp"},{"id":"http://arxiv.org/abs/2311.11888v1","updated":"2023-11-20T16:21:37Z","published":"2023-11-20T16:21:37Z","title":"SniffyArt: The Dataset of Smelling Persons","summary":" Smell gestures play a crucial role in the investigation of past smells in the\nvisual arts, yet their automated recognition poses significant challenges. This\npaper introduces the SniffyArt dataset, consisting of 1941 individuals\nrepresented in 441 historical artworks. Each person is annotated with a tightly\nfitting bounding box, 17 pose keypoints, and a gesture label. By integrating\nthese annotations, the dataset enables the development of hybrid classification\napproaches for smell gesture recognition. The dataset's high-quality human pose\nestimation keypoints are achieved through the merging of five separate sets of\nkeypoint annotations per person. The paper also presents a baseline analysis,\nevaluating the performance of representative algorithms for detection, keypoint\nestimation, and classification tasks, showcasing the potential of combining\nkeypoint estimation with smell gesture classification. 
The SniffyArt dataset\nlays a solid foundation for future research and the exploration of multi-task\napproaches leveraging pose keypoints and person boxes to advance human gesture\nand olfactory dimension analysis in historical artworks.\n","authors":["Mathias Zinnen","Azhar Hussian","Hang Tran","Prathmesh Madhu","Andreas Maier","Vincent Christlein"],"pdf_url":"https://arxiv.org/pdf/2311.11888v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.11882v1","updated":"2023-11-20T16:19:46Z","published":"2023-11-20T16:19:46Z","title":"Multi-Task Faces (MTF) Data Set: A Legally and Ethically Compliant\n Collection of Face Images for Various Classification Tasks","summary":" Human facial data hold tremendous potential to address a variety of\nclassification problems, including face recognition, age estimation, gender\nidentification, emotion analysis, and race classification. However, recent\nprivacy regulations, such as the EU General Data Protection Regulation and\nothers, have restricted the ways in which human images may be collected and\nused for research. As a result, several previously published data sets\ncontaining human faces have been removed from the internet due to inadequate\ndata collection methods that failed to meet privacy regulations. Data sets\nconsisting of synthetic data have been proposed as an alternative, but they\nfall short of accurately representing the real data distribution. On the other\nhand, most available data sets are labeled for just a single task, which limits\ntheir applicability. To address these issues, we present the Multi-Task Faces\n(MTF) image data set, a meticulously curated collection of face images designed\nfor various classification tasks, including face recognition, as well as race,\ngender, and age classification. The MTF data set has been ethically gathered by\nleveraging publicly available images of celebrities and strictly adhering to\ncopyright regulations. In this paper, we present this data set and provide\ndetailed descriptions of the followed data collection and processing\nprocedures. Furthermore, we evaluate the performance of five deep learning (DL)\nmodels on the MTF data set across the aforementioned classification tasks.\nAdditionally, we compare the performance of DL models over the processed MTF\ndata and over raw data crawled from the internet. The reported results\nconstitute a baseline for further research employing these data. The MTF data\nset can be accessed through the following link (please cite the present paper\nif you use the data set): https://github.com/RamiHaf/MTF_data_set\n","authors":["Rami Haffar","David Sánchez","Josep Domingo-Ferrer"],"pdf_url":"https://arxiv.org/pdf/2311.11882v1.pdf","comment":"21 pages, 2 figures, 9 Tables,"},{"id":"http://arxiv.org/abs/2310.04741v3","updated":"2023-11-20T16:09:07Z","published":"2023-10-07T08:54:43Z","title":"Balancing stability and plasticity in continual learning: the\n readout-decomposition of activation change (RDAC) framework","summary":" Continual learning (CL) algorithms strive to acquire new knowledge while\npreserving prior information. However, this stability-plasticity trade-off\nremains a central challenge. This paper introduces a framework that dissects\nthis trade-off, offering valuable insights into CL algorithms. The\nReadout-Decomposition of Activation Change (RDAC) framework first addresses the\nstability-plasticity dilemma and its relation to catastrophic forgetting. 
It\nrelates learning-induced activation changes in the range of prior readouts to\nthe degree of stability and changes in the null space to the degree of\nplasticity. In deep non-linear networks tackling split-CIFAR-110 tasks, the\nframework clarifies the stability-plasticity trade-offs of the popular\nregularization algorithms Synaptic intelligence (SI), Elastic-weight\nconsolidation (EWC), and learning without Forgetting (LwF), and replay-based\nalgorithms Gradient episodic memory (GEM), and data replay. GEM and data replay\npreserved stability and plasticity, while SI, EWC, and LwF traded off\nplasticity for stability. The inability of the regularization algorithms to\nmaintain plasticity was linked to them restricting the change of activations in\nthe null space of the prior readout. Additionally, for one-hidden-layer linear\nneural networks, we derived a gradient decomposition algorithm to restrict\nactivation change only in the range of the prior readouts, to maintain high\nstability while not further sacrificing plasticity. Results demonstrate that\nthe algorithm maintained stability without significant plasticity loss. The\nRDAC framework informs the behavior of existing CL algorithms and paves the way\nfor novel CL approaches. Finally, it sheds light on the connection between\nlearning-induced activation/representation changes and the stability-plasticity\ndilemma, also offering insights into representational drift in biological\nsystems.\n","authors":["Daniel Anthes","Sushrut Thorat","Peter König","Tim C. Kietzmann"],"pdf_url":"https://arxiv.org/pdf/2310.04741v3.pdf","comment":"15 pages, 5 figures, Revision"},{"id":"http://arxiv.org/abs/2311.11865v1","updated":"2023-11-20T16:02:10Z","published":"2023-11-20T16:02:10Z","title":"VLM-Eval: A General Evaluation on Video Large Language Models","summary":" Despite the rapid development of video Large Language Models (LLMs), a\ncomprehensive evaluation is still absent. In this paper, we introduce a unified\nevaluation that encompasses multiple video tasks, including captioning,\nquestion and answering, retrieval, and action recognition. In addition to\nconventional metrics, we showcase how GPT-based evaluation can match human-like\nperformance in assessing response quality across multiple aspects. We propose a\nsimple baseline: Video-LLaVA, which uses a single linear projection and\noutperforms existing video LLMs. Finally, we evaluate video LLMs beyond\nacademic datasets, which show encouraging recognition and reasoning\ncapabilities in driving scenarios with only hundreds of video-instruction pairs\nfor fine-tuning. We hope our work can serve as a unified evaluation for video\nLLMs, and help expand more practical scenarios. The evaluation code will be\navailable soon.\n","authors":["Shuailin Li","Yuang Zhang","Yucheng Zhao","Qiuyue Wang","Fan Jia","Yingfei Liu","Tiancai Wang"],"pdf_url":"https://arxiv.org/pdf/2311.11865v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15778v2","updated":"2023-11-20T15:59:42Z","published":"2023-10-24T12:25:37Z","title":"Preserving Patient Privacy in MRI Scans: A Comprehensive Approach with\n 3D Masked Autoencoders","summary":" MRI scans provide valuable medical information, however they also contain\nsensitive and personally identifiable information (PII) that needs to be\nprotected. 
Whereas MRI metadata is easily sanitized, MRI image data is a\nprivacy risk because it contains information to render highly-realistic 3D\nvisualizations of a patient's head, enabling malicious actors to possibly\nidentify the subject by cross-referencing a database. Data anonymization and\nde-identification is concerned with ensuring the privacy and confidentiality of\nindividuals' personal information. Traditional MRI de-identification methods\nremove privacy-sensitive parts (e.g. eyes, nose etc.) from a given scan. This\ncomes at the expense of introducing a domain shift that can throw off\ndownstream analyses. Recently, a GAN-based approach was proposed to de-identify\na patient's scan by remodeling it (\\eg changing the face) rather than by\nremoving parts. In this work, we propose CP-MAE, a model that de-identifies the\nface using masked autoencoders and that outperforms all previous approaches in\nterms of downstream task performance as well as de-identification. With our\nmethod we are able to synthesize scans of resolution up to $256^3$ (previously\n$128^3$) which constitutes an eight-fold increase in the number of voxels.\nUsing our construction we were able to design a system that exhibits a highly\nrobust training stage, making it easy to fit the network on novel data.\n","authors":["Lennart Alexander Van der Goten","Kevin Smith"],"pdf_url":"https://arxiv.org/pdf/2310.15778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11863v1","updated":"2023-11-20T15:59:41Z","published":"2023-11-20T15:59:41Z","title":"GP-NeRF: Generalized Perception NeRF for Context-Aware 3D Scene\n Understanding","summary":" Applying NeRF to downstream perception tasks for scene understanding and\nrepresentation is becoming increasingly popular. Most existing methods treat\nsemantic prediction as an additional rendering task, \\textit{i.e.}, the \"label\nrendering\" task, to build semantic NeRFs. However, by rendering\nsemantic/instance labels per pixel without considering the contextual\ninformation of the rendered image, these methods usually suffer from unclear\nboundary segmentation and abnormal segmentation of pixels within an object. To\nsolve this problem, we propose Generalized Perception NeRF (GP-NeRF), a novel\npipeline that makes the widely used segmentation model and NeRF work compatibly\nunder a unified framework, for facilitating context-aware 3D scene perception.\nTo accomplish this goal, we introduce transformers to aggregate radiance as\nwell as semantic embedding fields jointly for novel views and facilitate the\njoint volumetric rendering of both fields. In addition, we propose two\nself-distillation mechanisms, i.e., the Semantic Distill Loss and the\nDepth-Guided Semantic Distill Loss, to enhance the discrimination and quality\nof the semantic field and the maintenance of geometric consistency. In\nevaluation, we conduct experimental comparisons under two perception tasks\n(\\textit{i.e.} semantic and instance segmentation) using both synthetic and\nreal-world datasets. 
Notably, our method outperforms SOTA approaches by 6.94\\%,\n11.76\\%, and 8.47\\% on generalized semantic segmentation, finetuning semantic\nsegmentation, and instance segmentation, respectively.\n","authors":["Hao Li","Dingwen Zhang","Yalun Dai","Nian Liu","Lechao Cheng","Jingfeng Li","Jingdong Wang","Junwei Han"],"pdf_url":"https://arxiv.org/pdf/2311.11863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11860v1","updated":"2023-11-20T15:56:44Z","published":"2023-11-20T15:56:44Z","title":"LION : Empowering Multimodal Large Language Model with Dual-Level Visual\n Knowledge","summary":" Multimodal Large Language Models (MLLMs) have endowed LLMs with the ability\nto perceive and understand multi-modal signals. However, most of the existing\nMLLMs mainly adopt vision encoders pretrained on coarsely aligned image-text\npairs, leading to insufficient extraction and reasoning of visual knowledge. To\naddress this issue, we devise a dual-Level vIsual knOwledge eNhanced Multimodal\nLarge Language Model (LION), which empowers the MLLM by injecting visual\nknowledge in two levels. 1) Progressive incorporation of fine-grained\nspatial-aware visual knowledge. We design a vision aggregator cooperated with\nregion-level vision-language (VL) tasks to incorporate fine-grained\nspatial-aware visual knowledge into the MLLM. To alleviate the conflict between\nimage-level and region-level VL tasks during incorporation, we devise a\ndedicated stage-wise instruction-tuning strategy with mixture-of-adapters. This\nprogressive incorporation scheme contributes to the mutual promotion between\nthese two kinds of VL tasks. 2) Soft prompting of high-level semantic visual\nevidence. We facilitate the MLLM with high-level semantic visual evidence by\nleveraging diverse image tags. To mitigate the potential influence caused by\nimperfect predicted tags, we propose a soft prompting method by embedding a\nlearnable token into the tailored text instruction. Comprehensive experiments\non several multi-modal benchmarks demonstrate the superiority of our model\n(e.g., improvement of 5% accuracy on VSR and 3% CIDEr on TextCaps over\nInstructBLIP, 5% accuracy on RefCOCOg over Kosmos-2).\n","authors":["Gongwei Chen","Leyang Shen","Rui Shao","Xiang Deng","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2311.11860v1.pdf","comment":"Technical Report. Project page:\n https://rshaojimmy.github.io/Projects/JiuTian-LION Code:\n https://github.com/rshaojimmy/JiuTian"},{"id":"http://arxiv.org/abs/2311.11856v1","updated":"2023-11-20T15:51:14Z","published":"2023-11-20T15:51:14Z","title":"FATURA: A Multi-Layout Invoice Image Dataset for Document Analysis and\n Understanding","summary":" Document analysis and understanding models often require extensive annotated\ndata to be trained. However, various document-related tasks extend beyond mere\ntext transcription, requiring both textual content and precise bounding-box\nannotations to identify different document elements. Collecting such data\nbecomes particularly challenging, especially in the context of invoices, where\nprivacy concerns add an additional layer of complexity. In this paper, we\nintroduce FATURA, a pivotal resource for researchers in the field of document\nanalysis and understanding. FATURA is a highly diverse dataset featuring\nmulti-layout, annotated invoice document images. Comprising $10,000$ invoices\nwith $50$ distinct layouts, it represents the largest openly accessible image\ndataset of invoice documents known to date. 
We also provide comprehensive\nbenchmarks for various document analysis and understanding tasks and conduct\nexperiments under diverse training and evaluation scenarios. The dataset is\nfreely accessible at https://zenodo.org/record/8261508, empowering researchers\nto advance the field of document analysis and understanding.\n","authors":["Mahmoud Limam","Marwa Dhiaf","Yousri Kessentini"],"pdf_url":"https://arxiv.org/pdf/2311.11856v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11853v1","updated":"2023-11-20T15:45:16Z","published":"2023-11-20T15:45:16Z","title":"Asynchronous Bioplausible Neuron for Spiking Neural Networks for\n Event-Based Vision","summary":" Spiking Neural Networks (SNNs) offer a biologically inspired approach to\ncomputer vision that can lead to more efficient processing of visual data with\nreduced energy consumption. However, maintaining homeostasis within these\nnetworks is challenging, as it requires continuous adjustment of neural\nresponses to preserve equilibrium and optimal processing efficiency amidst\ndiverse and often unpredictable input signals. In response to these challenges,\nwe propose the Asynchronous Bioplausible Neuron (ABN), a dynamic spike firing\nmechanism to auto-adjust the variations in the input signal. Comprehensive\nevaluation across various datasets demonstrates ABN's enhanced performance in\nimage classification and segmentation, maintenance of neural equilibrium, and\nenergy efficiency.\n","authors":["Sanket Kachole","Hussain Sajwani","Fariborz Baghaei Naeini","Dimitrios Makris","Yahya Zweiri"],"pdf_url":"https://arxiv.org/pdf/2311.11853v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2311.11845v1","updated":"2023-11-20T15:35:00Z","published":"2023-11-20T15:35:00Z","title":"Entangled View-Epipolar Information Aggregation for Generalizable Neural\n Radiance Fields","summary":" Generalizable NeRF can directly synthesize novel views across new scenes,\neliminating the need for scene-specific retraining in vanilla NeRF. A critical\nenabling factor in these approaches is the extraction of a generalizable 3D\nrepresentation by aggregating source-view features. In this paper, we propose\nan Entangled View-Epipolar Information Aggregation method dubbed EVE-NeRF.\nDifferent from existing methods that consider cross-view and along-epipolar\ninformation independently, EVE-NeRF conducts the view-epipolar feature\naggregation in an entangled manner by injecting the scene-invariant appearance\ncontinuity and geometry consistency priors to the aggregation process. Our\napproach effectively mitigates the potential lack of inherent geometric and\nappearance constraints resulting from one-dimensional interactions, thus further\nboosting the 3D representation generalizability. EVE-NeRF attains\nstate-of-the-art performance across various evaluation scenarios. 
Extensive\nexperiments demonstrate that, compared to prevailing single-dimensional\naggregation, the entangled network excels in the accuracy of 3D scene geometry\nand appearance reconstruction. Our project page is\nhttps://github.com/tatakai1/EVENeRF.\n","authors":["Zhiyuan Min","Yawei Luo","Wei Yang","Yuesong Wang","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2311.11845v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11837v1","updated":"2023-11-20T15:11:31Z","published":"2023-11-20T15:11:31Z","title":"Kandinsky Conformal Prediction: Efficient Calibration of Image\n Segmentation Algorithms","summary":" Image segmentation algorithms can be understood as a collection of pixel\nclassifiers, for which the outcomes of nearby pixels are correlated. Classifier\nmodels can be calibrated using Inductive Conformal Prediction, but this\nrequires holding back a sufficiently large calibration dataset for computing\nthe distribution of non-conformity scores of the model's predictions. If one\nrequires only marginal calibration on the image level, this calibration\nset consists of all individual pixels in the images available for calibration.\nHowever, if the goal is to attain proper calibration for each individual pixel\nclassifier, the calibration set consists of individual images. In a scenario\nwhere data are scarce (such as the medical domain), it may not always be\npossible to set aside sufficiently many images for this pixel-level\ncalibration. The method we propose, dubbed ``Kandinsky calibration'', makes use\nof the spatial structure present in the distribution of natural images to\nsimultaneously calibrate the classifiers of ``similar'' pixels. This can be\nseen as an intermediate approach between marginal (imagewise) and conditional\n(pixelwise) calibration, where non-conformity scores are aggregated over\nsimilar image regions, thereby making more efficient use of the images\navailable for calibration. We run experiments on segmentation algorithms\ntrained and calibrated on subsets of the public MS-COCO and Medical Decathlon\ndatasets, demonstrating that the Kandinsky calibration method can significantly\nimprove the coverage. When compared to both pixelwise and imagewise calibration\non little data, the Kandinsky method achieves much lower coverage errors,\nindicating the data efficiency of the Kandinsky calibration.\n","authors":["Joren Brunekreef","Eric Marcus","Ray Sheombarsing","Jan-Jakob Sonke","Jonas Teuwen"],"pdf_url":"https://arxiv.org/pdf/2311.11837v1.pdf","comment":"15 pages, 11 figures"},{"id":"http://arxiv.org/abs/2311.11827v1","updated":"2023-11-20T15:04:16Z","published":"2023-11-20T15:04:16Z","title":"Few-shot Multispectral Segmentation with Representations Generated by\n Reinforcement Learning","summary":" The task of multispectral image segmentation (segmentation of images with\nnumerous channels/bands, each capturing a specific range of wavelengths of\nelectromagnetic radiation) has been previously explored in contexts with large\namounts of labeled data. However, these models tend not to generalize well to\ndatasets of smaller size. In this paper, we propose a novel approach for\nimproving few-shot segmentation performance on multispectral images using\nreinforcement learning to generate representations. These representations are\ngenerated in the form of mathematical expressions between channels and are\ntailored to the specific class being segmented. 
Our methodology involves\ntraining an agent to identify the most informative expressions, updating the\ndataset using these expressions, and then using the updated dataset to perform\nsegmentation. Due to the limited length of the expressions, the model receives\nuseful representations without any added risk of overfitting. We evaluate our\napproach on several multispectral datasets and demonstrate\nits effectiveness in boosting the performance of segmentation algorithms.\n","authors":["Dilith Jayakody","Thanuja Ambegoda"],"pdf_url":"https://arxiv.org/pdf/2311.11827v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.11825v1","updated":"2023-11-20T15:03:56Z","published":"2023-11-20T15:03:56Z","title":"Holistic Inverse Rendering of Complex Facade via Aerial 3D Scanning","summary":" In this work, we use multi-view aerial images to reconstruct the geometry,\nlighting, and material of facades using neural signed distance fields (SDFs).\nWithout the requirement of complex equipment, our method only takes simple RGB\nimages captured by a drone as inputs to enable physically based and\nphotorealistic novel-view rendering, relighting, and editing. However, a\nreal-world facade usually has complex appearances ranging from diffuse rocks\nwith subtle details to large-area glass windows with specular reflections,\nmaking it hard to attend to everything. As a result, previous methods can\npreserve the geometry details but fail to reconstruct smooth glass windows, or\nvice versa. In order to address this challenge, we introduce three spatial- and\nsemantic-adaptive optimization strategies, including a semantic regularization\napproach based on zero-shot segmentation techniques to improve material\nconsistency, a frequency-aware geometry regularization to balance surface\nsmoothness and details in different surfaces, and a visibility probe-based\nscheme to enable efficient modeling of the local lighting in large-scale\noutdoor environments. In addition, we capture a real-world facade aerial 3D\nscanning image set and corresponding point clouds for training and\nbenchmarking. The experiment demonstrates the superior quality of our method on\nfacade holistic inverse rendering, novel view synthesis, and scene editing\ncompared to state-of-the-art baselines.\n","authors":["Zixuan Xie","Rengan Xie","Rong Li","Kai Huang","Pengju Qiao","Jingsen Zhu","Xu Yin","Qi Ye","Wei Hua","Yuchi Huo","Hujun Bao"],"pdf_url":"https://arxiv.org/pdf/2311.11825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07750v2","updated":"2023-11-20T15:01:19Z","published":"2023-11-13T21:07:07Z","title":"SynthEnsemble: A Fusion of CNN, Vision Transformer, and Hybrid Models\n for Multi-Label Chest X-Ray Classification","summary":" Chest X-rays are widely used to diagnose thoracic diseases, but the lack of\ndetailed information about these abnormalities makes it challenging to develop\naccurate automated diagnosis systems, which is crucial for early detection and\neffective treatment. To address this challenge, we employed deep learning\ntechniques to identify patterns in chest X-rays that correspond to different\ndiseases. We conducted experiments on the \"ChestX-ray14\" dataset using various\npre-trained CNNs, transformers, hybrid (CNN+Transformer) models and classical\nmodels. The best individual model was the CoAtNet, which achieved an area under\nthe receiver operating characteristic curve (AUROC) of 84.2%. 
By combining the\npredictions of all trained models using a weighted average ensemble where the\nweight of each model was determined using differential evolution, we further\nimproved the AUROC to 85.4%, outperforming other state-of-the-art methods in\nthis field. Our findings demonstrate the potential of deep learning techniques,\nparticularly ensemble deep learning, for improving the accuracy of automatic\ndiagnosis of thoracic diseases from chest X-rays.\n","authors":["S. M. Nabil Ashraf","Md. Adyelullahil Mamun","Hasnat Md. Abdullah","Md. Golam Rabiul Alam"],"pdf_url":"https://arxiv.org/pdf/2311.07750v2.pdf","comment":"Accepted in International Conference on Computer and Information\n Technology (ICCIT) 2023"},{"id":"http://arxiv.org/abs/2311.11821v1","updated":"2023-11-20T14:58:47Z","published":"2023-11-20T14:58:47Z","title":"Cross-View Graph Consistency Learning for Invariant Graph\n Representations","summary":" Graph representation learning is fundamental for analyzing graph-structured\ndata. Exploring invariant graph representations remains a challenge for most\nexisting graph representation learning methods. In this paper, we propose a\ncross-view graph consistency learning (CGCL) method that learns invariant graph\nrepresentations for link prediction. First, two complementary augmented views\nare derived from an incomplete graph structure through a bidirectional graph\nstructure augmentation scheme. This augmentation scheme mitigates the potential\ninformation loss that is commonly associated with various data augmentation\ntechniques involving raw graph data, such as edge perturbation, node removal,\nand attribute masking. Second, we propose a CGCL model that can learn invariant\ngraph representations. A cross-view training scheme is proposed to train the\nproposed CGCL model. This scheme attempts to maximize the consistency\ninformation between one augmented view and the graph structure reconstructed\nfrom the other augmented view. Furthermore, we offer a comprehensive\ntheoretical CGCL analysis. This paper empirically and experimentally\ndemonstrates the effectiveness of the proposed CGCL method, achieving\ncompetitive results on graph datasets in comparisons with several\nstate-of-the-art algorithms.\n","authors":["Jie Chen","Zhiming Li","Hua Mao","Wai Lok Woo","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2311.11821v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2311.11819v1","updated":"2023-11-20T14:55:40Z","published":"2023-11-20T14:55:40Z","title":"Generalized super-resolution 4D Flow MRI -- using ensemble learning to\n extend across the cardiovascular system","summary":" 4D Flow Magnetic Resonance Imaging (4D Flow MRI) is a non-invasive\nmeasurement technique capable of quantifying blood flow across the\ncardiovascular system. While practical use is limited by spatial resolution and\nimage noise, incorporation of trained super-resolution (SR) networks has\npotential to enhance image quality post-scan. However, these efforts have\npredominantly been restricted to narrowly defined cardiovascular domains, with\nlimited exploration of how SR performance extends across the cardiovascular\nsystem; a task aggravated by contrasting hemodynamic conditions apparent across\nthe cardiovasculature. The aim of our study was to explore the generalizability\nof SR 4D Flow MRI using a combination of heterogeneous training sets and\ndedicated ensemble learning. 
With synthetic training data generated across\nthree disparate domains (cardiac, aortic, cerebrovascular), varying\nconvolutional base and ensemble learners were evaluated as a function of domain\nand architecture, quantifying performance on both in-silico and acquired\nin-vivo data from the same three domains. Results show that both bagging and\nstacking ensembling enhance SR performance across domains, accurately\npredicting high-resolution velocities from low-resolution input data in-silico.\nLikewise, optimized networks successfully recover native resolution velocities\nfrom downsampled in-vivo data, as well as show qualitative potential in\ngenerating denoised SR-images from clinical level input data. In conclusion,\nour work presents a viable approach for generalized SR 4D Flow MRI, with\nensemble learning extending utility across various clinical areas of interest.\n","authors":["Leon Ericsson","Adam Hjalmarsson","Muhammad Usman Akbar","Edward Ferdian","Mia Bonini","Brandon Hardy","Jonas Schollenberger","Maria Aristova","Patrick Winter","Nicholas Burris","Alexander Fyrdahl","Andreas Sigfridsson","Susanne Schnell","C. Alberto Figueroa","David Nordsletten","Alistair A. Young","David Marlevi"],"pdf_url":"https://arxiv.org/pdf/2311.11819v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.11815v1","updated":"2023-11-20T14:52:48Z","published":"2023-11-20T14:52:48Z","title":"CrackCLF: Automatic Pavement Crack Detection based on Closed-Loop\n Feedback","summary":" Automatic pavement crack detection is an important task to ensure the\nfunctional performance of pavements during their service life. Inspired by\ndeep learning (DL), the encoder-decoder framework is a powerful tool for crack\ndetection. However, these models are usually open-loop (OL) systems that tend\nto treat thin cracks as the background. Meanwhile, these models cannot\nautomatically correct errors in the prediction, nor can they adapt to changes\nin the environment to automatically extract and detect thin cracks. To tackle\nthis problem, we embed closed-loop feedback (CLF) into the neural network so\nthat the model could learn to correct errors on its own, based on generative\nadversarial networks (GAN). The resulting model is called CrackCLF and includes\nthe front and back ends, i.e. segmentation and adversarial network. The front\nend with a U-shaped framework is employed to generate crack maps, and the back end\nwith a multi-scale loss function is used to correct higher-order\ninconsistencies between labels and crack maps (generated by the front end) to\naddress open-loop system issues. Empirical results show that the proposed\nCrackCLF outperforms other methods on three public datasets. Moreover, the\nproposed CLF can be defined as a plug-and-play module, which can be embedded\ninto different neural network models to improve their performance.\n","authors":["Chong Li","Zhun Fan","Ying Chen","Huibiao Lin","Laura Moretti","Giuseppe Loprencipe","Weihua Sheng","Kelvin C. P. Wang"],"pdf_url":"https://arxiv.org/pdf/2311.11815v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11810v1","updated":"2023-11-20T14:42:25Z","published":"2023-11-20T14:42:25Z","title":"DocPedia: Unleashing the Power of Large Multimodal Model in the\n Frequency Domain for Versatile Document Understanding","summary":" This work presents DocPedia, a novel large multimodal model (LMM) for\nversatile OCR-free document understanding, capable of parsing images up to\n2,560$\\times$2,560 resolution. 
Unlike existing work, which either struggles with\nhigh-resolution documents or gives up the large language model and thus has its\nvision or language ability constrained, our DocPedia directly processes visual input in\nthe frequency domain rather than the pixel space. This unique characteristic\nenables DocPedia to capture a greater amount of visual and textual information\nusing a limited number of visual tokens. To consistently enhance both the\nperception and comprehension abilities of our model, we develop a dual-stage\ntraining strategy and enrich instructions/annotations of all training tasks\ncovering multiple document types. Extensive quantitative and qualitative\nexperiments conducted on various publicly available benchmarks confirm the\nmutual benefits of jointly learning perception and comprehension tasks. The\nresults provide further evidence of the effectiveness and superior performance\nof our DocPedia over other methods.\n","authors":["Hao Feng","Qi Liu","Hao Liu","Wengang Zhou","Houqiang Li","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2311.11810v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11808v1","updated":"2023-11-20T14:41:44Z","published":"2023-11-20T14:41:44Z","title":"Robot Hand-Eye Calibration using Structure-from-Motion","summary":" In this paper we propose a new flexible method for hand-eye calibration. The\nvast majority of existing hand-eye calibration techniques require a\ncalibration rig which is used in conjunction with camera pose estimation\nmethods. Instead, we combine structure-from-motion with known robot motions and\nwe show that the solution can be obtained in linear form. The latter solves for\nboth the hand-eye parameters and for the unknown scale factor inherent with\nstructure-from-motion methods. The algebraic analysis that is made possible\nwith such a linear formulation allows us to investigate not only the well-known\ncase of general screw motions but also such singular motions as pure\ntranslations, pure rotations, and planar motions. In essence, the robot-mounted\ncamera looks at an unknown rigid layout, tracks points over an image sequence\nand estimates the camera-to-robot relationship. Such a self-calibration process\nis relevant for unmanned vehicles, robots working in remote places, and so\nforth. We conduct a large number of experiments which validate the quality of\nthe method by comparing it with existing ones.\n","authors":["Nicolas Andreff","Bernard Espiau","Radu Horaud"],"pdf_url":"https://arxiv.org/pdf/2311.11808v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11797v1","updated":"2023-11-20T14:30:55Z","published":"2023-11-20T14:30:55Z","title":"Igniting Language Intelligence: The Hitchhiker's Guide From\n Chain-of-Thought Reasoning to Language Agents","summary":" Large language models (LLMs) have dramatically enhanced the field of language\nintelligence, as demonstrably evidenced by their formidable empirical\nperformance across a spectrum of complex reasoning tasks. Additionally,\ntheoretical proofs have illuminated their emergent reasoning capabilities,\nproviding a compelling showcase of their advanced cognitive abilities in\nlinguistic contexts. Critical to their remarkable efficacy in handling complex\nreasoning tasks, LLMs leverage the intriguing chain-of-thought (CoT) reasoning\ntechniques, obliging them to formulate intermediate steps en route to deriving\nan answer. 
The CoT reasoning approach has not only exhibited proficiency in\namplifying reasoning performance but also in enhancing interpretability,\ncontrollability, and flexibility. In light of these merits, recent research\nendeavors have extended CoT reasoning methodologies to nurture the development\nof autonomous language agents, which adeptly adhere to language instructions\nand execute actions within varied environments. This survey paper orchestrates\na thorough discourse, penetrating vital research dimensions, encompassing: (i)\nthe foundational mechanics of CoT techniques, with a focus on elucidating the\ncircumstances and justification behind its efficacy; (ii) the paradigm shift in\nCoT; and (iii) the burgeoning of language agents fortified by CoT approaches.\nProspective research avenues envelop explorations into generalization,\nefficiency, customization, scaling, and safety. This paper caters to a wide\naudience, including beginners seeking comprehensive knowledge of CoT reasoning\nand language agents, as well as experienced researchers interested in\nfoundational mechanics and engaging in cutting-edge discussions on these\ntopics. A repository for the related papers is available at\nhttps://github.com/Zoeyyao27/CoT-Igniting-Agent.\n","authors":["Zhuosheng Zhang","Yao Yao","Aston Zhang","Xiangru Tang","Xinbei Ma","Zhiwei He","Yiming Wang","Mark Gerstein","Rui Wang","Gongshen Liu","Hai Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.11797v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11796v1","updated":"2023-11-20T14:29:45Z","published":"2023-11-20T14:29:45Z","title":"Beyond Boundaries: A Comprehensive Survey of Transferable Attacks on AI\n Systems","summary":" Artificial Intelligence (AI) systems such as autonomous vehicles, facial\nrecognition, and speech recognition systems are increasingly integrated into\nour daily lives. However, despite their utility, these AI systems are\nvulnerable to a wide range of attacks such as adversarial, backdoor, data\npoisoning, membership inference, model inversion, and model stealing attacks.\nIn particular, numerous attacks are designed to target a particular model or\nsystem, yet their effects can spread to additional targets, referred to as\ntransferable attacks. Although considerable efforts have been directed toward\ndeveloping transferable attacks, a holistic understanding of the advancements\nin transferable attacks remains elusive. In this paper, we comprehensively\nexplore learning-based attacks from the perspective of transferability,\nparticularly within the context of cyber-physical security. We delve into\ndifferent domains -- the image, text, graph, audio, and video domains -- to\nhighlight the ubiquitous and pervasive nature of transferable attacks. This\npaper categorizes and reviews the architecture of existing attacks from various\nviewpoints: data, process, model, and system. We further examine the\nimplications of transferable attacks in practical scenarios such as autonomous\ndriving, speech recognition, and large language models (LLMs). Additionally, we\noutline the potential research directions to encourage efforts in exploring the\nlandscape of transferable attacks. 
This survey offers a holistic understanding\nof the prevailing transferable attacks and their impacts across different\ndomains.\n","authors":["Guangjing Wang","Ce Zhou","Yuanda Wang","Bocheng Chen","Hanqing Guo","Qiben Yan"],"pdf_url":"https://arxiv.org/pdf/2311.11796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.04514v2","updated":"2023-11-20T14:28:02Z","published":"2020-12-08T16:02:16Z","title":"Human Motion Tracking by Registering an Articulated Surface to 3-D\n Points and Normals","summary":" We address the problem of human motion tracking by registering a surface to\n3-D data. We propose a method that iteratively computes two things: Maximum\nlikelihood estimates for both the kinematic and free-motion parameters of a\nkinematic human-body representation, as well as probabilities that the data are\nassigned either to a body part, or to an outlier cluster. We introduce a new\nmetric between observed points and normals on one side, and a parameterized\nsurface on the other side, the latter being defined as a blending over a set of\nellipsoids. We claim that this metric is well suited when one deals with either\nvisual-hull or visual-shape observations. We illustrate the method by tracking\nhuman motions using sparse visual-shape data (3-D surface points and normals)\ngathered from imperfect silhouettes.\n","authors":["Radu Horaud","Matti Niskanen","Guillaume Dewaele","Edmond Boyer"],"pdf_url":"https://arxiv.org/pdf/2012.04514v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.05582v2","updated":"2023-11-20T14:17:15Z","published":"2020-12-10T11:03:25Z","title":"Image Matching with Scale Adjustment","summary":" In this paper we address the problem of matching two images with two\ndifferent resolutions: a high-resolution image and a low-resolution one. The\ndifference in resolution between the two images is not known and without loss\nof generality one of the images is assumed to be the high-resolution one. On\nthe premise that changes in resolution act as a smoothing equivalent to changes\nin scale, a scale-space representation of the high-resolution image is\nproduced. Hence the one-to-one classical image matching paradigm becomes\none-to-many because the low-resolution image is compared with all the\nscale-space representations of the high-resolution one. Key to the success of\nsuch a process is the proper representation of the features to be matched in\nscale-space. We show how to represent and extract interest points at variable\nscales and we devise a method allowing the comparison of two images at two\ndifferent resolutions. The method comprises the use of photometric- and\nrotation-invariant descriptors, a geometric model mapping the high-resolution\nimage onto a low-resolution image region, and an image matching strategy based\non local constraints and on the robust estimation of this geometric model.\nExtensive experiments show that our matching method can be used for scale\nchanges up to a factor of 6.\n","authors":["Yves Dufournaud","Cordelia Schmid","Radu Horaud"],"pdf_url":"https://arxiv.org/pdf/2012.05582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11782v1","updated":"2023-11-20T14:07:38Z","published":"2023-11-20T14:07:38Z","title":"Robust Tumor Segmentation with Hyperspectral Imaging and Graph Neural\n Networks","summary":" Segmenting the boundary between tumor and healthy tissue during surgical\ncancer resection poses a significant challenge. 
In recent years, Hyperspectral\nImaging (HSI) combined with Machine Learning (ML) has emerged as a promising\nsolution. However, due to the extensive information contained within the\nspectral domain, most ML approaches primarily classify individual HSI\n(super-)pixels, or tiles, without taking into account their spatial context. In\nthis paper, we propose an improved methodology that leverages the spatial\ncontext of tiles for more robust and smoother segmentation. To address the\nirregular shapes of tiles, we utilize Graph Neural Networks (GNNs) to propagate\ncontext information across neighboring regions. The features for each tile\nwithin the graph are extracted using a Convolutional Neural Network (CNN),\nwhich is trained simultaneously with the subsequent GNN. Moreover, we\nincorporate local image quality metrics into the loss function to enhance the\ntraining procedure's robustness against low-quality regions in the training\nimages. We demonstrate the superiority of our proposed method using a clinical\nex vivo dataset consisting of 51 HSI images from 30 patients. Despite the\nlimited dataset, the GNN-based model significantly outperforms context-agnostic\napproaches, accurately distinguishing between healthy and tumor tissues, even\nin images from previously unseen patients. Furthermore, we show that our\ncarefully designed loss function, accounting for local image quality, results\nin additional improvements. Our findings demonstrate that context-aware GNN\nalgorithms can robustly find tumor demarcations on HSI images, ultimately\ncontributing to better surgery success and patient outcome.\n","authors":["Mayar Lotfy","Anna Alperovich","Tommaso Giannantonio","Bjorn Barz","Xiaohan Zhang","Felix Holm","Nassir Navab","Felix Boehm","Carolin Schwamborn","Thomas K. Hoffmann","Patrick J. Schuler"],"pdf_url":"https://arxiv.org/pdf/2311.11782v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.11777v1","updated":"2023-11-20T14:02:50Z","published":"2023-11-20T14:02:50Z","title":"Multimodal deep learning for mapping forest dominant height by fusing\n GEDI with earth observation data","summary":" The integration of multisource remote sensing data and deep learning models\noffers new possibilities for accurately mapping high spatial resolution forest\nheight. We found that GEDI relative height (RH) metrics exhibited strong\ncorrelation with the mean of the top 10 highest trees (dominant height)\nmeasured in situ at the corresponding footprint locations. Consequently, we\nproposed a novel deep learning framework termed the multi-modal attention\nremote sensing network (MARSNet) to estimate forest dominant height by\nextrapolating dominant height derived from GEDI, using Sentinel-1 data, ALOS-2\nPALSAR-2 data, Sentinel-2 optical data and ancillary data. MARSNet comprises\nseparate encoders for each remote sensing data modality to extract multi-scale\nfeatures, and a shared decoder to fuse the features and estimate height. Using\nindividual encoders for each remote sensing modality avoids interference across\nmodalities and extracts distinct representations. To focus on the efficacious\ninformation from each dataset, we reduced the prevalent spatial and band\nredundancies in each remote sensing dataset by incorporating the extended spatial\nand band reconstruction convolution modules in the encoders. 
MARSNet achieved\ncommendable performance in estimating dominant height, with an R2 of 0.62 and\nRMSE of 2.82 m, outperforming the widely used random forest approach which\nattained an R2 of 0.55 and RMSE of 3.05 m. Finally, we applied the trained\nMARSNet model to generate wall-to-wall maps at 10 m resolution for Jilin,\nChina. Through independent validation using field measurements, MARSNet\ndemonstrated an R2 of 0.58 and RMSE of 3.76 m, compared to 0.41 and 4.37 m for\nthe random forest baseline. Our research demonstrates the effectiveness of a\nmultimodal deep learning approach fusing GEDI with SAR and passive optical\nimagery for enhancing the accuracy of high resolution dominant height\nestimation.\n","authors":["Man Chen","Wenquan Dong","Hao Yu","Iain Woodhouse","Casey M. Ryan","Haoyu Liu","Selena Georgiou","Edward T. A. Mitchard"],"pdf_url":"https://arxiv.org/pdf/2311.11777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11773v1","updated":"2023-11-20T13:58:59Z","published":"2023-11-20T13:58:59Z","title":"Practical cross-sensor color constancy using a dual-mapping strategy","summary":" Deep Neural Networks (DNNs) have been widely used for illumination\nestimation, which is time-consuming and requires sensor-specific data\ncollection. Our proposed method uses a dual-mapping strategy and only requires\na simple white point from a test sensor under a D65 condition. This allows us\nto derive a mapping matrix, enabling the reconstructions of image data and\nilluminants. In the second mapping phase, we transform the re-constructed image\ndata into sparse features, which are then optimized with a lightweight\nmulti-layer perceptron (MLP) model using the re-constructed illuminants as\nground truths. This approach effectively reduces sensor discrepancies and\ndelivers performance on par with leading cross-sensor methods. It only requires\na small amount of memory (~0.003 MB), and takes ~1 hour training on an\nRTX3070Ti GPU. More importantly, the method can be implemented very fast, with\n~0.3 ms and ~1 ms on a GPU or CPU respectively, and is not sensitive to the\ninput image resolution. Therefore, it offers a practical solution to the great\nchallenges of data recollection that is faced by the industry.\n","authors":["Shuwei Yue","Minchen Wei"],"pdf_url":"https://arxiv.org/pdf/2311.11773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11772v1","updated":"2023-11-20T13:58:26Z","published":"2023-11-20T13:58:26Z","title":"A Good Feature Extractor Is All You Need for Weakly Supervised Learning\n in Histopathology","summary":" Deep learning is revolutionising pathology, offering novel opportunities in\ndisease prognosis and personalised treatment. Historically, stain normalisation\nhas been a crucial preprocessing step in computational pathology pipelines, and\npersists into the deep learning era. Yet, with the emergence of feature\nextractors trained using self-supervised learning (SSL) on diverse pathology\ndatasets, we call this practice into question. In an empirical evaluation of\npublicly available feature extractors, we find that omitting stain\nnormalisation and image augmentations does not compromise downstream\nperformance, while incurring substantial savings in memory and compute.\nFurther, we show that the top-performing feature extractors are remarkably\nrobust to variations in stain and augmentations like rotation in their latent\nspace. 
Contrary to previous patch-level benchmarking studies, our approach\nemphasises clinical relevance by focusing on slide-level prediction tasks in a\nweakly supervised setting with external validation cohorts. This work\nrepresents the most comprehensive robustness evaluation of public pathology SSL\nfeature extractors to date, involving more than 6,000 training runs across nine\ntasks, five datasets, three downstream architectures, and various preprocessing\nsetups. Our findings stand to streamline digital pathology workflows by\nminimising preprocessing needs and informing the selection of feature\nextractors.\n","authors":["Georg Wölflein","Dyke Ferber","Asier Rabasco Meneghetti","Omar S. M. El Nahhas","Daniel Truhn","Zunamys I. Carrero","David J. Harrison","Ognjen Arandjelović","Jakob N. Kather"],"pdf_url":"https://arxiv.org/pdf/2311.11772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11757v1","updated":"2023-11-20T13:34:51Z","published":"2023-11-20T13:34:51Z","title":"Non-Contact NIR PPG Sensing through Large Sequence Signal Regression","summary":" Non-Contact sensing is an emerging technology with applications across many\nindustries from driver monitoring in vehicles to patient monitoring in\nhealthcare. Current state-of-the-art implementations focus on RGB video, but\nthis struggles in varying/noisy light conditions and is almost completely\nunfeasible in the dark. Near Infra-Red (NIR) video, however, does not suffer\nfrom these constraints. This paper aims to demonstrate the effectiveness of an\nalternative Convolution Attention Network (CAN) architecture, to regress\nphotoplethysmography (PPG) signal from a sequence of NIR frames. A combination\nof two publicly available datasets, which is split into train and test sets, is\nused for training the CAN. This combined dataset is augmented to reduce\noverfitting to the 'normal' 60 - 80 bpm heart rate range by providing the full\nrange of heart rates along with corresponding videos for each subject. This\nCAN, when implemented over video cropped to the subject's head, achieved a Mean\nAverage Error (MAE) of just 0.99 bpm, proving its effectiveness on NIR video\nand the architecture's feasibility to regress an accurate signal output.\n","authors":["Timothy Hanley","Dara Golden","Robyn Maxwell","Ashkan Parsi","Joseph Lemley"],"pdf_url":"https://arxiv.org/pdf/2311.11757v1.pdf","comment":"4 pages, 3 figures, 3 tables, Irish Machine Vision and Image\n Processing Conference 2023"},{"id":"http://arxiv.org/abs/2309.08402v3","updated":"2023-11-20T13:31:42Z","published":"2023-09-15T13:54:48Z","title":"3D SA-UNet: 3D Spatial Attention UNet with 3D ASPP for White Matter\n Hyperintensities Segmentation","summary":" White Matter Hyperintensity (WMH) is an imaging feature related to various\ndiseases such as dementia and stroke. Accurately segmenting WMH using computer\ntechnology is crucial for early disease diagnosis. However, this task remains\nchallenging due to the small lesions with low contrast and high discontinuity\nin the images, which contain limited contextual and spatial information. To\naddress this challenge, we propose a deep learning model called 3D Spatial\nAttention U-Net (3D SA-UNet) for automatic WMH segmentation using only Fluid\nAttenuation Inversion Recovery (FLAIR) scans. The 3D SA-UNet introduces a 3D\nSpatial Attention Module that highlights important lesion features, such as\nWMH, while suppressing unimportant regions. 
Additionally, to capture features\nat different scales, we extend the Atrous Spatial Pyramid Pooling (ASPP) module\nto a 3D version, enhancing the segmentation performance of the network. We\nevaluate our method on a publicly available dataset and demonstrate the\neffectiveness of the 3D spatial attention module and 3D ASPP in WMH segmentation.\nExperimental results demonstrate that our proposed 3D\nSA-UNet model achieves higher accuracy compared to other state-of-the-art 3D\nconvolutional neural networks.\n","authors":["Changlu Guo"],"pdf_url":"https://arxiv.org/pdf/2309.08402v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11754v1","updated":"2023-11-20T13:30:42Z","published":"2023-11-20T13:30:42Z","title":"A Large-Scale Car Parts (LSCP) Dataset for Lightweight Fine-Grained\n Detection","summary":" Automotive-related datasets have previously been used for training autonomous\ndriving systems or vehicle classification tasks. However, there is a lack of\ndatasets in the field of automotive AI for car parts detection, and most\navailable datasets are limited in size and scope, struggling to cover diverse\nscenarios. To address this gap, this paper presents a large-scale and\nfine-grained automotive dataset consisting of 84,162 images for detecting 12\ndifferent types of car parts. This dataset was collected from natural cameras\nand online websites, covering various car brands, scenarios, and shooting\nangles. To alleviate the burden of manual annotation, we propose a novel\nsemi-supervised auto-labeling method that leverages state-of-the-art\npre-trained detectors. Moreover, we study the limitations of the Grounding DINO\napproach for zero-shot labeling. Finally, we evaluate the effectiveness of our\nproposed dataset through fine-grained car parts detection by training several\nlightweight YOLO-series detectors.\n","authors":["Wang Jie","Zhong Yilin","Cao Qianqian"],"pdf_url":"https://arxiv.org/pdf/2311.11754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11753v1","updated":"2023-11-20T13:28:42Z","published":"2023-11-20T13:28:42Z","title":"AdvGen: Physical Adversarial Attack on Face Presentation Attack\n Detection Systems","summary":" Evaluating the risk level of adversarial images is essential for safely\ndeploying face authentication models in the real world. Popular approaches for\nphysical-world attacks, such as print or replay attacks, suffer from some\nlimitations, like including physical and geometrical artifacts. Recently,\nadversarial attacks, which try to digitally deceive the\nlearning strategy of a recognition system using slight modifications to the\ncaptured image, have gained traction. While most previous research assumes that the adversarial image\ncould be digitally fed into the authentication systems, this is not always the\ncase for systems deployed in the real world. This paper demonstrates the\nvulnerability of face authentication systems to adversarial images in\nphysical-world scenarios. We propose AdvGen, an automated Generative Adversarial\nNetwork, to simulate print and replay attacks and generate adversarial images\nthat can fool state-of-the-art PADs in a physical domain attack setting. Using\nthis attack strategy, the attack success rate goes up to 82.01%. We test AdvGen\nextensively on four datasets and ten state-of-the-art PADs. We also demonstrate\nthe effectiveness of our attack by conducting experiments in a realistic,\nphysical environment.\n","authors":["Sai Amrit Patnaik","Shivali Chansoriya","Anil K. Jain","Anoop M. 
Namboodiri"],"pdf_url":"https://arxiv.org/pdf/2311.11753v1.pdf","comment":"10 pages, 9 figures, Accepted to the International Joint Conference\n on Biometrics (IJCB 2023)"},{"id":"http://arxiv.org/abs/2311.11742v1","updated":"2023-11-20T13:09:11Z","published":"2023-11-20T13:09:11Z","title":"Fuzzy Information Seeded Region Growing for Automated Lesions After\n Stroke Segmentation in MR Brain Images","summary":" In the realm of medical imaging, precise segmentation of stroke lesions from\nbrain MRI images stands as a critical challenge with significant implications\nfor patient diagnosis and treatment. Addressing this, our study introduces an\ninnovative approach using a Fuzzy Information Seeded Region Growing (FISRG)\nalgorithm. Designed to effectively delineate the complex and irregular\nboundaries of stroke lesions, the FISRG algorithm combines fuzzy logic with\nSeeded Region Growing (SRG) techniques, aiming to enhance segmentation\naccuracy.\n The research involved three experiments to optimize the FISRG algorithm's\nperformance, each focusing on different parameters to improve the accuracy of\nstroke lesion segmentation. The highest Dice score achieved in these\nexperiments was 94.2\\%, indicating a high degree of similarity between the\nalgorithm's output and the expert-validated ground truth. Notably, the best\naverage Dice score, amounting to 88.1\\%, was recorded in the third experiment,\nhighlighting the efficacy of the algorithm in consistently segmenting stroke\nlesions across various slices.\n Our findings reveal the FISRG algorithm's strengths in handling the\nheterogeneity of stroke lesions. However, challenges remain in areas of abrupt\nlesion topology changes and in distinguishing lesions from similar intensity\nbrain regions. The results underscore the potential of the FISRG algorithm in\ncontributing significantly to advancements in medical imaging analysis for\nstroke diagnosis and treatment.\n","authors":["Mario Pascual González"],"pdf_url":"https://arxiv.org/pdf/2311.11742v1.pdf","comment":"10 pages, 14 figures. Associated code and data available at:\n https://github.com/Mawio02/FISRG-for-Automated-Lesion-After-Stroke-Segmentation-in-MRI"},{"id":"http://arxiv.org/abs/2311.01310v2","updated":"2023-11-20T13:08:27Z","published":"2023-11-02T15:24:23Z","title":"Scattering Vision Transformer: Spectral Mixing Matters","summary":" Vision transformers have gained significant attention and achieved\nstate-of-the-art performance in various computer vision tasks, including image\nclassification, instance segmentation, and object detection. However,\nchallenges remain in addressing attention complexity and effectively capturing\nfine-grained information within images. Existing solutions often resort to\ndown-sampling operations, such as pooling, to reduce computational cost.\nUnfortunately, such operations are non-invertible and can result in information\nloss. In this paper, we present a novel approach called Scattering Vision\nTransformer (SVT) to tackle these challenges. SVT incorporates a spectrally\nscattering network that enables the capture of intricate image details. SVT\novercomes the invertibility issue associated with down-sampling operations by\nseparating low-frequency and high-frequency components. Furthermore, SVT\nintroduces a unique spectral gating network utilizing Einstein multiplication\nfor token and channel mixing, effectively reducing complexity. 
We show that SVT\nachieves state-of-the-art performance on the ImageNet dataset with a\nsignificant reduction in a number of parameters and FLOPS. SVT shows 2\\%\nimprovement over LiTv2 and iFormer. SVT-H-S reaches 84.2\\% top-1 accuracy,\nwhile SVT-H-B reaches 85.2\\% (state-of-art for base versions) and SVT-H-L\nreaches 85.7\\% (again state-of-art for large versions). SVT also shows\ncomparable results in other vision tasks such as instance segmentation. SVT\nalso outperforms other transformers in transfer learning on standard datasets\nsuch as CIFAR10, CIFAR100, Oxford Flower, and Stanford Car datasets. The\nproject page is available on this\nwebpage.\\url{https://badripatro.github.io/svt/}.\n","authors":["Badri N. Patro","Vijay Srinivas Agneeswaran"],"pdf_url":"https://arxiv.org/pdf/2311.01310v2.pdf","comment":"Accepted @NeurIPS 2023"},{"id":"http://arxiv.org/abs/2306.02850v2","updated":"2023-11-20T13:04:59Z","published":"2023-06-05T13:00:44Z","title":"TRACE: 5D Temporal Regression of Avatars with Dynamic Cameras in 3D\n Environments","summary":" Although the estimation of 3D human pose and shape (HPS) is rapidly\nprogressing, current methods still cannot reliably estimate moving humans in\nglobal coordinates, which is critical for many applications. This is\nparticularly challenging when the camera is also moving, entangling human and\ncamera motion. To address these issues, we adopt a novel 5D representation\n(space, time, and identity) that enables end-to-end reasoning about people in\nscenes. Our method, called TRACE, introduces several novel architectural\ncomponents. Most importantly, it uses two new \"maps\" to reason about the 3D\ntrajectory of people over time in camera, and world, coordinates. An additional\nmemory unit enables persistent tracking of people even during long occlusions.\nTRACE is the first one-stage method to jointly recover and track 3D humans in\nglobal coordinates from dynamic cameras. By training it end-to-end, and using\nfull image information, TRACE achieves state-of-the-art performance on tracking\nand HPS benchmarks. The code and dataset are released for research purposes.\n","authors":["Yu Sun","Qian Bao","Wu Liu","Tao Mei","Michael J. Black"],"pdf_url":"https://arxiv.org/pdf/2306.02850v2.pdf","comment":"Project page: https://www.yusun.work/TRACE/TRACE.html"},{"id":"http://arxiv.org/abs/2311.11722v1","updated":"2023-11-20T12:37:58Z","published":"2023-11-20T12:37:58Z","title":"Sparse4D v3: Advancing End-to-End 3D Detection and Tracking","summary":" In autonomous driving perception systems, 3D detection and tracking are the\ntwo fundamental tasks. This paper delves deeper into this field, building upon\nthe Sparse4D framework. We introduce two auxiliary training tasks (Temporal\nInstance Denoising and Quality Estimation) and propose decoupled attention to\nmake structural improvements, leading to significant enhancements in detection\nperformance. Additionally, we extend the detector into a tracker using a\nstraightforward approach that assigns instance ID during inference, further\nhighlighting the advantages of query-based algorithms. Extensive experiments\nconducted on the nuScenes benchmark validate the effectiveness of the proposed\nimprovements. With ResNet50 as the backbone, we witnessed enhancements of\n3.0\\%, 2.2\\%, and 7.6\\% in mAP, NDS, and AMOTA, achieving 46.9\\%, 56.1\\%, and\n49.0\\%, respectively. Our best model achieved 71.9\\% NDS and 67.7\\% AMOTA on\nthe nuScenes test set. 
Code will be released at\n\\url{https://github.com/linxuewu/Sparse4D}.\n","authors":["Xuewu Lin","Zixiang Pei","Tianwei Lin","Lichao Huang","Zhizhong Su"],"pdf_url":"https://arxiv.org/pdf/2311.11722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18936v2","updated":"2023-11-20T12:35:55Z","published":"2023-10-29T08:50:27Z","title":"Adversarial Examples Are Not Real Features","summary":" The existence of adversarial examples has been a mystery for years and\nattracted much interest. A well-known theory by \\citet{ilyas2019adversarial}\nexplains adversarial vulnerability from a data perspective by showing that one\ncan extract non-robust features from adversarial examples and these features\nalone are useful for classification. However, the explanation remains quite\ncounter-intuitive since non-robust features are mostly noise features to\nhumans. In this paper, we re-examine the theory from a larger context by\nincorporating multiple learning paradigms. Notably, we find that contrary to\ntheir good usefulness under supervised learning, non-robust features attain\npoor usefulness when transferred to other self-supervised learning paradigms,\nsuch as contrastive learning, masked image modeling, and diffusion models. It\nreveals that non-robust features are not really as useful as robust or natural\nfeatures that enjoy good transferability between these paradigms. Meanwhile,\nfor robustness, we also show that naturally trained encoders from robust\nfeatures are largely non-robust under AutoAttack. Our cross-paradigm\nexamination suggests that the non-robust features are not really useful but\nmore like paradigm-wise shortcuts, and robust features alone might be\ninsufficient to attain reliable model robustness. Code is available at\n\\url{https://github.com/PKU-ML/AdvNotRealFeatures}.\n","authors":["Ang Li","Yifei Wang","Yiwen Guo","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2310.18936v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.11714v1","updated":"2023-11-20T12:32:32Z","published":"2023-11-20T12:32:32Z","title":"On the Importance of Large Objects in CNN Based Object Detection\n Algorithms","summary":" Object detection models, a prominent class of machine learning algorithms,\naim to identify and precisely locate objects in images or videos. However, this\ntask might yield uneven performances sometimes caused by the objects sizes and\nthe quality of the images and labels used for training. In this paper, we\nhighlight the importance of large objects in learning features that are\ncritical for all sizes. Given these findings, we propose to introduce a\nweighting term into the training loss. This term is a function of the object\narea size. We show that giving more weight to large objects leads to improved\ndetection scores across all object sizes and so an overall improvement in\nObject Detectors performances (+2 p.p. of mAP on small objects, +2 p.p. on\nmedium and +4 p.p. on large on COCO val 2017 with InternImage-T). 
Additional\nexperiments and ablation studies with different models and on a different\ndataset further confirm the robustness of our findings.\n","authors":["Ahmed Ben Saad","Gabriele Facciolo","Axel Davy"],"pdf_url":"https://arxiv.org/pdf/2311.11714v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19130v2","updated":"2023-11-20T12:15:19Z","published":"2023-10-29T19:39:03Z","title":"Women Wearing Lipstick: Measuring the Bias Between an Object and Its\n Related Gender","summary":" In this paper, we investigate the impact of objects on gender bias in image\ncaptioning systems. Our results show that only gender-specific objects have a\nstrong gender bias (e.g., women-lipstick). In addition, we propose a visual\nsemantic-based gender score that measures the degree of bias and can be used as\na plug-in for any image captioning system. Our experiments demonstrate the\nutility of the gender score, since we observe that our score can measure the\nbias relation between a caption and its related gender; therefore, our score\ncan be used as an additional metric to the existing Object Gender Co-Occ\napproach. Code and data are publicly available at\n\\url{https://github.com/ahmedssabir/GenderScore}.\n","authors":["Ahmed Sabir","Lluís Padró"],"pdf_url":"https://arxiv.org/pdf/2310.19130v2.pdf","comment":"EMNLP Findings 2023"},{"id":"http://arxiv.org/abs/2311.11700v1","updated":"2023-11-20T12:08:23Z","published":"2023-11-20T12:08:23Z","title":"GS-SLAM: Dense Visual SLAM with 3D Gaussian Splatting","summary":" In this paper, we introduce $\\textbf{GS-SLAM}$ that first utilizes 3D\nGaussian representation in the Simultaneous Localization and Mapping (SLAM)\nsystem. It facilitates a better balance between efficiency and accuracy.\nCompared to recent SLAM methods employing neural implicit representations, our\nmethod utilizes a real-time differentiable splatting rendering pipeline that\noffers significant speedup to map optimization and RGB-D re-rendering.\nSpecifically, we propose an adaptive expansion strategy that adds new or\ndeletes noisy 3D Gaussian in order to efficiently reconstruct new observed\nscene geometry and improve the mapping of previously observed areas. This\nstrategy is essential to extend 3D Gaussian representation to reconstruct the\nwhole scene rather than synthesize a static object in existing methods.\nMoreover, in the pose tracking process, an effective coarse-to-fine technique\nis designed to select reliable 3D Gaussian representations to optimize camera\npose, resulting in runtime reduction and robust estimation. Our method achieves\ncompetitive performance compared with existing state-of-the-art real-time\nmethods on the Replica, TUM-RGBD datasets. The source code will be released\nupon acceptance.\n","authors":["Chi Yan","Delin Qu","Dong Wang","Dan Xu","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2311.11700v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11697v1","updated":"2023-11-20T12:00:06Z","published":"2023-11-20T12:00:06Z","title":"Cut-and-Paste: Subject-Driven Video Editing with Attention Control","summary":" This paper presents a novel framework termed Cut-and-Paste for real-word\nsemantic video editing under the guidance of text prompt and additional\nreference image. 
While the text-driven video editing has demonstrated\nremarkable ability to generate highly diverse videos following given text\nprompts, the fine-grained semantic edits are hard to control by plain textual\nprompt only in terms of object details and edited region, and cumbersome long\ntext descriptions are usually needed for the task. We therefore investigate\nsubject-driven video editing for more precise control of both edited regions\nand background preservation, and fine-grained semantic generation. We achieve\nthis goal by introducing an reference image as supplementary input to the\ntext-driven video editing, which avoids racking your brain to come up with a\ncumbersome text prompt describing the detailed appearance of the object. To\nlimit the editing area, we refer to a method of cross attention control in\nimage editing and successfully extend it to video editing by fusing the\nattention map of adjacent frames, which strikes a balance between maintaining\nvideo background and spatio-temporal consistency. Compared with current\nmethods, the whole process of our method is like ``cut\" the source object to be\nedited and then ``paste\" the target object provided by reference image. We\ndemonstrate that our method performs favorably over prior arts for video\nediting under the guidance of text prompt and extra reference image, as\nmeasured by both quantitative and subjective evaluations.\n","authors":["Zhichao Zuo","Zhao Zhang","Yan Luo","Yang Zhao","Haijun Zhang","Yi Yang","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.11697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11695v1","updated":"2023-11-20T11:51:13Z","published":"2023-11-20T11:51:13Z","title":"Clarity ChatGPT: An Interactive and Adaptive Processing System for Image\n Restoration and Enhancement","summary":" The generalization capability of existing image restoration and enhancement\n(IRE) methods is constrained by the limited pre-trained datasets, making it\ndifficult to handle agnostic inputs such as different degradation levels and\nscenarios beyond their design scopes. Moreover, they are not equipped with\ninteractive mechanisms to consider user preferences or feedback, and their\nend-to-end settings cannot provide users with more choices. Faced with the\nabove-mentioned IRE method's limited performance and insufficient\ninteractivity, we try to solve it from the engineering and system framework\nlevels. Specifically, we propose Clarity ChatGPT-a transformative system that\ncombines the conversational intelligence of ChatGPT with multiple IRE methods.\nClarity ChatGPT can automatically detect image degradation types and select\nappropriate IRE methods to restore images, or iteratively generate satisfactory\nresults based on user feedback. Its innovative features include a CLIP-powered\ndetector for accurate degradation classification, no-reference image quality\nevaluation for performance evaluation, region-specific processing for precise\nenhancements, and advanced fusion techniques for optimal restoration results.\nClarity ChatGPT marks a significant advancement in integrating language and\nvision, enhancing image-text interactions, and providing a robust,\nhigh-performance IRE solution. 
Our case studies demonstrate that Clarity\nChatGPT effectively improves the generalization and interaction capabilities in\nthe IRE, and also fills the gap in the low-level domain of the existing\nvision-language model.\n","authors":["Yanyan Wei","Zhao Zhang","Jiahuan Ren","Xiaogang Xu","Richang Hong","Yi Yang","Shuicheng Yan","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.11695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11686v1","updated":"2023-11-20T11:35:52Z","published":"2023-11-20T11:35:52Z","title":"Segment Together: A Versatile Paradigm for Semi-Supervised Medical Image\n Segmentation","summary":" Annotation scarcity has become a major obstacle for training powerful\ndeep-learning models for medical image segmentation, restricting their\ndeployment in clinical scenarios. To address it, semi-supervised learning by\nexploiting abundant unlabeled data is highly desirable to boost the model\ntraining. However, most existing works still focus on limited medical tasks and\nunderestimate the potential of learning across diverse tasks and multiple\ndatasets. Therefore, in this paper, we introduce a \\textbf{Ver}satile\n\\textbf{Semi}-supervised framework (VerSemi) to point out a new perspective\nthat integrates various tasks into a unified model with a broad label space, to\nexploit more unlabeled data for semi-supervised medical image segmentation.\nSpecifically, we introduce a dynamic task-prompted design to segment various\ntargets from different datasets. Next, this unified model is used to identify\nthe foreground regions from all labeled data, to capture cross-dataset\nsemantics. Particularly, we create a synthetic task with a cutmix strategy to\naugment foreground targets within the expanded label space. To effectively\nutilize unlabeled data, we introduce a consistency constraint. This involves\naligning aggregated predictions from various tasks with those from the\nsynthetic task, further guiding the model in accurately segmenting foreground\nregions during training. We evaluated our VerSemi model on four public\nbenchmarking datasets. Extensive experiments demonstrated that VerSemi can\nconsistently outperform the second-best method by a large margin (e.g., an\naverage 2.69\\% Dice gain on four datasets), setting new SOTA performance for\nsemi-supervised medical image segmentation. The code will be released.\n","authors":["Qingjie Zeng","Yutong Xie","Zilin Lu","Mengkang Lu","Yicheng Wu","Yong Xia"],"pdf_url":"https://arxiv.org/pdf/2311.11686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11683v1","updated":"2023-11-20T11:28:18Z","published":"2023-11-20T11:28:18Z","title":"ViP-Mixer: A Convolutional Mixer for Video Prediction","summary":" Video prediction aims to predict future frames from a video's previous\ncontent. Existing methods mainly process video data where the time dimension\nmingles with the space and channel dimensions from three distinct angles: as a\nsequence of individual frames, as a 3D volume in spatiotemporal coordinates, or\nas a stacked image where frames are treated as separate channels. Most of them\ngenerally focus on one of these perspectives and may fail to fully exploit the\nrelationships across different dimensions. To address this issue, this paper\nintroduces a convolutional mixer for video prediction, termed ViP-Mixer, to\nmodel the spatiotemporal evolution in the latent space of an autoencoder. The\nViP-Mixers are stacked sequentially and interleave feature mixing at three\nlevels: frames, channels, and locations. 
Extensive experiments demonstrate that\nour proposed method achieves new state-of-the-art prediction performance on\nthree benchmark video datasets covering both synthetic and real-world\nscenarios.\n","authors":["Xin Zheng","Ziang Peng","Yuan Cao","Hongming Shan","Junping Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.11683v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2311.11669v1","updated":"2023-11-20T11:09:09Z","published":"2023-11-20T11:09:09Z","title":"PMP-Swin: Multi-Scale Patch Message Passing Swin Transformer for Retinal\n Disease Classification","summary":" Retinal disease is one of the primary causes of visual impairment, and early\ndiagnosis is essential for preventing further deterioration. Nowadays, many\nworks have explored Transformers for diagnosing diseases due to their strong\nvisual representation capabilities. However, retinal diseases exhibit milder\nforms and often present with overlapping signs, which pose great difficulties\nfor accurate multi-class classification. Therefore, we propose a new framework\nnamed Multi-Scale Patch Message Passing Swin Transformer for multi-class\nretinal disease classification. Specifically, we design a Patch Message Passing\n(PMP) module based on the Message Passing mechanism to establish global\ninteraction for pathological semantic features and to exploit the subtle\ndifferences further between different diseases. Moreover, considering the\nvarious scale of pathological features we integrate multiple PMP modules for\ndifferent patch sizes. For evaluation, we have constructed a new dataset, named\nOPTOS dataset, consisting of 1,033 high-resolution fundus images photographed\nby Optos camera and conducted comprehensive experiments to validate the\nefficacy of our proposed method. And the results on both the public dataset and\nour dataset demonstrate that our method achieves remarkable performance\ncompared to state-of-the-art methods.\n","authors":["Zhihan Yang","Zhiming Cheng","Tengjin Weng","Shucheng He","Yaqi Wang","Xin Ye","Shuai Wang"],"pdf_url":"https://arxiv.org/pdf/2311.11669v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.11666v1","updated":"2023-11-20T11:04:59Z","published":"2023-11-20T11:04:59Z","title":"OmniSeg3D: Omniversal 3D Segmentation via Hierarchical Contrastive\n Learning","summary":" Towards holistic understanding of 3D scenes, a general 3D segmentation method\nis needed that can segment diverse objects without restrictions on object\nquantity or categories, while also reflecting the inherent hierarchical\nstructure. To achieve this, we propose OmniSeg3D, an omniversal segmentation\nmethod aims for segmenting anything in 3D all at once. The key insight is to\nlift multi-view inconsistent 2D segmentations into a consistent 3D feature\nfield through a hierarchical contrastive learning framework, which is\naccomplished by two steps. Firstly, we design a novel hierarchical\nrepresentation based on category-agnostic 2D segmentations to model the\nmulti-level relationship among pixels. Secondly, image features rendered from\nthe 3D feature field are clustered at different levels, which can be further\ndrawn closer or pushed apart according to the hierarchical relationship between\ndifferent levels. In tackling the challenges posed by inconsistent 2D\nsegmentations, this framework yields a global consistent 3D feature field,\nwhich further enables hierarchical segmentation, multi-object selection, and\nglobal discretization. 
Extensive experiments demonstrate the effectiveness of\nour method on high-quality 3D segmentation and accurate hierarchical structure\nunderstanding. A graphical user interface further facilitates flexible\ninteraction for omniversal 3D segmentation.\n","authors":["Haiyang Ying","Yixuan Yin","Jinzhi Zhang","Fan Wang","Tao Yu","Ruqi Huang","Lu Fang"],"pdf_url":"https://arxiv.org/pdf/2311.11666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10373v3","updated":"2023-11-20T10:54:09Z","published":"2023-07-19T18:00:03Z","title":"TokenFlow: Consistent Diffusion Features for Consistent Video Editing","summary":" The generative AI revolution has recently expanded to videos. Nevertheless,\ncurrent state-of-the-art video models are still lagging behind image models in\nterms of visual quality and user control over the generated content. In this\nwork, we present a framework that harnesses the power of a text-to-image\ndiffusion model for the task of text-driven video editing. Specifically, given\na source video and a target text-prompt, our method generates a high-quality\nvideo that adheres to the target text, while preserving the spatial layout and\nmotion of the input video. Our method is based on a key observation that\nconsistency in the edited video can be obtained by enforcing consistency in the\ndiffusion feature space. We achieve this by explicitly propagating diffusion\nfeatures based on inter-frame correspondences, readily available in the model.\nThus, our framework does not require any training or fine-tuning, and can work\nin conjunction with any off-the-shelf text-to-image editing method. We\ndemonstrate state-of-the-art editing results on a variety of real-world videos.\nWebpage: https://diffusion-tokenflow.github.io/\n","authors":["Michal Geyer","Omer Bar-Tal","Shai Bagon","Tali Dekel"],"pdf_url":"https://arxiv.org/pdf/2307.10373v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11662v1","updated":"2023-11-20T10:53:59Z","published":"2023-11-20T10:53:59Z","title":"Enhanced Spatio-Temporal Context for Temporally Consistent Robust 3D\n Human Motion Recovery from Monocular Videos","summary":" Recovering temporally consistent 3D human body pose, shape and motion from a\nmonocular video is a challenging task due to (self-)occlusions, poor lighting\nconditions, complex articulated body poses, depth ambiguity, and limited\navailability of annotated data. Further, doing a simple perframe estimation is\ninsufficient as it leads to jittery and implausible results. In this paper, we\npropose a novel method for temporally consistent motion estimation from a\nmonocular video. Instead of using generic ResNet-like features, our method uses\na body-aware feature representation and an independent per-frame pose and\ncamera initialization over a temporal window followed by a novel\nspatio-temporal feature aggregation by using a combination of self-similarity\nand self-attention over the body-aware features and the perframe\ninitialization. Together, they yield enhanced spatiotemporal context for every\nframe by considering remaining past and future frames. These features are used\nto predict the pose and shape parameters of the human body model, which are\nfurther refined using an LSTM. 
Experimental results on the publicly available\nbenchmark data show that our method attains significantly lower acceleration\nerror and outperforms the existing state-of-the-art methods over all key\nquantitative evaluation metrics, including complex scenarios like partial\nocclusion, complex poses and even relatively low illumination.\n","authors":["Sushovan Chanda","Amogh Tiwari","Lokender Tiwari","Brojeshwar Bhowmick","Avinash Sharma","Hrishav Barua"],"pdf_url":"https://arxiv.org/pdf/2311.11662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11659v1","updated":"2023-11-20T10:49:32Z","published":"2023-11-20T10:49:32Z","title":"MGCT: Mutual-Guided Cross-Modality Transformer for Survival Outcome\n Prediction using Integrative Histopathology-Genomic Features","summary":" The rapidly emerging field of deep learning-based computational pathology has\nshown promising results in utilizing whole slide images (WSIs) to objectively\nprognosticate cancer patients. However, most prognostic methods are currently\nlimited to either histopathology or genomics alone, which inevitably reduces\ntheir potential to accurately predict patient prognosis. Whereas integrating\nWSIs and genomic features presents three main challenges: (1) the enormous\nheterogeneity of gigapixel WSIs which can reach sizes as large as\n150,000x150,000 pixels; (2) the absence of a spatially corresponding\nrelationship between histopathology images and genomic molecular data; and (3)\nthe existing early, late, and intermediate multimodal feature fusion strategies\nstruggle to capture the explicit interactions between WSIs and genomics. To\nameliorate these issues, we propose the Mutual-Guided Cross-Modality\nTransformer (MGCT), a weakly-supervised, attention-based multimodal learning\nframework that can combine histology features and genomic features to model the\ngenotype-phenotype interactions within the tumor microenvironment. To validate\nthe effectiveness of MGCT, we conduct experiments using nearly 3,600 gigapixel\nWSIs across five different cancer types sourced from The Cancer Genome Atlas\n(TCGA). Extensive experimental results consistently emphasize that MGCT\noutperforms the state-of-the-art (SOTA) methods.\n","authors":["Mingxin Liu","Yunzan Liu","Hui Cui","Chunquan Li","Jiquan Ma"],"pdf_url":"https://arxiv.org/pdf/2311.11659v1.pdf","comment":"7 pages, 4 figures, accepted by 2023 IEEE International Conference on\n Bioinformatics and Biomedicine (BIBM 2023)"},{"id":"http://arxiv.org/abs/2311.11656v1","updated":"2023-11-20T10:45:39Z","published":"2023-11-20T10:45:39Z","title":"Double-Condensing Attention Condenser: Leveraging Attention in Deep\n Learning to Detect Skin Cancer from Skin Lesion Images","summary":" Skin cancer is the most common type of cancer in the United States and is\nestimated to affect one in five Americans. Recent advances have demonstrated\nstrong performance on skin cancer detection, as exemplified by state of the art\nperformance in the SIIM-ISIC Melanoma Classification Challenge; however these\nsolutions leverage ensembles of complex deep neural architectures requiring\nimmense storage and compute costs, and therefore may not be tractable. A recent\nmovement for TinyML applications is integrating Double-Condensing Attention\nCondensers (DC-AC) into a self-attention neural network backbone architecture\nto allow for faster and more efficient computation. 
This paper explores\nleveraging an efficient self-attention structure to detect skin cancer in skin\nlesion images and introduces a deep neural network design with DC-AC customized\nfor skin cancer detection from skin lesion images. The final model is publicly\navailable as a part of a global open-source initiative dedicated to\naccelerating advancement in machine learning to aid clinicians in the fight\nagainst cancer.\n","authors":["Chi-en Amy Tai","Elizabeth Janes","Chris Czarnecki","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2311.11656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11647v1","updated":"2023-11-20T10:28:52Z","published":"2023-11-20T10:28:52Z","title":"Cancer-Net PCa-Data: An Open-Source Benchmark Dataset for Prostate\n Cancer Clinical Decision Support using Synthetic Correlated Diffusion Imaging\n Data","summary":" The recent introduction of synthetic correlated diffusion (CDI$^s$) imaging\nhas demonstrated significant potential in the realm of clinical decision\nsupport for prostate cancer (PCa). CDI$^s$ is a new form of magnetic resonance\nimaging (MRI) designed to characterize tissue characteristics through the joint\ncorrelation of diffusion signal attenuation across different Brownian motion\nsensitivities. Despite the performance improvement, the CDI$^s$ data for PCa\nhas not been previously made publicly available. In our commitment to advance\nresearch efforts for PCa, we introduce Cancer-Net PCa-Data, an open-source\nbenchmark dataset of volumetric CDI$^s$ imaging data of PCa patients.\nCancer-Net PCa-Data consists of CDI$^s$ volumetric images from a patient cohort\nof 200 patient cases, along with full annotations (gland masks, tumor masks,\nand PCa diagnosis for each tumor). We also analyze the demographic and label\nregion diversity of Cancer-Net PCa-Data for potential biases. Cancer-Net\nPCa-Data is the first-ever public dataset of CDI$^s$ imaging data for PCa, and\nis a part of the global open-source initiative dedicated to advancement in\nmachine learning and imaging research to aid clinicians in the global fight\nagainst cancer.\n","authors":["Hayden Gunraj","Chi-en Amy Tai","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2311.11647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11646v1","updated":"2023-11-20T10:26:04Z","published":"2023-11-20T10:26:04Z","title":"CastDet: Toward Open Vocabulary Aerial Object Detection with\n CLIP-Activated Student-Teacher Learning","summary":" Object detection in aerial images is a pivotal task for various earth\nobservation applications, whereas current algorithms learn to detect only a\npre-defined set of object categories demanding sufficient bounding-box\nannotated training samples and fail to detect novel object categories. In this\npaper, we consider open-vocabulary object detection (OVD) in aerial images that\nenables the characterization of new objects beyond training categories on the\nearth surface without annotating training images for these new categories. The\nperformance of OVD depends on the quality of class-agnostic region proposals\nand pseudo-labels that can generalize well to novel object categories. To\nsimultaneously generate high-quality proposals and pseudo-labels, we propose\nCastDet, a CLIP-activated student-teacher open-vocabulary object Detection\nframework. Our end-to-end framework within the student-teacher mechanism\nemploys the CLIP model as an extra omniscient teacher of rich knowledge into\nthe student-teacher self-learning process. 
By doing so, our approach boosts\nnovel object proposals and classification. Furthermore, we design a dynamic\nlabel queue technique to maintain high-quality pseudo labels during batch\ntraining and mitigate label imbalance. We conduct extensive experiments on\nmultiple existing aerial object detection datasets, which are set up for the\nOVD task. Experimental results demonstrate our CastDet achieving superior\nopen-vocabulary detection performance, e.g., reaching 40.0 HM (Harmonic Mean),\nwhich outperforms previous methods Detic/ViLD by 26.9/21.1 on the VisDroneZSD\ndataset.\n","authors":["Yan Li","Weiwei Guo","Dunyun He","Jiaqi Zhou","Yuze Gao","Wenxian Yu"],"pdf_url":"https://arxiv.org/pdf/2311.11646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12634v2","updated":"2023-11-20T10:06:03Z","published":"2023-08-24T08:19:15Z","title":"Towards Hierarchical Regional Transformer-based Multiple Instance\n Learning","summary":" The classification of gigapixel histopathology images with deep multiple\ninstance learning models has become a critical task in digital pathology and\nprecision medicine. In this work, we propose a Transformer-based multiple\ninstance learning approach that replaces the traditional learned attention\nmechanism with a regional, Vision Transformer inspired self-attention\nmechanism. We present a method that fuses regional patch information to derive\nslide-level predictions and show how this regional aggregation can be stacked\nto hierarchically process features on different distance levels. To increase\npredictive accuracy, especially for datasets with small, local morphological\nfeatures, we introduce a method to focus the image processing on high attention\nregions during inference. Our approach is able to significantly improve\nperformance over the baseline on two histopathology datasets and points towards\npromising directions for further research.\n","authors":["Josef Cersovsky","Sadegh Mohammadi","Dagmar Kainmueller","Johannes Hoehne"],"pdf_url":"https://arxiv.org/pdf/2308.12634v2.pdf","comment":"8 pages, LaTeX; header update after published, fixed typos"},{"id":"http://arxiv.org/abs/2311.11642v1","updated":"2023-11-20T10:01:13Z","published":"2023-11-20T10:01:13Z","title":"Video Face Re-Aging: Toward Temporally Consistent Face Re-Aging","summary":" Video face re-aging deals with altering the apparent age of a person to the\ntarget age in videos. This problem is challenging due to the lack of paired\nvideo datasets maintaining temporal consistency in identity and age. Most\nre-aging methods process each image individually without considering the\ntemporal consistency of videos. While some existing works address the issue of\ntemporal coherence through video facial attribute manipulation in latent space,\nthey often fail to deliver satisfactory performance in age transformation. To\ntackle the issues, we propose (1) a novel synthetic video dataset that features\nsubjects across a diverse range of age groups; (2) a baseline architecture\ndesigned to validate the effectiveness of our proposed dataset, and (3) the\ndevelopment of three novel metrics tailored explicitly for evaluating the\ntemporal consistency of video re-aging techniques. 
Our comprehensive\nexperiments on public datasets, such as VFHQ and CelebV-HQ, show that our\nmethod outperforms the existing approaches in terms of both age transformation\nand temporal consistency.\n","authors":["Abdul Muqeet","Kyuchul Lee","Bumsoo Kim","Yohan Hong","Hyungrae Lee","Woonggon Kim","Kwang Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2311.11642v1.pdf","comment":"8 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.11638v1","updated":"2023-11-20T09:55:06Z","published":"2023-11-20T09:55:06Z","title":"Reti-Diff: Illumination Degradation Image Restoration with Retinex-based\n Latent Diffusion Model","summary":" Illumination degradation image restoration (IDIR) techniques aim to improve\nthe visibility of degraded images and mitigate the adverse effects of\ndeteriorated illumination. Among these algorithms, diffusion model (DM)-based\nmethods have shown promising performance but are often burdened by heavy\ncomputational demands and pixel misalignment issues when predicting the\nimage-level distribution. To tackle these problems, we propose to leverage DM\nwithin a compact latent space to generate concise guidance priors and introduce\na novel solution called Reti-Diff for the IDIR task. Reti-Diff comprises two\nkey components: the Retinex-based latent DM (RLDM) and the Retinex-guided\ntransformer (RGformer). To ensure detailed reconstruction and illumination\ncorrection, RLDM is empowered to acquire Retinex knowledge and extract\nreflectance and illumination priors. These priors are subsequently utilized by\nRGformer to guide the decomposition of image features into their respective\nreflectance and illumination components. Following this, RGformer further\nenhances and consolidates the decomposed features, resulting in the production\nof refined images with consistent content and robustness to handle complex\ndegradation scenarios. Extensive experiments show that Reti-Diff outperforms\nexisting methods on three IDIR tasks, as well as downstream applications. Code\nwill be available at \\url{https://github.com/ChunmingHe/Reti-Diff}.\n","authors":["Chunming He","Chengyu Fang","Yulun Zhang","Kai Li","Longxiang Tang","Chenyu You","Fengyang Xiao","Zhenhua Guo","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2311.11638v1.pdf","comment":"12 pages, 6 figures, 9 tables"},{"id":"http://arxiv.org/abs/2311.10543v2","updated":"2023-11-20T09:50:24Z","published":"2023-11-17T14:10:55Z","title":"Joint covariance property under geometric image transformations for\n spatio-temporal receptive fields according to the generalized Gaussian\n derivative model for visual receptive fields","summary":" The influence of natural image transformations on receptive field responses\nis crucial for modelling visual operations in computer vision and biological\nvision. In this regard, covariance properties with respect to geometric image\ntransformations in the earliest layers of the visual hierarchy are essential\nfor expressing robust image operations and for formulating invariant visual\noperations at higher levels. This paper defines and proves a joint covariance\nproperty under compositions of spatial scaling transformations, spatial affine\ntransformations, Galilean transformations and temporal scaling transformations,\nwhich makes it possible to characterize how different types of image\ntransformations interact with each other. 
Specifically, the derived relations\nshow how the receptive field parameters need to be transformed, in order to\nmatch the output from spatio-temporal receptive fields with the underlying\nspatio-temporal image transformations.\n","authors":["Tony Lindeberg"],"pdf_url":"https://arxiv.org/pdf/2311.10543v2.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2309.07510v4","updated":"2023-11-20T09:47:00Z","published":"2023-09-14T08:24:32Z","title":"Learning Environment-Aware Affordance for 3D Articulated Object\n Manipulation under Occlusions","summary":" Perceiving and manipulating 3D articulated objects in diverse environments is\nessential for home-assistant robots. Recent studies have shown that point-level\naffordance provides actionable priors for downstream manipulation tasks.\nHowever, existing works primarily focus on single-object scenarios with\nhomogeneous agents, overlooking the realistic constraints imposed by the\nenvironment and the agent's morphology, e.g., occlusions and physical\nlimitations. In this paper, we propose an environment-aware affordance\nframework that incorporates both object-level actionable priors and environment\nconstraints. Unlike object-centric affordance approaches, learning\nenvironment-aware affordance faces the challenge of combinatorial explosion due\nto the complexity of various occlusions, characterized by their quantities,\ngeometries, positions and poses. To address this and enhance data efficiency,\nwe introduce a novel contrastive affordance learning framework capable of\ntraining on scenes containing a single occluder and generalizing to scenes with\ncomplex occluder combinations. Experiments demonstrate the effectiveness of our\nproposed approach in learning affordance considering environment constraints.\nProject page at https://chengkaiacademycity.github.io/EnvAwareAfford/\n","authors":["Kai Cheng","Ruihai Wu","Yan Shen","Chuanruo Ning","Guanqi Zhan","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2309.07510v4.pdf","comment":"In 37th Conference on Neural Information Processing Systems (NeurIPS\n 2023). Website at https://chengkaiacademycity.github.io/EnvAwareAfford/"},{"id":"http://arxiv.org/abs/2311.11629v1","updated":"2023-11-20T09:28:04Z","published":"2023-11-20T09:28:04Z","title":"Generating Realistic Counterfactuals for Retinal Fundus and OCT Images\n using Diffusion Models","summary":" Counterfactual reasoning is often used in a clinical setting to explain\ndecisions or weigh alternatives. Therefore, for imaging based modalities such\nas ophthalmology, it would be beneficial to be able to create counterfactual\nimages, illustrating the answer to the question: \"If the subject had had\ndiabetic retinopathy, how would the fundus image have looked?\" Here, we\ndemonstrate that using a diffusion model in combination with an adversarially\nrobust classifier trained on retinal disease classification tasks enables\ngeneration of highly realistic counterfactuals of retinal fundus images and\noptical coherence tomorgraphy (OCT) B-scans. Ideally, these classifiers encode\nthe salient features indicative for each disease class and can steer the\ndiffusion model to show realistic disease signs or remove disease-related\nlesions in a realistic way. 
Importantly, in a user study, domain experts found\nthe counterfactuals generated using our method significantly more realistic\nthan counterfactuals generated from a previous method, and even\nindistiguishable from realistic images.\n","authors":["Indu Ilanchezian","Valentyn Boreiko","Laura Kühlewein","Ziwei Huang","Murat Seçkin Ayhan","Matthias Hein","Lisa Koch","Philipp Berens"],"pdf_url":"https://arxiv.org/pdf/2311.11629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02421v2","updated":"2023-11-20T09:08:42Z","published":"2023-07-05T16:43:56Z","title":"DragonDiffusion: Enabling Drag-style Manipulation on Diffusion Models","summary":" Despite the ability of existing large-scale text-to-image (T2I) models to\ngenerate high-quality images from detailed textual descriptions, they often\nlack the ability to precisely edit the generated or real images. In this paper,\nwe propose a novel image editing method, DragonDiffusion, enabling Drag-style\nmanipulation on Diffusion models. Specifically, we construct classifier\nguidance based on the strong correspondence of intermediate features in the\ndiffusion model. It can transform the editing signals into gradients via\nfeature correspondence loss to modify the intermediate representation of the\ndiffusion model. Based on this guidance strategy, we also build a multi-scale\nguidance to consider both semantic and geometric alignment. Moreover, a\ncross-branch self-attention is added to maintain the consistency between the\noriginal image and the editing result. Our method, through an efficient design,\nachieves various editing modes for the generated or real images, such as object\nmoving, object resizing, object appearance replacement, and content dragging.\nIt is worth noting that all editing and content preservation signals come from\nthe image itself, and the model does not require fine-tuning or additional\nmodules. Our source code will be available at\nhttps://github.com/MC-E/DragonDiffusion.\n","authors":["Chong Mou","Xintao Wang","Jiechong Song","Ying Shan","Jian Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.02421v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11887v2","updated":"2023-11-20T08:57:58Z","published":"2023-08-23T03:20:31Z","title":"A Unified Framework for 3D Point Cloud Visual Grounding","summary":" Thanks to its precise spatial referencing, 3D point cloud visual grounding is\nessential for deep understanding and dynamic interaction in 3D environments,\nencompassing 3D Referring Expression Comprehension (3DREC) and Segmentation\n(3DRES). We argue that 3DREC and 3DRES should be unified in one framework,\nwhich is also a natural progression in the community. To explain, 3DREC help\n3DRES locate the referent, while 3DRES also facilitate 3DREC via more\nfine-grained language-visual alignment. To achieve this, this paper takes the\ninitiative step to integrate 3DREC and 3DRES into a unified framework, termed\n3D Referring Transformer (3DRefTR). Its key idea is to build upon a mature\n3DREC model and leverage ready query embeddings and visual tokens from the\n3DREC model to construct a dedicated mask branch. 
Specially, we propose\nSuperpoint Mask Branch, which serves a dual purpose: i) By harnessing on the\ninherent association between the superpoints and point cloud, it eliminates the\nheavy computational overhead on the high-resolution visual features for\nupsampling; ii) By leveraging the heterogeneous CPU-GPU parallelism, while the\nGPU is occupied generating visual and language tokens, the CPU concurrently\nproduces superpoints, equivalently accomplishing the upsampling computation.\nThis elaborate design enables 3DRefTR to achieve both well-performing 3DRES and\n3DREC capacities with only a 6% additional latency compared to the original\n3DREC model. Empirical evaluations affirm the superiority of 3DRefTR.\nSpecifically, on the ScanRefer dataset, 3DRefTR surpasses the state-of-the-art\n3DRES method by 12.43% in mIoU and improves upon the SOTA 3DREC method by 0.6%\nAcc@0.25IoU. The codes and models will be released soon.\n","authors":["Haojia Lin","Yongdong Luo","Xiawu Zheng","Lijiang Li","Fei Chao","Taisong Jin","Donghao Luo","Yan Wang","Liujuan Cao","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.11887v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11614v1","updated":"2023-11-20T08:56:51Z","published":"2023-11-20T08:56:51Z","title":"Semantic-Preserved Point-based Human Avatar","summary":" To enable realistic experience in AR/VR and digital entertainment, we present\nthe first point-based human avatar model that embodies the entirety expressive\nrange of digital humans. We employ two MLPs to model pose-dependent deformation\nand linear skinning (LBS) weights. The representation of appearance relies on a\ndecoder and the features that attached to each point. In contrast to\nalternative implicit approaches, the oriented points representation not only\nprovides a more intuitive way to model human avatar animation but also\nsignificantly reduces both training and inference time. Moreover, we propose a\nnovel method to transfer semantic information from the SMPL-X model to the\npoints, which enables to better understand human body movements. By leveraging\nthe semantic information of points, we can facilitate virtual try-on and human\navatar composition through exchanging the points of same category across\ndifferent subjects. Experimental results demonstrate the efficacy of our\npresented method.\n","authors":["Lixiang Lin","Jianke Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.11614v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.00851v3","updated":"2023-11-20T08:50:33Z","published":"2021-04-02T02:10:32Z","title":"Estimating the Generalization in Deep Neural Networks via Sparsity","summary":" Generalization is the key capability for deep neural networks (DNNs).\nHowever, it is challenging to give a reliable measure of the generalization\nability of a DNN via only its nature. In this paper, we propose a novel method\nfor estimating the generalization gap based on network sparsity. In our method,\ntwo key quantities are proposed first. They have close relationship with the\ngeneralization ability and can be calculated directly from the training results\nalone. Then a simple linear model involving two key quantities are constructed\nto give accurate estimation of the generalization gap. 
By training DNNs with a\nwide range of generalization gap on popular datasets, we show that our key\nquantities and linear model could be efficient tools for estimating the\ngeneralization gap of DNNs.\n","authors":["Yang Zhao","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2104.00851v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.00789v2","updated":"2023-11-20T08:48:07Z","published":"2020-11-02T07:37:31Z","title":"Role Taxonomy of Units in Deep Neural Networks","summary":" Identifying the role of network units in deep neural networks (DNNs) is\ncritical in many aspects including giving understandings on the mechanisms of\nDNNs and building basic connections between deep learning and neuroscience.\nHowever, there remains unclear on which roles the units in DNNs with different\ngeneralization ability could present. To this end, we give role taxonomy of\nunits in DNNs via introducing the retrieval-of-function test, where units are\ncategorized into four types in terms of their functional preference on\nseparately the training set and testing set. We show that ratios of the four\ncategories are highly associated with the generalization ability of DNNs from\ntwo distinct perspectives, based on which we give signs of DNNs with well\ngeneralization.\n","authors":["Yang Zhao","Hao Zhang","Xiuyuan Hu"],"pdf_url":"https://arxiv.org/pdf/2011.00789v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12685v2","updated":"2023-11-20T08:46:28Z","published":"2023-06-22T06:12:23Z","title":"Rethinking the Backward Propagation for Adversarial Transferability","summary":" Transfer-based attacks generate adversarial examples on the surrogate model,\nwhich can mislead other black-box models without access, making it promising to\nattack real-world applications. Recently, several works have been proposed to\nboost adversarial transferability, in which the surrogate model is usually\noverlooked. In this work, we identify that non-linear layers (e.g., ReLU,\nmax-pooling, etc.) truncate the gradient during backward propagation, making\nthe gradient w.r.t. input image imprecise to the loss function. We hypothesize\nand empirically validate that such truncation undermines the transferability of\nadversarial examples. Based on these findings, we propose a novel method called\nBackward Propagation Attack (BPA) to increase the relevance between the\ngradient w.r.t. input image and loss function so as to generate adversarial\nexamples with higher transferability. Specifically, BPA adopts a non-monotonic\nfunction as the derivative of ReLU and incorporates softmax with temperature to\nsmooth the derivative of max-pooling, thereby mitigating the information loss\nduring the backward propagation of gradients. Empirical results on the ImageNet\ndataset demonstrate that not only does our method substantially boost the\nadversarial transferability, but it is also general to existing transfer-based\nattacks. 
Code is available at https://github.com/Trustworthy-AI-Group/RPA.\n","authors":["Xiaosen Wang","Kangheng Tong","Kun He"],"pdf_url":"https://arxiv.org/pdf/2306.12685v2.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2302.06494v3","updated":"2023-11-20T08:44:23Z","published":"2023-02-13T16:19:54Z","title":"Explicit3D: Graph Network with Spatial Inference for Single Image 3D\n Object Detection","summary":" Indoor 3D object detection is an essential task in single image scene\nunderstanding, impacting spatial cognition fundamentally in visual reasoning.\nExisting works on 3D object detection from a single image either pursue this\ngoal through independent predictions of each object or implicitly reason over\nall possible objects, failing to harness relational geometric information\nbetween objects. To address this problem, we propose a dynamic sparse graph\npipeline named Explicit3D based on object geometry and semantics features.\nTaking the efficiency into consideration, we further define a relatedness score\nand design a novel dynamic pruning algorithm followed by a cluster sampling\nmethod for sparse scene graph generation and updating. Furthermore, our\nExplicit3D introduces homogeneous matrices and defines new relative loss and\ncorner loss to model the spatial difference between target pairs explicitly.\nInstead of using ground-truth labels as direct supervision, our relative and\ncorner loss are derived from the homogeneous transformation, which renders the\nmodel to learn the geometric consistency between objects. The experimental\nresults on the SUN RGB-D dataset demonstrate that our Explicit3D achieves\nbetter performance balance than the-state-of-the-art.\n","authors":["Yanjun Liu","Wenming Yang"],"pdf_url":"https://arxiv.org/pdf/2302.06494v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11604v1","updated":"2023-11-20T08:40:01Z","published":"2023-11-20T08:40:01Z","title":"CurriculumLoc: Enhancing Cross-Domain Geolocalization through\n Multi-Stage Refinement","summary":" Visual geolocalization is a cost-effective and scalable task that involves\nmatching one or more query images, taken at some unknown location, to a set of\ngeo-tagged reference images. Existing methods, devoted to semantic features\nrepresentation, evolving towards robustness to a wide variety between query and\nreference, including illumination and viewpoint changes, as well as scale and\nseasonal variations. However, practical visual geolocalization approaches need\nto be robust in appearance changing and extreme viewpoint variation conditions,\nwhile providing accurate global location estimates. Therefore, inspired by\ncurriculum design, human learn general knowledge first and then delve into\nprofessional expertise. We first recognize semantic scene and then measure\ngeometric structure. Our approach, termed CurriculumLoc, involves a delicate\ndesign of multi-stage refinement pipeline and a novel keypoint detection and\ndescription with global semantic awareness and local geometric verification. We\nrerank candidates and solve a particular cross-domain perspective-n-point (PnP)\nproblem based on these keypoints and corresponding descriptors, position\nrefinement occurs incrementally. The extensive experimental results on our\ncollected dataset, TerraTrack and a benchmark dataset, ALTO, demonstrate that\nour approach results in the aforementioned desirable characteristics of a\npractical visual geolocalization solution. 
Additionally, we achieve new high\nrecall@1 scores of 62.6% and 94.5% on ALTO, with two different distances\nmetrics, respectively. Dataset, code and trained models are publicly available\non https://github.com/npupilab/CurriculumLoc.\n","authors":["Boni Hu","Lin Chen","Runjian Chen","Shuhui Bu","Pengcheng Han","Haowei Li"],"pdf_url":"https://arxiv.org/pdf/2311.11604v1.pdf","comment":"14 pages, 15 figures"},{"id":"http://arxiv.org/abs/2311.11602v1","updated":"2023-11-20T08:29:55Z","published":"2023-11-20T08:29:55Z","title":"A Multi-In-Single-Out Network for Video Frame Interpolation without\n Optical Flow","summary":" In general, deep learning-based video frame interpolation (VFI) methods have\npredominantly focused on estimating motion vectors between two input frames and\nwarping them to the target time. While this approach has shown impressive\nperformance for linear motion between two input frames, it exhibits limitations\nwhen dealing with occlusions and nonlinear movements. Recently, generative\nmodels have been applied to VFI to address these issues. However, as VFI is not\na task focused on generating plausible images, but rather on predicting\naccurate intermediate frames between two given frames, performance limitations\nstill persist. In this paper, we propose a multi-in-single-out (MISO) based VFI\nmethod that does not rely on motion vector estimation, allowing it to\neffectively model occlusions and nonlinear motion. Additionally, we introduce a\nnovel motion perceptual loss that enables MISO-VFI to better capture the\nspatio-temporal correlations within the video frames. Our MISO-VFI method\nachieves state-of-the-art results on VFI benchmarks Vimeo90K, Middlebury, and\nUCF101, with a significant performance gap compared to existing approaches.\n","authors":["Jaemin Lee","Minseok Seo","Sangwoo Lee","Hyobin Park","Dong-Geol Choi"],"pdf_url":"https://arxiv.org/pdf/2311.11602v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11600v1","updated":"2023-11-20T08:27:56Z","published":"2023-11-20T08:27:56Z","title":"Deep Equilibrium Diffusion Restoration with Parallel Sampling","summary":" Diffusion-based image restoration (IR) methods aim to use diffusion models to\nrecover high-quality (HQ) images from degraded images and achieve promising\nperformance. Due to the inherent property of diffusion models, most of these\nmethods need long serial sampling chains to restore HQ images step-by-step. As\na result, it leads to expensive sampling time and high computation costs.\nMoreover, such long sampling chains hinder understanding the relationship\nbetween the restoration results and the inputs since it is hard to compute the\ngradients in the whole chains. In this work, we aim to rethink the\ndiffusion-based IR models through a different perspective, i.e., a deep\nequilibrium (DEQ) fixed point system. Specifically, we derive an analytical\nsolution by modeling the entire sampling chain in diffusion-based IR models as\na joint multivariate fixed point system. With the help of the analytical\nsolution, we are able to conduct single-image sampling in a parallel way and\nrestore HQ images without training. Furthermore, we compute fast gradients in\nDEQ and found that initialization optimization can boost performance and\ncontrol the generation direction. Extensive experiments on benchmarks\ndemonstrate the effectiveness of our proposed method on typical IR tasks and\nreal-world settings. 
The code and models will be made publicly available.\n","authors":["Jiezhang Cao","Yue Shi","Kai Zhang","Yulun Zhang","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2311.11600v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02358v3","updated":"2023-11-20T08:15:57Z","published":"2023-11-04T09:57:50Z","title":"Domain Transfer in Latent Space (DTLS) Wins on Image Super-Resolution --\n a Non-Denoising Model","summary":" Large scale image super-resolution is a challenging computer vision task,\nsince vast information is missing in a highly degraded image, say for example\nforscale x16 super-resolution. Diffusion models are used successfully in recent\nyears in extreme super-resolution applications, in which Gaussian noise is used\nas a means to form a latent photo-realistic space, and acts as a link between\nthe space of latent vectors and the latent photo-realistic space. There are\nquite a few sophisticated mathematical derivations on mapping the statistics of\nGaussian noises making Diffusion Models successful. In this paper we propose a\nsimple approach which gets away from using Gaussian noise but adopts some basic\nstructures of diffusion models for efficient image super-resolution.\nEssentially, we propose a DNN to perform domain transfer between neighbor\ndomains, which can learn the differences in statistical properties to\nfacilitate gradual interpolation with results of reasonable quality. Further\nquality improvement is achieved by conditioning the domain transfer with\nreference to the input LR image. Experimental results show that our method\noutperforms not only state-of-the-art large scale super resolution models, but\nalso the current diffusion models for image super-resolution. The approach can\nreadily be extended to other image-to-image tasks, such as image enlightening,\ninpainting, denoising, etc.\n","authors":["Chun-Chuen Hui","Wan-Chi Siu","Ngai-Fong Law"],"pdf_url":"https://arxiv.org/pdf/2311.02358v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11592v1","updated":"2023-11-20T08:09:54Z","published":"2023-11-20T08:09:54Z","title":"Predicting urban tree cover from incomplete point labels and limited\n background information","summary":" Trees inside cities are important for the urban microclimate, contributing\npositively to the physical and mental health of the urban dwellers. Despite\ntheir importance, often only limited information about city trees is available.\nTherefore in this paper, we propose a method for mapping urban trees in\nhigh-resolution aerial imagery using limited datasets and deep learning. Deep\nlearning has become best-practice for this task, however, existing approaches\nrely on large and accurately labelled training datasets, which can be difficult\nand expensive to obtain. However, often noisy and incomplete data may be\navailable that can be combined and utilized to solve more difficult tasks than\nthose datasets were intended for. This paper studies how to combine accurate\npoint labels of urban trees along streets with crowd-sourced annotations from\nan open geographic database to delineate city trees in remote sensing images, a\ntask which is challenging even for humans. To that end, we perform semantic\nsegmentation of very high resolution aerial imagery using a fully convolutional\nneural network. The main challenge is that our segmentation maps are sparsely\nannotated and incomplete. 
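The deep-equilibrium (DEQ) view summarized above recasts a long serial sampling chain as a joint fixed-point system x* = f(x*) that can be solved for all unknowns at once. The snippet below is a generic, hypothetical fixed-point solver by damped iteration; it only illustrates solving jointly instead of unrolling step by step and is not the paper's sampler.

```python
# Generic damped fixed-point iteration: solve x = f(x) without unrolling a chain.
import numpy as np

def fixed_point(f, x0, damping=0.5, tol=1e-6, max_iter=500):
    """Iterate x <- (1 - damping) * x + damping * f(x) until convergence."""
    x = x0
    for i in range(max_iter):
        x_new = (1 - damping) * x + damping * f(x)
        if np.linalg.norm(x_new - x) < tol * (1 + np.linalg.norm(x)):
            return x_new, i + 1
        x = x_new
    return x, max_iter

# Toy system: the unique fixed point of f(x) = 0.5 * x + 1 is x = 2 (per entry).
f = lambda x: 0.5 * x + 1.0
x_star, iters = fixed_point(f, np.zeros(8))
print(iters, x_star[:3])   # converges to ~2.0 in a handful of iterations
```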
Small areas around the point labels of the street\ntrees coming from official and crowd-sourced data are marked as foreground\nclass. Crowd-sourced annotations of streets, buildings, etc. define the\nbackground class. Since the tree data is incomplete, we introduce a masking to\navoid class confusion. Our experiments in Hamburg, Germany, showed that the\nsystem is able to produce tree cover maps, not limited to trees along streets,\nwithout providing tree delineations. We evaluated the method on manually\nlabelled trees and show that performance drastically deteriorates if the open\ngeographic database is not used.\n","authors":["Hui Zhang","Ankit Kariryaa","Venkanna Babu Guthula","Christian Igel","Stefan Oehmcke"],"pdf_url":"https://arxiv.org/pdf/2311.11592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.06335v2","updated":"2023-11-20T08:03:40Z","published":"2023-02-13T13:12:55Z","title":"Online Arbitrary Shaped Clustering through Correlated Gaussian Functions","summary":" There is no convincing evidence that backpropagation is a biologically\nplausible mechanism, and further studies of alternative learning methods are\nneeded. A novel online clustering algorithm is presented that can produce\narbitrary shaped clusters from inputs in an unsupervised manner, and requires\nno prior knowledge of the number of clusters in the input data. This is\nachieved by finding correlated outputs from functions that capture commonly\noccurring input patterns. The algorithm can be deemed more biologically\nplausible than model optimization through backpropagation, although practical\napplicability may require additional research. However, the method yields\nsatisfactory results on several toy datasets on a noteworthy range of\nhyperparameters.\n","authors":["Ole Christian Eidheim"],"pdf_url":"https://arxiv.org/pdf/2302.06335v2.pdf","comment":"Corrected uniform distribution range; removed \"average\" from last\n sentence in section 4"},{"id":"http://arxiv.org/abs/2311.11590v1","updated":"2023-11-20T08:03:12Z","published":"2023-11-20T08:03:12Z","title":"Advancing Urban Renewal: An Automated Approach to Generating Historical\n Arcade Facades with Stable Diffusion Models","summary":" Urban renewal and transformation processes necessitate the preservation of\nthe historical urban fabric, particularly in districts known for their\narchitectural and historical significance. These regions, with their diverse\narchitectural styles, have traditionally required extensive preliminary\nresearch, often leading to subjective results. However, the advent of machine\nlearning models has opened up new avenues for generating building facade\nimages. Despite this, creating high-quality images for historical district\nrenovations remains challenging, due to the complexity and diversity inherent\nin such districts. In response to these challenges, our study introduces a new\nmethodology for automatically generating images of historical arcade facades,\nutilizing Stable Diffusion models conditioned on textual descriptions. By\nclassifying and tagging a variety of arcade styles, we have constructed several\nrealistic arcade facade image datasets. We trained multiple low-rank adaptation\n(LoRA) models to control the stylistic aspects of the generated images,\nsupplemented by ControlNet models for improved precision and authenticity. Our\napproach has demonstrated high levels of precision, authenticity, and diversity\nin the generated images, showing promising potential for real-world urban\nrenewal projects. 
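The urban-tree entry above trains on sparse, incomplete annotations: small regions around point labels become foreground, mapped background data becomes background, and everything else is masked out so the network is never penalized on unknown pixels. The sketch below shows one standard way to express such masking with an ignore label; the label layout and values are illustrative assumptions, not the paper's pipeline.

```python
# Masked cross-entropy for sparse point annotations: unlabeled pixels carry an
# ignore value and contribute nothing to the loss or gradients.
import torch
import torch.nn.functional as F

IGNORE = 255                      # label value meaning "no information"

logits = torch.randn(1, 2, 64, 64, requires_grad=True)   # 2-class segmentation
labels = torch.full((1, 64, 64), IGNORE, dtype=torch.long)
labels[0, 30:34, 30:34] = 1       # small patch around a street-tree point label
labels[0, :10, :] = 0             # area covered by crowd-sourced background data

loss = F.cross_entropy(logits, labels, ignore_index=IGNORE)
loss.backward()                   # gradients flow only from annotated pixels
print(float(loss))
```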
This new methodology offers a more efficient and accurate\nalternative to conventional design processes in urban renewal, bypassing issues\nof unconvincing image details, lack of precision, and limited stylistic\nvariety. Future research could focus on integrating this two-dimensional image\ngeneration with three-dimensional modeling techniques, providing a more\ncomprehensive solution for renovating architectural facades in historical\ndistricts.\n","authors":["Zheyuan Kuang","Jiaxin Zhang","Yiying Huang","Yunqin Li"],"pdf_url":"https://arxiv.org/pdf/2311.11590v1.pdf","comment":"HABITS OF THE ANTHROPOCENE - Proceedings of the 43rd ACADIA\n Conference - Volume II: Proceedings book one, University of Colorado Denver,\n Denver, Colorado, USA, 26-28 October 2023, pp. 616-625, CUMINCAD, 2023"},{"id":"http://arxiv.org/abs/2308.12313v2","updated":"2023-11-20T08:01:17Z","published":"2023-08-23T07:11:58Z","title":"Gaze Estimation on Spresense","summary":" Gaze estimation is a valuable technology with numerous applications in fields\nsuch as human-computer interaction, virtual reality, and medicine. This report\npresents the implementation of a gaze estimation system using the Sony\nSpresense microcontroller board and explores its performance in latency,\nMAC/cycle, and power consumption. The report also provides insights into the\nsystem's architecture, including the gaze estimation model used. Additionally,\na demonstration of the system is presented, showcasing its functionality and\nperformance. Our lightweight model TinyTrackerS is a mere 169Kb in size, using\n85.8k parameters and runs on the Spresense platform at 3 FPS.\n","authors":["Thomas Ruegg","Pietro Bonazzi","Andrea Ronco"],"pdf_url":"https://arxiv.org/pdf/2308.12313v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07813v5","updated":"2023-11-20T08:00:38Z","published":"2023-07-15T14:34:25Z","title":"TinyTracker: Ultra-Fast and Ultra-Low-Power Edge Vision In-Sensor for\n Gaze Estimation","summary":" Intelligent edge vision tasks encounter the critical challenge of ensuring\npower and latency efficiency due to the typically heavy computational load they\nimpose on edge platforms.This work leverages one of the first \"AI in sensor\"\nvision platforms, IMX500 by Sony, to achieve ultra-fast and ultra-low-power\nend-to-end edge vision applications. We evaluate the IMX500 and compare it to\nother edge platforms, such as the Google Coral Dev Micro and Sony Spresense, by\nexploring gaze estimation as a case study. We propose TinyTracker, a highly\nefficient, fully quantized model for 2D gaze estimation designed to maximize\nthe performance of the edge vision systems considered in this study.\nTinyTracker achieves a 41x size reduction (600Kb) compared to iTracker [1]\nwithout significant loss in gaze estimation accuracy (maximum of 0.16 cm when\nfully quantized). TinyTracker's deployment on the Sony IMX500 vision sensor\nresults in end-to-end latency of around 19ms. The camera takes around 17.9ms to\nread, process and transmit the pixels to the accelerator. The inference time of\nthe network is 0.86ms with an additional 0.24 ms for retrieving the results\nfrom the sensor. The overall energy consumption of the end-to-end system is 4.9\nmJ, including 0.06 mJ for inference. 
The end-to-end study shows that IMX500 is\n1.7x faster than CoralMicro (19ms vs 34.4ms) and 7x more power efficient (4.9mJ\nVS 34.2mJ)\n","authors":["Pietro Bonazzi","Thomas Ruegg","Sizhen Bian","Yawei Li","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2307.07813v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11587v1","updated":"2023-11-20T07:54:54Z","published":"2023-11-20T07:54:54Z","title":"AKConv: Convolutional Kernel with Arbitrary Sampled Shapes and Arbitrary\n Number of Parameters","summary":" Neural networks based on convolutional operations have achieved remarkable\nresults in the field of deep learning, but there are two inherent flaws in\nstandard convolutional operations. On the one hand, the convolution operation\nbe confined to a local window and cannot capture information from other\nlocations, and its sampled shapes is fixed. On the other hand, the size of the\nconvolutional kernel is fixed to k $\\times$ k, which is a fixed square shape,\nand the number of parameters tends to grow squarely with size. It is obvious\nthat the shape and size of targets are various in different datasets and at\ndifferent locations. Convolutional kernels with fixed sample shapes and squares\ndo not adapt well to changing targets. In response to the above questions, the\nAlterable Kernel Convolution (AKConv) is explored in this work, which gives the\nconvolution kernel an arbitrary number of parameters and arbitrary sampled\nshapes to provide richer options for the trade-off between network overhead and\nperformance. In AKConv, we define initial positions for convolutional kernels\nof arbitrary size by means of a new coordinate generation algorithm. To adapt\nto changes for targets, we introduce offsets to adjust the shape of the samples\nat each position. Moreover, we explore the effect of the neural network by\nusing the AKConv with the same size and different initial sampled shapes.\nAKConv completes the process of efficient feature extraction by irregular\nconvolutional operations and brings more exploration options for convolutional\nsampling shapes. Object detection experiments on representative datasets\nCOCO2017, VOC 7+12 and VisDrone-DET2021 fully demonstrate the advantages of\nAKConv. AKConv can be used as a plug-and-play convolutional operation to\nreplace convolutional operations to improve network performance. The code for\nthe relevant tasks can be found at https://github.com/CV-ZhangXin/AKConv.\n","authors":["Xin Zhang","Yingze Song","Tingting Song","Degang Yang","Yichen Ye","Jie Zhou","Liming Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.11587v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.11580v1","updated":"2023-11-20T07:34:01Z","published":"2023-11-20T07:34:01Z","title":"SeaDSC: A video-based unsupervised method for dynamic scene change\n detection in unmanned surface vehicles","summary":" Recently, there has been an upsurge in the research on maritime vision, where\na lot of works are influenced by the application of computer vision for\nUnmanned Surface Vehicles (USVs). Various sensor modalities such as camera,\nradar, and lidar have been used to perform tasks such as object detection,\nsegmentation, object tracking, and motion planning. A large subset of this\nresearch is focused on the video analysis, since most of the current vessel\nfleets contain the camera's onboard for various surveillance tasks. 
Due to the\nvast abundance of the video data, video scene change detection is an initial\nand crucial stage for scene understanding of USVs. This paper outlines our\napproach to detect dynamic scene changes in USVs. To the best of our\nunderstanding, this work represents the first investigation of scene change\ndetection in the maritime vision application. Our objective is to identify\nsignificant changes in the dynamic scenes of maritime video data, particularly\nthose scenes that exhibit a high degree of resemblance. In our system for\ndynamic scene change detection, we propose completely unsupervised learning\nmethod. In contrast to earlier studies, we utilize a modified cutting-edge\ngenerative picture model called VQ-VAE-2 to train on multiple marine datasets,\naiming to enhance the feature extraction. Next, we introduce our innovative\nsimilarity scoring technique for directly calculating the level of similarity\nin a sequence of consecutive frames by utilizing grid calculation on retrieved\nfeatures. The experiments were conducted using a nautical video dataset called\nRoboWhaler to showcase the efficient performance of our technique.\n","authors":["Linh Trinh","Ali Anwar","Siegfried Mercelis"],"pdf_url":"https://arxiv.org/pdf/2311.11580v1.pdf","comment":"WACV 2024 conference"},{"id":"http://arxiv.org/abs/2311.11578v1","updated":"2023-11-20T07:29:33Z","published":"2023-11-20T07:29:33Z","title":"A 3D Multi-Style Cross-Modality Segmentation Framework for Segmenting\n Vestibular Schwannoma and Cochlea","summary":" The crossMoDA2023 challenge aims to segment the vestibular schwannoma\n(sub-divided into intra- and extra-meatal components) and cochlea regions of\nunlabeled hrT2 scans by leveraging labeled ceT1 scans. In this work, we\nproposed a 3D multi-style cross-modality segmentation framework for the\ncrossMoDA2023 challenge, including the multi-style translation and\nself-training segmentation phases. Considering heterogeneous distributions and\nvarious image sizes in multi-institutional scans, we first utilize the min-max\nnormalization, voxel size resampling, and center cropping to obtain fixed-size\nsub-volumes from ceT1 and hrT2 scans for training. Then, we perform the\nmulti-style image translation phase to overcome the intensity distribution\ndiscrepancy between unpaired multi-modal scans. Specifically, we design three\ndifferent translation networks with 2D or 2.5D inputs to generate multi-style\nand realistic target-like volumes from labeled ceT1 volumes. Finally, we\nperform the self-training volumetric segmentation phase in the target domain,\nwhich employs the nnU-Net framework and iterative self-training method using\npseudo-labels for training accurate segmentation models in the unlabeled target\ndomain. 
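The SeaDSC summary above scores similarity between consecutive frames by comparing grid-arranged features. The snippet below is a hypothetical rendering of that idea: per-cell feature vectors of adjacent frames are compared with cosine similarity and a scene change is flagged when the average similarity drops. The feature source (e.g. a VQ-VAE encoder) and the threshold are assumptions.

```python
# Grid-based cosine similarity between consecutive frame features, used as a
# simple scene-change score.
import numpy as np

def grid_similarity(feat_a, feat_b, eps=1e-8):
    """feat_*: (H, W, D) per-cell features; returns mean cosine similarity."""
    a = feat_a.reshape(-1, feat_a.shape[-1])
    b = feat_b.reshape(-1, feat_b.shape[-1])
    num = (a * b).sum(axis=1)
    den = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1) + eps
    return float((num / den).mean())

def detect_changes(frame_features, threshold=0.8):
    """Return indices i where frame i starts a new scene."""
    return [i for i in range(1, len(frame_features))
            if grid_similarity(frame_features[i - 1], frame_features[i]) < threshold]

rng = np.random.default_rng(0)
stable = rng.normal(size=(8, 8, 32))
frames = [stable + 0.01 * rng.normal(size=stable.shape) for _ in range(5)]
frames.append(rng.normal(size=(8, 8, 32)))   # abrupt content change at the end
print(detect_changes(frames))                 # -> [5]
```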
On the crossMoDA2023 validation dataset, our method produces promising\nresults and achieves the mean DSC values of 72.78% and 80.64% and ASSD values\nof 5.85 mm and 0.25 mm for VS tumor and cochlea regions, respectively.\nMoreover, for intra- and extra-meatal regions, our method achieves the DSC\nvalues of 59.77% and 77.14%, respectively.\n","authors":["Yuzhou Zhuang"],"pdf_url":"https://arxiv.org/pdf/2311.11578v1.pdf","comment":"Technical report of cmda2023 challenge"},{"id":"http://arxiv.org/abs/2311.11570v1","updated":"2023-11-20T07:10:39Z","published":"2023-11-20T07:10:39Z","title":"Decoupled DETR For Few-shot Object Detection","summary":" Few-shot object detection (FSOD), an efficient method for addressing the\nsevere data-hungry problem, has been extensively discussed. Current works have\nsignificantly advanced the problem in terms of model and data. However, the\noverall performance of most FSOD methods still does not fulfill the desired\naccuracy. In this paper we improve the FSOD model to address the severe issue\nof sample imbalance and weak feature propagation. To alleviate modeling bias\nfrom data-sufficient base classes, we examine the effect of decoupling the\nparameters for classes with sufficient data and classes with few samples in\nvarious ways. We design a base-novel categories decoupled DETR (DeDETR) for\nFSOD. We also explore various types of skip connection between the encoder and\ndecoder for DETR. Besides, we notice that the best outputs could come from the\nintermediate layer of the decoder instead of the last layer; therefore, we\nbuild a unified decoder module that could dynamically fuse the decoder layers\nas the output feature. We evaluate our model on commonly used datasets such as\nPASCAL VOC and MSCOCO. Our results indicate that our proposed module could\nachieve stable improvements of 5% to 10% in both fine-tuning and meta-learning\nparadigms and has outperformed the highest score in recent works.\n","authors":["Zeyu Shangguan","Lian Huai","Tong Liu","Xingqun Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.11570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11567v1","updated":"2023-11-20T07:06:31Z","published":"2023-11-20T07:06:31Z","title":"CORE-MM: Complex Open-Ended Reasoning Evaluation For Multi-Modal Large\n Language Models","summary":" Multi-modal Large Language Models (MLLMs) are increasingly prominent in the\nfield of artificial intelligence. These models not only excel in traditional\nvision-language tasks but also demonstrate impressive performance in\ncontemporary multi-modal benchmarks. Although many of these benchmarks attempt\nto holistically evaluate MLLMs, they typically concentrate on basic reasoning\ntasks, often yielding only simple yes/no or multi-choice responses. These\nmethods naturally lead to confusion and difficulties in conclusively\ndetermining the reasoning capabilities of MLLMs. To mitigate this issue, we\nmanually curate a benchmark dataset specifically designed for MLLMs, with a\nfocus on complex reasoning tasks. Our benchmark comprises three key reasoning\ncategories: deductive, abductive, and analogical reasoning. The queries in our\ndataset are intentionally constructed to engage the reasoning capabilities of\nMLLMs in the process of generating answers. For a fair comparison across\nvarious MLLMs, we incorporate intermediate reasoning steps into our evaluation\ncriteria. 
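The DSC numbers quoted above are Dice similarity coefficients. For reference, a minimal, generic implementation for binary masks is shown below; it is the standard formula, not code from the challenge submission.

```python
# Dice similarity coefficient (DSC) between two binary segmentation masks.
import numpy as np

def dice(pred, gt, eps=1e-8):
    """DSC = 2|A ∩ B| / (|A| + |B|) for binary masks of any shape."""
    pred = pred.astype(bool)
    gt = gt.astype(bool)
    intersection = np.logical_and(pred, gt).sum()
    return (2.0 * intersection + eps) / (pred.sum() + gt.sum() + eps)

a = np.zeros((32, 32, 32), dtype=np.uint8); a[8:20, 8:20, 8:20] = 1
b = np.zeros_like(a);                        b[10:22, 10:22, 10:22] = 1
print(round(float(dice(a, b)), 3))           # overlap of two shifted cubes
```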
In instances where an MLLM is unable to produce a definitive answer,\nits reasoning ability is evaluated by requesting intermediate reasoning steps.\nIf these steps align with our manual annotations, appropriate scores are\nassigned. This evaluation scheme resembles methods commonly used in human\nassessments, such as exams or assignments, and represents what we consider a\nmore effective assessment technique compared with existing benchmarks. We\nevaluate a selection of representative MLLMs using this rigorously developed\nopen-ended multi-step elaborate reasoning benchmark, designed to challenge and\naccurately measure their reasoning capabilities. The code and data will be\nreleased at https://core-mm.github.io/\n","authors":["Xiaotian Han","Quanzeng You","Yongfei Liu","Wentao Chen","Huangjie Zheng","Khalil Mrini","Xudong Lin","Yiqi Wang","Bohan Zhai","Jianbo Yuan","Heng Wang","Hongxia Yang"],"pdf_url":"https://arxiv.org/pdf/2311.11567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11566v1","updated":"2023-11-20T07:04:46Z","published":"2023-11-20T07:04:46Z","title":"Does complimentary information from multispectral imaging improve face\n presentation attack detection?","summary":" Presentation Attack Detection (PAD) has been extensively studied,\nparticularly in the visible spectrum. With the advancement of sensing\ntechnology beyond the visible range, multispectral imaging has gained\nsignificant attention in this direction. We present PAD based on multispectral\nimages constructed for eight different presentation artifacts resulted from\nthree different artifact species. In this work, we introduce Face Presentation\nAttack Multispectral (FPAMS) database to demonstrate the significance of\nemploying multispectral imaging. The goal of this work is to study\ncomplementary information that can be combined in two different ways (image\nfusion and score fusion) from multispectral imaging to improve the face PAD.\nThe experimental evaluation results present an extensive qualitative analysis\nof 61650 sample multispectral images collected for bonafide and artifacts. The\nPAD based on the score fusion and image fusion method presents superior\nperformance, demonstrating the significance of employing multispectral imaging\nto detect presentation artifacts.\n","authors":["Narayan Vetrekar","Raghavendra Ramachandra","Sushma Venkatesh","Jyoti D. Pawar","R. S. Gad"],"pdf_url":"https://arxiv.org/pdf/2311.11566v1.pdf","comment":"Accepted in International IEEE Applied Sensing Conference (IEEE\n APSCON) 2024"},{"id":"http://arxiv.org/abs/2306.05238v2","updated":"2023-11-20T06:57:05Z","published":"2023-06-08T14:36:10Z","title":"SparseTrack: Multi-Object Tracking by Performing Scene Decomposition\n based on Pseudo-Depth","summary":" Exploring robust and efficient association methods has always been an\nimportant issue in multiple-object tracking (MOT). Although existing tracking\nmethods have achieved impressive performance, congestion and frequent\nocclusions still pose challenging problems in multi-object tracking. We reveal\nthat performing sparse decomposition on dense scenes is a crucial step to\nenhance the performance of associating occluded targets. To this end, we\npropose a pseudo-depth estimation method for obtaining the relative depth of\ntargets from 2D images. 
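The multispectral PAD entry above combines per-band evidence via image fusion and score fusion. The sketch below illustrates only the score-fusion side in its simplest form, a weighted combination of per-band classifier scores; the band names, weights, and threshold are placeholder assumptions, not values from the study.

```python
# Weighted score fusion across spectral bands for presentation attack detection.
import numpy as np

def fuse_scores(band_scores, weights=None, threshold=0.5):
    """band_scores: dict band_name -> bona fide probability in [0, 1]."""
    names = sorted(band_scores)
    s = np.array([band_scores[n] for n in names], dtype=float)
    w = (np.ones_like(s) / len(s) if weights is None
         else np.array([weights[n] for n in names], dtype=float))
    fused = float(np.dot(w, s) / w.sum())
    return fused, fused >= threshold       # (fused score, accept as bona fide?)

scores = {"band_1": 0.82, "band_2": 0.71, "band_3": 0.35, "band_4": 0.40}
print(fuse_scores(scores))
```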
Secondly, we design a depth cascading matching (DCM)\nalgorithm, which can use the obtained depth information to convert a dense\ntarget set into multiple sparse target subsets and perform data association on\nthese sparse target subsets in order from near to far. By integrating the\npseudo-depth method and the DCM strategy into the data association process, we\npropose a new tracker, called SparseTrack. SparseTrack provides a new\nperspective for solving the challenging crowded scene MOT problem. Only using\nIoU matching, SparseTrack achieves comparable performance with the\nstate-of-the-art (SOTA) methods on the MOT17 and MOT20 benchmarks. Code and\nmodels are publicly available at \\url{https://github.com/hustvl/SparseTrack}.\n","authors":["Zelin Liu","Xinggang Wang","Cheng Wang","Wenyu Liu","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2306.05238v2.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2305.02722v4","updated":"2023-11-20T06:34:16Z","published":"2023-05-04T10:43:11Z","title":"Avatar Knowledge Distillation: Self-ensemble Teacher Paradigm with\n Uncertainty","summary":" Knowledge distillation is an effective paradigm for boosting the performance\nof pocket-size model, especially when multiple teacher models are available,\nthe student would break the upper limit again. However, it is not economical to\ntrain diverse teacher models for the disposable distillation. In this paper, we\nintroduce a new concept dubbed Avatars for distillation, which are the\ninference ensemble models derived from the teacher. Concretely, (1) For each\niteration of distillation training, various Avatars are generated by a\nperturbation transformation. We validate that Avatars own higher upper limit of\nworking capacity and teaching ability, aiding the student model in learning\ndiverse and receptive knowledge perspectives from the teacher model. (2) During\nthe distillation, we propose an uncertainty-aware factor from the variance of\nstatistical differences between the vanilla teacher and Avatars, to adjust\nAvatars' contribution on knowledge transfer adaptively. Avatar Knowledge\nDistillation AKD is fundamentally different from existing methods and refines\nwith the innovative view of unequal training. Comprehensive experiments\ndemonstrate the effectiveness of our Avatars mechanism, which polishes up the\nstate-of-the-art distillation methods for dense prediction without more extra\ncomputational cost. The AKD brings at most 0.7 AP gains on COCO 2017 for Object\nDetection and 1.83 mIoU gains on Cityscapes for Semantic Segmentation,\nrespectively. Code is available at https://github.com/Gumpest/AvatarKD.\n","authors":["Yuan Zhang","Weihua Chen","Yichen Lu","Tao Huang","Xiuyu Sun","Jian Cao"],"pdf_url":"https://arxiv.org/pdf/2305.02722v4.pdf","comment":"Accepted by ACM MM 2023"},{"id":"http://arxiv.org/abs/2209.12699v3","updated":"2023-11-20T06:26:47Z","published":"2022-09-23T08:14:30Z","title":"Accurate and Efficient Stereo Matching via Attention Concatenation\n Volume","summary":" Stereo matching is a fundamental building block for many vision and robotics\napplications. An informative and concise cost volume representation is vital\nfor stereo matching of high accuracy and efficiency. In this paper, we present\na novel cost volume construction method, named attention concatenation volume\n(ACV), which generates attention weights from correlation clues to suppress\nredundant information and enhance matching-related information in the\nconcatenation volume. 
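The SparseTrack summary above splits a dense detection set into sparse subsets by pseudo-depth and associates them bin by bin, from near to far, with IoU matching. The sketch below is a hypothetical, simplified version of that cascade using the Hungarian algorithm; the data layout, bin edges, and threshold are assumptions for illustration, not the SparseTrack code.

```python
# Depth-binned cascaded association: match detections to tracks one depth bin
# at a time (near to far) using IoU cost and the Hungarian algorithm.
import numpy as np
from scipy.optimize import linear_sum_assignment

def iou(a, b):
    """IoU of two boxes given as (x1, y1, x2, y2)."""
    x1, y1 = max(a[0], b[0]), max(a[1], b[1])
    x2, y2 = min(a[2], b[2]), min(a[3], b[3])
    inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
    area = lambda r: (r[2] - r[0]) * (r[3] - r[1])
    return inter / (area(a) + area(b) - inter + 1e-8)

def cascade_match(tracks, dets, depths, bins=(0.0, 0.33, 0.66, 1.01), thr=0.3):
    """Return (track_idx, det_idx) pairs, matching one depth bin at a time."""
    matches, free_tracks = [], list(range(len(tracks)))
    for lo, hi in zip(bins[:-1], bins[1:]):
        det_ids = [i for i, d in enumerate(depths) if lo <= d < hi]
        if not det_ids or not free_tracks:
            continue
        cost = np.array([[1.0 - iou(tracks[t], dets[d]) for d in det_ids]
                         for t in free_tracks])
        rows, cols = linear_sum_assignment(cost)
        for r, c in zip(rows, cols):
            if cost[r, c] <= 1.0 - thr:                # keep only IoU >= thr
                matches.append((free_tracks[r], det_ids[c]))
        matched = {m[0] for m in matches}
        free_tracks = [t for t in free_tracks if t not in matched]
    return matches

tracks = [(0, 0, 10, 10), (50, 50, 60, 60)]
dets = [(1, 1, 11, 11), (49, 50, 59, 61)]
depths = [0.1, 0.7]                       # normalised pseudo-depth per detection
print(cascade_match(tracks, dets, depths))   # -> [(0, 0), (1, 1)]
```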
The ACV can be seamlessly embedded into most stereo\nmatching networks, the resulting networks can use a more lightweight\naggregation network and meanwhile achieve higher accuracy. We further design a\nfast version of ACV to enable real-time performance, named Fast-ACV, which\ngenerates high likelihood disparity hypotheses and the corresponding attention\nweights from low-resolution correlation clues to significantly reduce\ncomputational and memory cost and meanwhile maintain a satisfactory accuracy.\nThe core idea of our Fast-ACV is volume attention propagation (VAP) which can\nautomatically select accurate correlation values from an upsampled correlation\nvolume and propagate these accurate values to the surroundings pixels with\nambiguous correlation clues. Furthermore, we design a highly accurate network\nACVNet and a real-time network Fast-ACVNet based on our ACV and Fast-ACV\nrespectively, which achieve the state-of-the-art performance on several\nbenchmarks (i.e., our ACVNet ranks the 2nd on KITTI 2015 and Scene Flow, and\nthe 3rd on KITTI 2012 and ETH3D among all the published methods; our\nFast-ACVNet outperforms almost all state-of-the-art real-time methods on Scene\nFlow, KITTI 2012 and 2015 and meanwhile has better generalization ability)\n","authors":["Gangwei Xu","Yun Wang","Junda Cheng","Jinhui Tang","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2209.12699v3.pdf","comment":"Accepted to TPAMI 2023. arXiv admin note: substantial text overlap\n with arXiv:2203.02146"},{"id":"http://arxiv.org/abs/2311.11555v1","updated":"2023-11-20T06:15:46Z","published":"2023-11-20T06:15:46Z","title":"NePF: Neural Photon Field for Single-Stage Inverse Rendering","summary":" We present a novel single-stage framework, Neural Photon Field (NePF), to\naddress the ill-posed inverse rendering from multi-view images. Contrary to\nprevious methods that recover the geometry, material, and illumination in\nmultiple stages and extract the properties from various multi-layer perceptrons\nacross different neural fields, we question such complexities and introduce our\nmethod - a single-stage framework that uniformly recovers all properties. NePF\nachieves this unification by fully utilizing the physical implication behind\nthe weight function of neural implicit surfaces and the view-dependent\nradiance. Moreover, we introduce an innovative coordinate-based illumination\nmodel for rapid volume physically-based rendering. To regularize this\nillumination, we implement the subsurface scattering model for diffuse\nestimation. We evaluate our method on both real and synthetic datasets. The\nresults demonstrate the superiority of our approach in recovering high-fidelity\ngeometry and visual-plausible material attributes.\n","authors":["Tuen-Yue Tsui","Qin Zou"],"pdf_url":"https://arxiv.org/pdf/2311.11555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03358v2","updated":"2023-11-20T06:08:28Z","published":"2023-10-05T07:29:29Z","title":"Enhancing Robust Representation in Adversarial Training: Alignment and\n Exclusion Criteria","summary":" Deep neural networks are vulnerable to adversarial noise. Adversarial\nTraining (AT) has been demonstrated to be the most effective defense strategy\nto protect neural networks from being fooled. However, we find AT omits to\nlearning robust features, resulting in poor performance of adversarial\nrobustness. 
To address this issue, we highlight two criteria of robust\nrepresentation: (1) Exclusion: \\emph{the feature of examples keeps away from\nthat of other classes}; (2) Alignment: \\emph{the feature of natural and\ncorresponding adversarial examples is close to each other}. These motivate us\nto propose a generic framework of AT to gain robust representation, by the\nasymmetric negative contrast and reverse attention. Specifically, we design an\nasymmetric negative contrast based on predicted probabilities, to push away\nexamples of different classes in the feature space. Moreover, we propose to\nweight feature by parameters of the linear classifier as the reverse attention,\nto obtain class-aware feature and pull close the feature of the same class.\nEmpirical evaluations on three benchmark datasets show our methods greatly\nadvance the robustness of AT and achieve state-of-the-art performance.\n","authors":["Nuoyan Zhou","Nannan Wang","Decheng Liu","Dawei Zhou","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2310.03358v2.pdf","comment":"10 pages, 9 figures, Submitted to TIFS"},{"id":"http://arxiv.org/abs/2311.11549v1","updated":"2023-11-20T06:04:09Z","published":"2023-11-20T06:04:09Z","title":"Unearthing Common Inconsistency for Generalisable Deepfake Detection","summary":" Deepfake has emerged for several years, yet efficient detection techniques\ncould generalize over different manipulation methods require further research.\nWhile current image-level detection method fails to generalize to unseen\ndomains, owing to the domain-shift phenomenon brought by CNN's strong inductive\nbias towards Deepfake texture, video-level one shows its potential to have both\ngeneralization across multiple domains and robustness to compression. We argue\nthat although distinct face manipulation tools have different inherent bias,\nthey all disrupt the consistency between frames, which is a natural\ncharacteristic shared by authentic videos. Inspired by this, we proposed a\ndetection approach by capturing frame inconsistency that broadly exists in\ndifferent forgery techniques, termed unearthing-common-inconsistency (UCI).\nConcretely, the UCI network based on self-supervised contrastive learning can\nbetter distinguish temporal consistency between real and fake videos from\nmultiple domains. We introduced a temporally-preserved module method to\nintroduce spatial noise perturbations, directing the model's attention towards\ntemporal information. Subsequently, leveraging a multi-view cross-correlation\nlearning module, we extensively learn the disparities in temporal\nrepresentations between genuine and fake samples. Extensive experiments\ndemonstrate the generalization ability of our method on unseen Deepfake\ndomains.\n","authors":["Beilin Chu","Xuan Xu","Weike You","Linna Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.11549v1.pdf","comment":"9 pages, 2 figures and 5 tables"},{"id":"http://arxiv.org/abs/2311.11533v1","updated":"2023-11-20T04:36:19Z","published":"2023-11-20T04:36:19Z","title":"Event Camera Data Dense Pre-training","summary":" This paper introduces a self-supervised learning framework designed for\npre-training neural networks tailored to dense prediction tasks using event\ncamera data. Our approach utilizes solely event data for training.\n Transferring achievements from dense RGB pre-training directly to event\ncamera data yields subpar performance. This is attributed to the spatial\nsparsity inherent in an event image (converted from event data), where many\npixels do not contain information. 
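The adversarial-training entry above describes a "reverse attention" that weights features by the linear classifier's parameters to obtain class-aware features. The snippet below is one illustrative reading of that description, gating each feature vector with the weight row of its predicted class; it is an assumption-laden sketch, not the authors' implementation.

```python
# Reverse-attention-style gating: re-weight a feature by the classifier weight
# row of its predicted class to obtain a class-aware feature.
import torch
import torch.nn as nn

feat_dim, num_classes = 128, 10
classifier = nn.Linear(feat_dim, num_classes, bias=False)

features = torch.randn(4, feat_dim)              # a batch of penultimate features
logits = classifier(features)
pred = logits.argmax(dim=1)                      # predicted class per sample

attn = classifier.weight[pred]                   # (4, feat_dim) weight rows
class_aware = features * torch.sigmoid(attn)     # gated, class-aware features
print(class_aware.shape)                         # torch.Size([4, 128])
```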
To mitigate this sparsity issue, we encode\nan event image into event patch features, automatically mine contextual\nsimilarity relationships among patches, group the patch features into\ndistinctive contexts, and enforce context-to-context similarities to learn\ndiscriminative event features.\n For training our framework, we curate a synthetic event camera dataset\nfeaturing diverse scene and motion patterns. Transfer learning performance on\ndownstream dense prediction tasks illustrates the superiority of our method\nover state-of-the-art approaches. Notably, our single model secured the top\nposition in the challenging DSEC-Flow benchmark.\n","authors":["Yan Yang","Liyuan Pan","Liu Liu"],"pdf_url":"https://arxiv.org/pdf/2311.11533v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11525v1","updated":"2023-11-20T04:11:16Z","published":"2023-11-20T04:11:16Z","title":"Generalized Category Discovery in Semantic Segmentation","summary":" This paper explores a novel setting called Generalized Category Discovery in\nSemantic Segmentation (GCDSS), aiming to segment unlabeled images given prior\nknowledge from a labeled set of base classes. The unlabeled images contain\npixels of the base class or novel class. In contrast to Novel Category\nDiscovery in Semantic Segmentation (NCDSS), there is no prerequisite for prior\nknowledge mandating the existence of at least one novel class in each unlabeled\nimage. Besides, we broaden the segmentation scope beyond foreground objects to\ninclude the entire image. Existing NCDSS methods rely on the aforementioned\npriors, making them challenging to truly apply in real-world situations. We\npropose a straightforward yet effective framework that reinterprets the GCDSS\nchallenge as a task of mask classification. Additionally, we construct a\nbaseline method and introduce the Neighborhood Relations-Guided Mask Clustering\nAlgorithm (NeRG-MaskCA) for mask categorization to address the fragmentation in\nsemantic representation. A benchmark dataset, Cityscapes-GCD, derived from the\nCityscapes dataset, is established to evaluate the GCDSS framework. Our method\ndemonstrates the feasibility of the GCDSS problem and the potential for\ndiscovering and segmenting novel object classes in unlabeled images. We employ\nthe generated pseudo-labels from our approach as ground truth to supervise the\ntraining of other models, thereby enabling them with the ability to segment\nnovel classes. It paves the way for further research in generalized category\ndiscovery, broadening the horizons of semantic segmentation and its\napplications. For details, please visit https://github.com/JethroPeng/GCDSS\n","authors":["Zhengyuan Peng","Qijian Tian","Jianqing Xu","Yizhang Jin","Xuequan Lu","Xin Tan","Yuan Xie","Lizhuang Ma"],"pdf_url":"https://arxiv.org/pdf/2311.11525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11520v1","updated":"2023-11-20T03:51:39Z","published":"2023-11-20T03:51:39Z","title":"Liver Tumor Prediction with Advanced Attention Mechanisms Integrated\n into a Depth-Based Variant Search Algorithm","summary":" In recent days, Deep Learning (DL) techniques have become an emerging\ntransformation in the field of machine learning, artificial intelligence,\ncomputer vision, and so on. Subsequently, researchers and industries have been\nhighly endorsed in the medical field, predicting and controlling diverse\ndiseases at specific intervals. Liver tumor prediction is a vital chore in\nanalyzing and treating liver diseases. 
This paper proposes a novel approach for\npredicting liver tumors using Convolutional Neural Networks (CNN) and a\ndepth-based variant search algorithm with advanced attention mechanisms\n(CNN-DS-AM). The proposed work aims to improve accuracy and robustness in\ndiagnosing and treating liver diseases. The anticipated model is assessed on a\nComputed Tomography (CT) scan dataset containing both benign and malignant\nliver tumors. The proposed approach achieved high accuracy in predicting liver\ntumors, outperforming other state-of-the-art methods. Additionally, advanced\nattention mechanisms were incorporated into the CNN model to enable the\nidentification and highlighting of regions of the CT scans most relevant to\npredicting liver tumors. The results suggest that incorporating attention\nmechanisms and a depth-based variant search algorithm into the CNN model is a\npromising approach for improving the accuracy and robustness of liver tumor\nprediction. It can assist radiologists in their diagnosis and treatment\nplanning. The proposed system achieved a high accuracy of 95.5% in predicting\nliver tumors, outperforming other state-of-the-art methods.\n","authors":["P. Kalaiselvi","S. Anusuya"],"pdf_url":"https://arxiv.org/pdf/2311.11520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11512v1","updated":"2023-11-20T03:23:03Z","published":"2023-11-20T03:23:03Z","title":"Seeing through the Mask: Multi-task Generative Mask Decoupling Face\n Recognition","summary":" The outbreak of COVID-19 pandemic make people wear masks more frequently than\never. Current general face recognition system suffers from serious performance\ndegradation,when encountering occluded scenes. The potential reason is that\nface features are corrupted by occlusions on key facial regions. To tackle this\nproblem, previous works either extract identity-related embeddings on feature\nlevel by additional mask prediction, or restore the occluded facial part by\ngenerative models. However, the former lacks visual results for model\ninterpretation, while the latter suffers from artifacts which may affect\ndownstream recognition. Therefore, this paper proposes a Multi-task gEnerative\nmask dEcoupling face Recognition (MEER) network to jointly handle these two\ntasks, which can learn occlusionirrelevant and identity-related representation\nwhile achieving unmasked face synthesis. We first present a novel mask\ndecoupling module to disentangle mask and identity information, which makes the\nnetwork obtain purer identity features from visible facial components. Then, an\nunmasked face is restored by a joint-training strategy, which will be further\nused to refine the recognition network with an id-preserving loss. Experiments\non masked face recognition under realistic and synthetic occlusions benchmarks\ndemonstrate that the MEER can outperform the state-ofthe-art methods.\n","authors":["Zhaohui Wang","Sufang Zhang","Jianteng Peng","Xinyi Wang","Yandong Guo"],"pdf_url":"https://arxiv.org/pdf/2311.11512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02783v2","updated":"2023-11-20T03:15:27Z","published":"2023-07-06T05:22:20Z","title":"UIT-Saviors at MEDVQA-GI 2023: Improving Multimodal Learning with Image\n Enhancement for Gastrointestinal Visual Question Answering","summary":" In recent years, artificial intelligence has played an important role in\nmedicine and disease diagnosis, with many applications to be mentioned, one of\nwhich is Medical Visual Question Answering (MedVQA). 
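The liver-tumor entry above folds advanced attention mechanisms into a CNN to highlight the most relevant CT regions. As a generic illustration of what channel attention can look like in such a pipeline, below is a squeeze-and-excitation style block; it is a common pattern offered as an assumption, not the paper's CNN-DS-AM module.

```python
# A squeeze-and-excitation style channel-attention block for CNN feature maps.
import torch
import torch.nn as nn

class ChannelAttention(nn.Module):
    def __init__(self, channels, reduction=8):
        super().__init__()
        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool2d(1),                    # squeeze: global context
            nn.Conv2d(channels, channels // reduction, 1),
            nn.ReLU(inplace=True),
            nn.Conv2d(channels // reduction, channels, 1),
            nn.Sigmoid(),                               # per-channel weights in (0, 1)
        )

    def forward(self, x):
        return x * self.gate(x)                         # re-weight feature channels

x = torch.randn(1, 32, 64, 64)                          # e.g. CT feature maps
print(ChannelAttention(32)(x).shape)                    # torch.Size([1, 32, 64, 64])
```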
By combining computer\nvision and natural language processing, MedVQA systems can assist experts in\nextracting relevant information from medical image based on a given question\nand providing precise diagnostic answers. The ImageCLEFmed-MEDVQA-GI-2023\nchallenge carried out visual question answering task in the gastrointestinal\ndomain, which includes gastroscopy and colonoscopy images. Our team approached\nTask 1 of the challenge by proposing a multimodal learning method with image\nenhancement to improve the VQA performance on gastrointestinal images. The\nmultimodal architecture is set up with BERT encoder and different pre-trained\nvision models based on convolutional neural network (CNN) and Transformer\narchitecture for features extraction from question and endoscopy image. The\nresult of this study highlights the dominance of Transformer-based vision\nmodels over the CNNs and demonstrates the effectiveness of the image\nenhancement process, with six out of the eight vision models achieving better\nF1-Score. Our best method, which takes advantages of BERT+BEiT fusion and image\nenhancement, achieves up to 87.25% accuracy and 91.85% F1-Score on the\ndevelopment test set, while also producing good result on the private test set\nwith accuracy of 82.01%.\n","authors":["Triet M. Thai","Anh T. Vo","Hao K. Tieu","Linh N. P. Bui","Thien T. B. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2307.02783v2.pdf","comment":"ImageCLEF2023 published version:\n https://ceur-ws.org/Vol-3497/paper-129.pdf"},{"id":"http://arxiv.org/abs/2310.19909v2","updated":"2023-11-20T03:05:50Z","published":"2023-10-30T18:23:58Z","title":"Battle of the Backbones: A Large-Scale Comparison of Pretrained Models\n across Computer Vision Tasks","summary":" Neural network based computer vision systems are typically built on a\nbackbone, a pretrained or randomly initialized feature extractor. Several years\nago, the default option was an ImageNet-trained convolutional neural network.\nHowever, the recent past has seen the emergence of countless backbones\npretrained using various algorithms and datasets. While this abundance of\nchoice has led to performance increases for a range of systems, it is difficult\nfor practitioners to make informed decisions about which backbone to choose.\nBattle of the Backbones (BoB) makes this choice easier by benchmarking a\ndiverse suite of pretrained models, including vision-language models, those\ntrained via self-supervised learning, and the Stable Diffusion backbone, across\na diverse set of computer vision tasks ranging from classification to object\ndetection to OOD generalization and more. Furthermore, BoB sheds light on\npromising directions for the research community to advance computer vision by\nilluminating strengths and weakness of existing approaches through a\ncomprehensive analysis conducted on more than 1500 training runs. While vision\ntransformers (ViTs) and self-supervised learning (SSL) are increasingly\npopular, we find that convolutional neural networks pretrained in a supervised\nfashion on large training sets still perform best on most tasks among the\nmodels we consider. Moreover, in apples-to-apples comparisons on the same\narchitectures and similarly sized pretraining datasets, we find that SSL\nbackbones are highly competitive, indicating that future works should perform\nSSL pretraining with advanced architectures and larger pretraining datasets. 
We\nrelease the raw results of our experiments along with code that allows\nresearchers to put their own backbones through the gauntlet here:\nhttps://github.com/hsouri/Battle-of-the-Backbones\n","authors":["Micah Goldblum","Hossein Souri","Renkun Ni","Manli Shu","Viraj Prabhu","Gowthami Somepalli","Prithvijit Chattopadhyay","Mark Ibrahim","Adrien Bardes","Judy Hoffman","Rama Chellappa","Andrew Gordon Wilson","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2310.19909v2.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2107.11851v2","updated":"2023-11-20T02:14:36Z","published":"2021-07-25T17:24:50Z","title":"Transcript to Video: Efficient Clip Sequencing from Texts","summary":" Among numerous videos shared on the web, well-edited ones always attract more\nattention. However, it is difficult for inexperienced users to make well-edited\nvideos because it requires professional expertise and immense manual labor. To\nmeet the demands for non-experts, we present Transcript-to-Video -- a\nweakly-supervised framework that uses texts as input to automatically create\nvideo sequences from an extensive collection of shots. Specifically, we propose\na Content Retrieval Module and a Temporal Coherent Module to learn\nvisual-language representations and model shot sequencing styles, respectively.\nFor fast inference, we introduce an efficient search strategy for real-time\nvideo clip sequencing. Quantitative results and user studies demonstrate\nempirically that the proposed learning framework can retrieve content-relevant\nshots while creating plausible video sequences in terms of style. Besides, the\nrun-time performance analysis shows that our framework can support real-world\napplications.\n","authors":["Yu Xiong","Fabian Caba Heilbron","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2107.11851v2.pdf","comment":"Tech Report; Demo and project page at\n http://www.xiongyu.me/projects/transcript2video/"},{"id":"http://arxiv.org/abs/2311.09574v3","updated":"2023-11-20T02:01:33Z","published":"2023-11-16T05:17:14Z","title":"LymphoML: An interpretable artificial intelligence-based method\n identifies morphologic features that correlate with lymphoma subtype","summary":" The accurate classification of lymphoma subtypes using hematoxylin and eosin\n(H&E)-stained tissue is complicated by the wide range of morphological features\nthese cancers can exhibit. We present LymphoML - an interpretable machine\nlearning method that identifies morphologic features that correlate with\nlymphoma subtypes. Our method applies steps to process H&E-stained tissue\nmicroarray cores, segment nuclei and cells, compute features encompassing\nmorphology, texture, and architecture, and train gradient-boosted models to\nmake diagnostic predictions. LymphoML's interpretable models, developed on a\nlimited volume of H&E-stained tissue, achieve non-inferior diagnostic accuracy\nto pathologists using whole-slide images and outperform black box deep-learning\non a dataset of 670 cases from Guatemala spanning 8 lymphoma subtypes. Using\nSHapley Additive exPlanation (SHAP) analysis, we assess the impact of each\nfeature on model prediction and find that nuclear shape features are most\ndiscriminative for DLBCL (F1-score: 78.7%) and classical Hodgkin lymphoma\n(F1-score: 74.5%). 
Finally, we provide the first demonstration that a model\ncombining features from H&E-stained tissue with features from a standardized\npanel of 6 immunostains results in a similar diagnostic accuracy (85.3%) to a\n46-stain panel (86.1%).\n","authors":["Vivek Shankar","Xiaoli Yang","Vrishab Krishna","Brent Tan","Oscar Silva","Rebecca Rojansky","Andrew Ng","Fabiola Valvert","Edward Briercheck","David Weinstock","Yasodha Natkunam","Sebastian Fernandez-Pol","Pranav Rajpurkar"],"pdf_url":"https://arxiv.org/pdf/2311.09574v3.pdf","comment":"To be published in Proceedings of the 3rd Machine Learning for Health\n symposium, Proceedings of Machine Learning Research (PMLR)"},{"id":"http://arxiv.org/abs/2305.10808v2","updated":"2023-11-20T02:00:02Z","published":"2023-05-18T08:42:41Z","title":"Manifold-Aware Self-Training for Unsupervised Domain Adaptation on\n Regressing 6D Object Pose","summary":" Domain gap between synthetic and real data in visual regression (e.g. 6D pose\nestimation) is bridged in this paper via global feature alignment and local\nrefinement on the coarse classification of discretized anchor classes in target\nspace, which imposes a piece-wise target manifold regularization into\ndomain-invariant representation learning. Specifically, our method incorporates\nan explicit self-supervised manifold regularization, revealing consistent\ncumulative target dependency across domains, to a self-training scheme (e.g.\nthe popular Self-Paced Self-Training) to encourage more discriminative\ntransferable representations of regression tasks. Moreover, learning unified\nimplicit neural functions to estimate relative direction and distance of\ntargets to their nearest class bins aims to refine target classification\npredictions, which can gain robust performance against inconsistent feature\nscaling sensitive to UDA regressors. Experiment results on three public\nbenchmarks of the challenging 6D pose estimation task can verify the\neffectiveness of our method, consistently achieving superior performance to the\nstate-of-the-art for UDA on 6D pose estimation.\n","authors":["Yichen Zhang","Jiehong Lin","Ke Chen","Zelin Xu","Yaowei Wang","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2305.10808v2.pdf","comment":"Accepted by IJCAI 2023"},{"id":"http://arxiv.org/abs/2311.10251v2","updated":"2023-11-20T01:59:11Z","published":"2023-11-17T00:44:56Z","title":"UniMOS: A Universal Framework For Multi-Organ Segmentation Over\n Label-Constrained Datasets","summary":" Machine learning models for medical images can help physicians diagnose and\nmanage diseases. However, due to the fact that medical image annotation\nrequires a great deal of manpower and expertise, as well as the fact that\nclinical departments perform image annotation based on task orientation, there\nis the problem of having fewer medical image annotation data with more\nunlabeled data and having many datasets that annotate only a single organ. In\nthis paper, we present UniMOS, the first universal framework for achieving the\nutilization of fully and partially labeled images as well as unlabeled images.\nSpecifically, we construct a Multi-Organ Segmentation (MOS) module over\nfully/partially labeled data as the basenet and designed a new target adaptive\nloss. 
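The LymphoML entry above reports SHAP analysis of gradient-boosted models to rank morphologic features. A minimal, generic example of that workflow is sketched below, assuming the `shap` package and synthetic data rather than the study's features or models.

```python
# SHAP feature attribution for a gradient-boosted classifier on synthetic data.
import numpy as np
import shap
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
model = GradientBoostingClassifier(random_state=0).fit(X, y)

explainer = shap.TreeExplainer(model)
vals = np.asarray(explainer.shap_values(X))   # per-sample feature attributions
if vals.ndim == 3:                            # some versions return one array per class
    vals = vals[-1]
mean_impact = np.abs(vals).mean(axis=0)       # average |SHAP| per feature
print(mean_impact.argsort()[::-1][:3])        # three most influential features
```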
Furthermore, we incorporate a semi-supervised training module that\ncombines consistent regularization and pseudolabeling techniques on unlabeled\ndata, which significantly improves the segmentation of unlabeled data.\nExperiments show that the framework exhibits excellent performance in several\nmedical image segmentation tasks compared to other advanced methods, and also\nsignificantly improves data utilization and reduces annotation cost. Code and\nmodels are available at: https://github.com/lw8807001/UniMOS.\n","authors":["Can Li","Sheng Shao","Junyi Qu","Shuchao Pang","Mehmet A. Orgun"],"pdf_url":"https://arxiv.org/pdf/2311.10251v2.pdf","comment":"Accepted by BIBM2023"},{"id":"http://arxiv.org/abs/2311.11477v1","updated":"2023-11-20T01:07:30Z","published":"2023-11-20T01:07:30Z","title":"What's left can't be right -- The remaining positional incompetence of\n contrastive vision-language models","summary":" Contrastive vision-language models like CLIP have been found to lack spatial\nunderstanding capabilities. In this paper we discuss the possible causes of\nthis phenomenon by analysing both datasets and embedding space. By focusing on\nsimple left-right positional relations, we show that this behaviour is entirely\npredictable, even with large-scale datasets, demonstrate that these relations\ncan be taught using synthetic data and show that this approach can generalise\nwell to natural images - improving the performance on left-right relations on\nVisual Genome Relations.\n","authors":["Nils Hoehing","Ellen Rushe","Anthony Ventresque"],"pdf_url":"https://arxiv.org/pdf/2311.11477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15308v2","updated":"2023-11-20T00:56:15Z","published":"2023-10-23T19:21:57Z","title":"SAM-CLIP: Merging Vision Foundation Models towards Semantic and Spatial\n Understanding","summary":" The landscape of publicly available vision foundation models (VFMs), such as\nCLIP and Segment Anything Model (SAM), is expanding rapidly. VFMs are endowed\nwith distinct capabilities stemming from their pre-training objectives. For\ninstance, CLIP excels in semantic understanding, while SAM specializes in\nspatial understanding for segmentation. In this work, we introduce a simple\nrecipe to efficiently merge VFMs into a unified model that absorbs their\nexpertise. Our method integrates techniques of multi-task learning, continual\nlearning, and distillation. Further, it demands significantly less\ncomputational cost compared to traditional multi-task training from scratch,\nand it only needs a small fraction of the pre-training datasets that were\ninitially used to train individual models. By applying our method to SAM and\nCLIP, we obtain SAM-CLIP: a unified model that combines the capabilities of SAM\nand CLIP into a single vision transformer. Compared with deploying SAM and CLIP\nindependently, our merged model, SAM-CLIP, reduces storage and compute costs\nfor inference, making it well-suited for edge device applications. We show that\nSAM-CLIP not only retains the foundational strengths of SAM and CLIP, but also\nintroduces synergistic functionalities, notably in zero-shot semantic\nsegmentation, where SAM-CLIP establishes new state-of-the-art results on 5\nbenchmarks. 
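The UniMOS summary above combines consistency regularization with pseudo-labeling on unlabeled data. The sketch below shows one common form of confidence-thresholded pseudo-labeling, where only confident teacher predictions supervise the student and low-confidence voxels are ignored; the threshold, shapes, and teacher/student setup are illustrative assumptions, not the paper's module.

```python
# Confidence-thresholded pseudo-labelling for unlabeled segmentation data.
import torch
import torch.nn.functional as F

def pseudo_label_loss(student_logits, teacher_logits, threshold=0.9):
    """Cross-entropy on unlabeled data using only confident teacher predictions."""
    with torch.no_grad():
        probs = teacher_logits.softmax(dim=1)          # (B, C, H, W)
        conf, pseudo = probs.max(dim=1)                 # per-voxel confidence / label
        pseudo[conf < threshold] = -1                   # mark unreliable voxels
    return F.cross_entropy(student_logits, pseudo, ignore_index=-1)

student = torch.randn(2, 4, 16, 16, requires_grad=True)   # 4-organ logits
teacher = torch.randn(2, 4, 16, 16)                        # e.g. an EMA teacher
loss = pseudo_label_loss(student, teacher, threshold=0.5)
loss.backward()
print(float(loss))
```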
It outperforms previous models that are specifically designed for\nthis task by a large margin, including +6.8% and +5.9% mean IoU improvement on\nPascal-VOC and COCO-Stuff datasets, respectively.\n","authors":["Haoxiang Wang","Pavan Kumar Anasosalu Vasu","Fartash Faghri","Raviteja Vemulapalli","Mehrdad Farajtabar","Sachin Mehta","Mohammad Rastegari","Oncel Tuzel","Hadi Pouransari"],"pdf_url":"https://arxiv.org/pdf/2310.15308v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12225v1","updated":"2023-11-20T22:43:07Z","published":"2023-11-20T22:43:07Z","title":"HandSight: DeCAF & Improved Fisher Vectors to Classify Clothing Color\n and Texture with a Finger-Mounted Camera","summary":" We demonstrate the use of DeCAF and Improved Fisher Vector image features to\nclassify clothing texture. The issue of choosing clothes is a problem for the\nblind every day. This work attempts to solve the issue with a finger-mounted\ncamera and state-of-the-art classification algorithms. To evaluate our\nsolution, we collected 520 close-up images across 29 pieces of clothing. We\ncontribute (1) the HCTD, an image dataset taken with a NanEyeGS camera, a\ncamera small enough to be mounted on the finger, and (2) evaluations of\nstate-of-the-art recognition algorithms applied to our dataset - achieving an\naccuracy >95%. Throughout the paper, we will discuss previous work, evaluate\nthe current work, and finally, suggest the project's future direction.\n","authors":["Alexander J. Medeiros","Lee Stearns","Jon E. Froehlich"],"pdf_url":"https://arxiv.org/pdf/2311.12225v1.pdf","comment":"10 pages, 15 figures"},{"id":"http://arxiv.org/abs/2311.06031v2","updated":"2023-11-20T22:25:15Z","published":"2023-11-10T12:38:16Z","title":"Diagonal Hierarchical Consistency Learning for Semi-supervised Medical\n Image Segmentation","summary":" Medical image segmentation, which is essential for many clinical\napplications, has achieved almost human-level performance via data-driven deep\nlearning techniques. Nevertheless, its performance is predicated upon the\ncostly process of manually annotating a vast amount of medical images. To this\nend, we propose a novel framework for robust semi-supervised medical image\nsegmentation using diagonal hierarchical consistency learning (DiHC-Net).\nFirst, it is composed of multiple sub-models with identical multi-scale\narchitecture but with distinct sub-layers, such as up-sampling and\nnormalisation layers. Second, along with mutual consistency, a novel diagonal\nhierarchical consistency is enforced between one model's intermediate and final\nprediction and other models' soft pseudo labels in a diagonal hierarchical\nfashion. Experimental results verify the efficacy of our simple framework,\noutperforming all previous approaches on public Left Atrium (LA) dataset.\n","authors":["Heejoon Koo"],"pdf_url":"https://arxiv.org/pdf/2311.06031v2.pdf","comment":"5 pages, 2 figures, and 2 tables. Corrected typos and errors"},{"id":"http://arxiv.org/abs/2311.12202v1","updated":"2023-11-20T21:43:32Z","published":"2023-11-20T21:43:32Z","title":"Nepotistically Trained Generative-AI Models Collapse","summary":" Trained on massive amounts of human-generated content, AI (artificial\nintelligence) image synthesis is capable of reproducing semantically coherent\nimages that match the visual appearance of its training data. We show that when\nretrained on even small amounts of their own creation, these generative-AI\nmodels produce highly distorted images. 
We also show that this distortion\nextends beyond the text prompts used in retraining, and that once poisoned, the\nmodels struggle to fully heal even after retraining on only real images.\n","authors":["Matyas Bohacek","Hany Farid"],"pdf_url":"https://arxiv.org/pdf/2311.12202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12198v1","updated":"2023-11-20T21:34:52Z","published":"2023-11-20T21:34:52Z","title":"PhysGaussian: Physics-Integrated 3D Gaussians for Generative Dynamics","summary":" We introduce PhysGaussian, a new method that seamlessly integrates physically\ngrounded Newtonian dynamics within 3D Gaussians to achieve high-quality novel\nmotion synthesis. Employing a custom Material Point Method (MPM), our approach\nenriches 3D Gaussian kernels with physically meaningful kinematic deformation\nand mechanical stress attributes, all evolved in line with continuum mechanics\nprinciples. A defining characteristic of our method is the seamless integration\nbetween physical simulation and visual rendering: both components utilize the\nsame 3D Gaussian kernels as their discrete representations. This negates the\nnecessity for triangle/tetrahedron meshing, marching cubes, \"cage meshes,\" or\nany other geometry embedding, highlighting the principle of \"what you see is\nwhat you simulate (WS$^2$).\" Our method demonstrates exceptional versatility\nacross a wide variety of materials--including elastic entities, metals,\nnon-Newtonian fluids, and granular materials--showcasing its strong\ncapabilities in creating diverse visual content with novel viewpoints and\nmovements. Our project page is at: https://xpandora.github.io/PhysGaussian/\n","authors":["Tianyi Xie","Zeshun Zong","Yuxin Qiu","Xuan Li","Yutao Feng","Yin Yang","Chenfanfu Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.12198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08577v2","updated":"2023-11-20T21:32:27Z","published":"2023-11-14T22:46:01Z","title":"Finding AI-Generated Faces in the Wild","summary":" AI-based image generation has continued to rapidly improve, producing\nincreasingly more realistic images with fewer obvious visual flaws.\nAI-generated images are being used to create fake online profiles which in turn\nare being used for spam, fraud, and disinformation campaigns. As the general\nproblem of detecting any type of manipulated or synthesized content is\nreceiving increasing attention, here we focus on a more narrow task of\ndistinguishing a real face from an AI-generated face. This is particularly\napplicable when tackling inauthentic online accounts with a fake user profile\nphoto. We show that by focusing on only faces, a more resilient and\ngeneral-purpose artifact can be detected that allows for the detection of\nAI-generated faces from a variety of GAN- and diffusion-based synthesis\nengines, and across image resolutions (as low as 128 x 128 pixels) and\nqualities.\n","authors":["Gonzalo J. Aniano Porcile","Jack Gindi","Shivansh Mundra","James R. Verbus","Hany Farid"],"pdf_url":"https://arxiv.org/pdf/2311.08577v2.pdf","comment":"Removed anonymization of the LinkedIn platform"},{"id":"http://arxiv.org/abs/2311.12194v1","updated":"2023-11-20T21:20:37Z","published":"2023-11-20T21:20:37Z","title":"DiffAvatar: Simulation-Ready Garment Optimization with Differentiable\n Simulation","summary":" The realism of digital avatars is crucial in enabling telepresence\napplications with self-expression and customization. 
A key aspect of this\nrealism originates from the physical accuracy of both a true-to-life body shape\nand clothing. While physical simulations can produce high-quality, realistic\nmotions for clothed humans, they require precise estimation of body shape and\nhigh-quality garment assets with associated physical parameters for cloth\nsimulations. However, manually creating these assets and calibrating their\nparameters is labor-intensive and requires specialized expertise. To address\nthis gap, we propose DiffAvatar, a novel approach that performs body and\ngarment co-optimization using differentiable simulation. By integrating\nphysical simulation into the optimization loop and accounting for the complex\nnonlinear behavior of cloth and its intricate interaction with the body, our\nframework recovers body and garment geometry and extracts important material\nparameters in a physically plausible way. Our experiments demonstrate that our\napproach generates realistic clothing and body shape that can be easily used in\ndownstream applications.\n","authors":["Yifei Li","Hsiao-yu Chen","Egor Larionov","Nikolaos Sarafianos","Wojciech Matusik","Tuur Stuyck"],"pdf_url":"https://arxiv.org/pdf/2311.12194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12193v1","updated":"2023-11-20T21:20:15Z","published":"2023-11-20T21:20:15Z","title":"Disentangling Structure and Appearance in ViT Feature Space","summary":" We present a method for semantically transferring the visual appearance of\none natural image to another. Specifically, our goal is to generate an image in\nwhich objects in a source structure image are \"painted\" with the visual\nappearance of their semantically related objects in a target appearance image.\nTo integrate semantic information into our framework, our key idea is to\nleverage a pre-trained and fixed Vision Transformer (ViT) model. Specifically,\nwe derive novel disentangled representations of structure and appearance\nextracted from deep ViT features. We then establish an objective function that\nsplices the desired structure and appearance representations, interweaving them\ntogether in the space of ViT features. Based on our objective function, we\npropose two frameworks of semantic appearance transfer -- \"Splice\", which works\nby training a generator on a single and arbitrary pair of structure-appearance\nimages, and \"SpliceNet\", a feed-forward real-time appearance transfer model\ntrained on a dataset of images from a specific domain. Our frameworks do not\ninvolve adversarial training, nor do they require any additional input\ninformation such as semantic segmentation or correspondences. We demonstrate\nhigh-resolution results on a variety of in-the-wild image pairs, under\nsignificant variations in the number of objects, pose, and appearance. Code and\nsupplementary material are available in our project page: splice-vit.github.io.\n","authors":["Narek Tumanyan","Omer Bar-Tal","Shir Amir","Shai Bagon","Tali Dekel"],"pdf_url":"https://arxiv.org/pdf/2311.12193v1.pdf","comment":"Accepted to ACM Transactions on Graphics. arXiv admin note:\n substantial text overlap with arXiv:2201.00424"},{"id":"http://arxiv.org/abs/2209.08660v2","updated":"2023-11-20T21:07:44Z","published":"2022-09-18T21:29:58Z","title":"Learn the Time to Learn: Replay Scheduling in Continual Learning","summary":" Replay methods are known to be successful at mitigating catastrophic\nforgetting in continual learning scenarios despite having limited access to\nhistorical data. 
However, storing historical data is cheap in many real-world\nsettings, yet replaying all historical data is often prohibited due to\nprocessing time constraints. In such settings, we propose that continual\nlearning systems should learn the time to learn and schedule which tasks to\nreplay at different time steps. We first demonstrate the benefits of our\nproposal by using Monte Carlo tree search to find a proper replay schedule, and\nshow that the found replay schedules can outperform fixed scheduling policies\nwhen combined with various replay methods in different continual learning\nsettings. Additionally, we propose a framework for learning replay scheduling\npolicies with reinforcement learning. We show that the learned policies can\ngeneralize better in new continual learning scenarios compared to equally\nreplaying all seen tasks, without added computational cost. Our study reveals\nthe importance of learning the time to learn in continual learning, which\nbrings current research closer to real-world needs.\n","authors":["Marcus Klasson","Hedvig Kjellström","Cheng Zhang"],"pdf_url":"https://arxiv.org/pdf/2209.08660v2.pdf","comment":"Published in TMLR (2023)"},{"id":"http://arxiv.org/abs/2311.02332v3","updated":"2023-11-20T20:55:29Z","published":"2023-11-04T05:42:51Z","title":"Multimodal Machine Learning in Image-Based and Clinical Biomedicine:\n Survey and Prospects","summary":" Machine learning (ML) applications in medical artificial intelligence (AI)\nsystems have shifted from traditional and statistical methods to increasing\napplication of deep learning models. This survey navigates the current\nlandscape of multimodal ML, focusing on its profound impact on medical image\nanalysis and clinical decision support systems. Emphasizing challenges and\ninnovations in addressing multimodal representation, fusion, translation,\nalignment, and co-learning, the paper explores the transformative potential of\nmultimodal models for clinical predictions. It also questions practical\nimplementation of such models, bringing attention to the dynamics between\ndecision support systems and healthcare providers. Despite advancements,\nchallenges such as data biases and the scarcity of \"big data\" in many\nbiomedical domains persist. We conclude with a discussion on effective\ninnovation and collaborative efforts to further the miss\n","authors":["Elisa Warner","Joonsang Lee","William Hsu","Tanveer Syeda-Mahmood","Charles Kahn","Olivier Gevaert","Arvind Rao"],"pdf_url":"https://arxiv.org/pdf/2311.02332v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12174v1","updated":"2023-11-20T20:40:24Z","published":"2023-11-20T20:40:24Z","title":"LABELMAKER: Automatic Semantic Label Generation from RGB-D Trajectories","summary":" Semantic annotations are indispensable to train or evaluate perception\nmodels, yet very costly to acquire. This work introduces a fully automated\n2D/3D labeling framework that, without any human intervention, can generate\nlabels for RGB-D scans at equal (or better) level of accuracy than comparable\nmanually annotated datasets such as ScanNet. Our approach is based on an\nensemble of state-of-the-art segmentation models and 3D lifting through neural\nrendering. We demonstrate the effectiveness of our LabelMaker pipeline by\ngenerating significantly better labels for the ScanNet datasets and\nautomatically labelling the previously unlabeled ARKitScenes dataset. 
Code and\nmodels are available at https://labelmaker.org\n","authors":["Silvan Weder","Hermann Blum","Francis Engelmann","Marc Pollefeys"],"pdf_url":"https://arxiv.org/pdf/2311.12174v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12161v1","updated":"2023-11-20T20:27:42Z","published":"2023-11-20T20:27:42Z","title":"ChemScraper: Graphics Extraction, Molecular Diagram Parsing, and\n Annotated Data Generation for PDF Images","summary":" Existing visual parsers for molecule diagrams translate pixel-based raster\nimages such as PNGs to chemical structure representations (e.g., SMILES).\nHowever, PDFs created by word processors including \\LaTeX{} and Word provide\nexplicit locations and shapes for characters, lines, and polygons. We\nintroduce a method to extract symbols from born-digital PDF molecule images\nand then apply simple graph transformations to capture both visual and chemical\nstructure in editable ChemDraw files (CDXML). Our fast (PDF $\\rightarrow$\nvisual graph $\\rightarrow$ chemical graph) pipeline does not require GPUs,\nOptical Character Recognition (OCR) or vectorization. We evaluate on standard\nbenchmarks using SMILES strings, along with a novel evaluation that provides\ngraph-based metrics and error compilation using LgEval. The geometric\ninformation in born-digital PDFs produces a highly accurate parser, motivating\ngenerating training data for visual parsers that recognize from raster images,\nwith extracted graphics, visual structure, and chemical structure as\nannotations. To do this we render SMILES strings in Indigo, parse molecule\nstructure, and then validate recognized structure to select correct files.\n","authors":["Ayush Kumar Shah","Bryan Manrique Amador","Abhisek Dey","Ming Creekmore","Blake Ocampo","Scott Denmark","Richard Zanibbi"],"pdf_url":"https://arxiv.org/pdf/2311.12161v1.pdf","comment":"20 pages without references, 10 figures, 3 Tables, submitted to\n International Journal on Document Analysis and Recognition (IJDAR)"},{"id":"http://arxiv.org/abs/2311.12159v1","updated":"2023-11-20T20:24:45Z","published":"2023-11-20T20:24:45Z","title":"Conditional Modeling Based Automatic Video Summarization","summary":" The aim of video summarization is to shorten videos automatically while\nretaining the key information necessary to convey the overall story. Video\nsummarization methods mainly rely on visual factors, such as visual\nconsecutiveness and diversity, which may not be sufficient to fully understand\nthe content of the video. There are other non-visual factors, such as\ninterestingness, representativeness, and storyline consistency that should also\nbe considered for generating high-quality video summaries. Current methods do\nnot adequately take into account these non-visual factors, resulting in\nsuboptimal performance. In this work, a new approach to video summarization is\nproposed based on insights gained from how humans create ground truth video\nsummaries.
Extensive\nexperiments show that the proposed approach outperforms existing methods and\nachieves state-of-the-art performance on commonly used video summarization\ndatasets.\n","authors":["Jia-Hong Huang","Chao-Han Huck Yang","Pin-Yu Chen","Min-Hung Chen","Marcel Worring"],"pdf_url":"https://arxiv.org/pdf/2311.12159v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n arXiv admin note: substantial text overlap with arXiv:2305.00455"},{"id":"http://arxiv.org/abs/2311.12157v1","updated":"2023-11-20T20:22:55Z","published":"2023-11-20T20:22:55Z","title":"Model-aware 3D Eye Gaze from Weak and Few-shot Supervisions","summary":" The task of predicting 3D eye gaze from eye images can be performed either by\n(a) end-to-end learning for image-to-gaze mapping or by (b) fitting a 3D eye\nmodel onto images. The former case requires 3D gaze labels, while the latter\nrequires eye semantics or landmarks to facilitate the model fitting. Although\nobtaining eye semantics and landmarks is relatively easy, fitting an accurate\n3D eye model on them remains to be very challenging due to its ill-posed nature\nin general. On the other hand, obtaining large-scale 3D gaze data is cumbersome\ndue to the required hardware setups and computational demands. In this work, we\npropose to predict 3D eye gaze from weak supervision of eye semantic\nsegmentation masks and direct supervision of a few 3D gaze vectors. The\nproposed method combines the best of both worlds by leveraging large amounts of\nweak annotations--which are easy to obtain, and only a few 3D gaze\nvectors--which alleviate the difficulty of fitting 3D eye models on the\nsemantic segmentation of eye images. Thus, the eye gaze vectors, used in the\nmodel fitting, are directly supervised using the few-shot gaze labels.\nAdditionally, we propose a transformer-based network architecture, that serves\nas a solid baseline for our improvements. Our experiments in diverse settings\nillustrate the significant benefits of the proposed method, achieving about 5\ndegrees lower angular gaze error over the baseline, when only 0.05% 3D\nannotations of the training images are used. The source code is available at\nhttps://github.com/dimitris-christodoulou57/Model-aware_3D_Eye_Gaze.\n","authors":["Nikola Popovic","Dimitrios Christodoulou","Danda Pani Paudel","Xi Wang","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2311.12157v1.pdf","comment":"Accepted to ISMAR2023 as a poster paper"},{"id":"http://arxiv.org/abs/2311.12153v1","updated":"2023-11-20T20:09:48Z","published":"2023-11-20T20:09:48Z","title":"Uncertainty Estimation in Contrast-Enhanced MR Image Translation with\n Multi-Axis Fusion","summary":" In recent years, deep learning has been applied to a wide range of medical\nimaging and image processing tasks. In this work, we focus on the estimation of\nepistemic uncertainty for 3D medical image-to-image translation. We propose a\nnovel model uncertainty quantification method, Multi-Axis Fusion (MAF), which\nrelies on the integration of complementary information derived from multiple\nviews on volumetric image data. The proposed approach is applied to the task of\nsynthesizing contrast enhanced T1-weighted images based on native T1, T2 and\nT2-FLAIR scans. The quantitative findings indicate a strong correlation\n($\\rho_{\\text healthy} = 0.89$) between the mean absolute image synthetization\nerror and the mean uncertainty score for our MAF method. 
Hence, we consider MAF\nas a promising approach to solve the highly relevant task of detecting\nsynthetization failures at inference time.\n","authors":["Ivo M. Baltruschat","Parvaneh Janbakhshi","Melanie Dohmen","Matthias Lenga"],"pdf_url":"https://arxiv.org/pdf/2311.12153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12151v1","updated":"2023-11-20T20:03:34Z","published":"2023-11-20T20:03:34Z","title":"Teaching Robots to Build Simulations of Themselves","summary":" Simulation enables robots to plan and estimate the outcomes of prospective\nactions without the need to physically execute them. We introduce a\nself-supervised learning framework to enable robots to model and predict their\nmorphology, kinematics and motor control using only brief raw video data,\neliminating the need for extensive real-world data collection and kinematic\npriors. By observing their own movements, akin to humans watching their\nreflection in a mirror, robots learn an ability to simulate themselves and\npredict their spatial motion for various tasks. Our results demonstrate that\nthis self-learned simulation not only enables accurate motion planning but also\nallows the robot to detect abnormalities and recover from damage.\n","authors":["Yuhang Hu","Jiong Lin","Hod Lipson"],"pdf_url":"https://arxiv.org/pdf/2311.12151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09248v2","updated":"2023-11-20T19:48:39Z","published":"2023-04-13T22:04:30Z","title":"Real-Time Helmet Violation Detection in AI City Challenge 2023 with\n Genetic Algorithm-Enhanced YOLOv5","summary":" This research focuses on real-time surveillance systems as a means for\ntackling the issue of non-compliance with helmet regulations, a practice that\nconsiderably amplifies the risk for motorcycle drivers or riders. Despite the\nwell-established advantages of helmet usage, achieving widespread compliance\nremains challenging due to diverse contributing factors. To effectively address\nthis concern, real-time monitoring and enforcement of helmet laws have been\nproposed as a plausible solution. However, previous attempts at real-time\nhelmet violation detection have been hindered by their limited ability to\noperate in real-time. To overcome this limitation, the current paper introduces\na novel real-time helmet violation detection system that utilizes the YOLOv5\nsingle-stage object detection model. This model is trained on the NVIDIA\nAI City Challenge 2023 Track 5 dataset. The optimal hyperparameters for\ntraining the model are determined using genetic algorithms. Additionally, data\naugmentation and various sampling techniques are implemented to enhance the\nmodel's performance. The efficacy of the models is evaluated using precision,\nrecall, and mean Average Precision (mAP) metrics. The results demonstrate\nimpressive precision, recall, and mAP scores of 0.848, 0.599, and 0.641,\nrespectively, for the training data. Furthermore, the model achieves a notable mAP\nscore of 0.6667 for the test datasets, leading to a commendable 4th place rank\nin the public leaderboard. This innovative approach represents a notable\nbreakthrough in the field and holds immense potential to substantially enhance\nmotorcycle safety.
By enabling real-time monitoring and enforcement\ncapabilities, this system has the capacity to contribute towards increased\ncompliance with helmet laws, thereby effectively reducing the risks faced by\nmotorcycle riders and passengers.\n","authors":["Elham Soltanikazemi","Ashwin Dhakal","Bijaya Kumar Hatuwal","Imad Eddine Toubal","Armstrong Aboah","Kannappan Palaniappan"],"pdf_url":"https://arxiv.org/pdf/2304.09248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12144v1","updated":"2023-11-20T19:45:27Z","published":"2023-11-20T19:45:27Z","title":"Applications of Large Scale Foundation Models for Autonomous Driving","summary":" Since DARPA Grand Challenges (rural) in 2004/05 and Urban Challenges in 2007,\nautonomous driving has been the most active field of AI applications. Recently\npowered by large language models (LLMs), chat systems, such as chatGPT and\nPaLM, emerge and rapidly become a promising direction to achieve artificial\ngeneral intelligence (AGI) in natural language processing (NLP). There comes a\nnatural thinking that we could employ these abilities to reformulate autonomous\ndriving. By combining LLM with foundation models, it is possible to utilize the\nhuman knowledge, commonsense and reasoning to rebuild autonomous driving\nsystems from the current long-tailed AI dilemma. In this paper, we investigate\nthe techniques of foundation models and LLMs applied for autonomous driving,\ncategorized as simulation, world model, data annotation and planning or E2E\nsolutions etc.\n","authors":["Yu Huang","Yue Chen","Zhu Li"],"pdf_url":"https://arxiv.org/pdf/2311.12144v1.pdf","comment":"42 pages"},{"id":"http://arxiv.org/abs/2311.12128v1","updated":"2023-11-20T19:11:16Z","published":"2023-11-20T19:11:16Z","title":"Fingerspelling PoseNet: Enhancing Fingerspelling Translation with\n Pose-Based Transformer Models","summary":" We address the task of American Sign Language fingerspelling translation\nusing videos in the wild. We exploit advances in more accurate hand pose\nestimation and propose a novel architecture that leverages the transformer\nbased encoder-decoder model enabling seamless contextual word translation. The\ntranslation model is augmented by a novel loss term that accurately predicts\nthe length of the finger-spelled word, benefiting both training and inference.\nWe also propose a novel two-stage inference approach that re-ranks the\nhypotheses using the language model capabilities of the decoder. Through\nextensive experiments, we demonstrate that our proposed method outperforms the\nstate-of-the-art models on ChicagoFSWild and ChicagoFSWild+ achieving more than\n10% relative improvement in performance. Our findings highlight the\neffectiveness of our approach and its potential to advance fingerspelling\nrecognition in sign language translation. Code is also available at\nhttps://github.com/pooyafayyaz/Fingerspelling-PoseNet.\n","authors":["Pooya Fayyazsanavi","Negar Nejatishahidin","Jana Kosecka"],"pdf_url":"https://arxiv.org/pdf/2311.12128v1.pdf","comment":"WACV 2024"},{"id":"http://arxiv.org/abs/2311.12125v1","updated":"2023-11-20T19:05:57Z","published":"2023-11-20T19:05:57Z","title":"Mixing-Denoising Generalizable Occupancy Networks","summary":" While current state-of-the-art generalizable implicit neural shape models\nrely on the inductive bias of convolutions, it is still not entirely clear how\nproperties emerging from such biases are compatible with the task of 3D\nreconstruction from point cloud. 
We explore an alternative approach to\ngeneralizability in this context. We relax the intrinsic model bias (i.e. using\nMLPs to encode local features as opposed to convolutions) and constrain the\nhypothesis space instead with an auxiliary regularization related to the\nreconstruction task, i.e. denoising. The resulting model is the first only-MLP\nlocally conditioned implicit shape reconstruction from point cloud network with\nfast feed forward inference. Point cloud borne features and denoising offsets\nare predicted from an exclusively MLP-made network in a single forward pass. A\ndecoder predicts occupancy probabilities for queries anywhere in space by\npooling nearby features from the point cloud order-invariantly, guided by\ndenoised relative positional encoding. We outperform the state-of-the-art\nconvolutional method while using half the number of model parameters.\n","authors":["Amine Ouasfi","Adnane Boukhayma"],"pdf_url":"https://arxiv.org/pdf/2311.12125v1.pdf","comment":"3DV 2024"},{"id":"http://arxiv.org/abs/2308.01981v3","updated":"2023-11-20T19:04:02Z","published":"2023-08-03T18:28:50Z","title":"CartiMorph: a framework for automated knee articular cartilage\n morphometrics","summary":" We introduce CartiMorph, a framework for automated knee articular cartilage\nmorphometrics. It takes an image as input and generates quantitative metrics\nfor cartilage subregions, including the percentage of full-thickness cartilage\nloss (FCL), mean thickness, surface area, and volume. CartiMorph leverages the\npower of deep learning models for hierarchical image feature representation.\nDeep learning models were trained and validated for tissue segmentation,\ntemplate construction, and template-to-image registration. We established\nmethods for surface-normal-based cartilage thickness mapping, FCL estimation,\nand rule-based cartilage parcellation. Our cartilage thickness map showed less\nerror in thin and peripheral regions. We evaluated the effectiveness of the\nadopted segmentation model by comparing the quantitative metrics obtained from\nmodel segmentation and those from manual segmentation. The root-mean-squared\ndeviation of the FCL measurements was less than 8%, and strong correlations\nwere observed for the mean thickness (Pearson's correlation coefficient $\\rho\n\\in [0.82,0.97]$), surface area ($\\rho \\in [0.82,0.98]$) and volume ($\\rho \\in\n[0.89,0.98]$) measurements. We compared our FCL measurements with those from a\nprevious study and found that our measurements deviated less from the ground\ntruths. We observed superior performance of the proposed rule-based cartilage\nparcellation method compared with the atlas-based approach. CartiMorph has the\npotential to promote imaging biomarkers discovery for knee osteoarthritis.\n","authors":["Yongcheng Yao","Junru Zhong","Liping Zhang","Sheheryar Khan","Weitian Chen"],"pdf_url":"https://arxiv.org/pdf/2308.01981v3.pdf","comment":"This preprint is an proofread version of a paper published in Medical\n Image Analysis (2023), which can be found at\n https://doi.org/10.1016/j.media.2023.103035"},{"id":"http://arxiv.org/abs/2311.12092v1","updated":"2023-11-20T18:59:01Z","published":"2023-11-20T18:59:01Z","title":"Concept Sliders: LoRA Adaptors for Precise Control in Diffusion Models","summary":" We present a method to create interpretable concept sliders that enable\nprecise control over attributes in image generations from diffusion models. 
Our\napproach identifies a low-rank parameter direction corresponding to one concept\nwhile minimizing interference with other attributes. A slider is created using\na small set of prompts or sample images; thus slider directions can be created\nfor either textual or visual concepts. Concept Sliders are plug-and-play: they\ncan be composed efficiently and continuously modulated, enabling precise\ncontrol over image generation. In quantitative experiments comparing to\nprevious editing techniques, our sliders exhibit stronger targeted edits with\nlower interference. We showcase sliders for weather, age, styles, and\nexpressions, as well as slider compositions. We show how sliders can transfer\nlatents from StyleGAN for intuitive editing of visual concepts for which\ntextual description is difficult. We also find that our method can help address\npersistent quality issues in Stable Diffusion XL including repair of object\ndeformations and fixing distorted hands. Our code, data, and trained sliders\nare available at https://sliders.baulab.info/\n","authors":["Rohit Gandikota","Joanna Materzynska","Tingrui Zhou","Antonio Torralba","David Bau"],"pdf_url":"https://arxiv.org/pdf/2311.12092v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12091v1","updated":"2023-11-20T18:49:58Z","published":"2023-11-20T18:49:58Z","title":"DAS: A Deformable Attention to Capture Salient Information in CNNs","summary":" Convolutional Neural Networks (CNNs) excel in local spatial pattern\nrecognition. For many vision tasks, such as object recognition and\nsegmentation, salient information is also present outside CNN's kernel\nboundaries. However, CNNs struggle in capturing such relevant information due\nto their confined receptive fields. Self-attention can improve a model's access\nto global information but increases computational overhead. We present a fast\nand simple fully convolutional method called DAS that helps focus attention on\nrelevant information. It uses deformable convolutions for the location of\npertinent image regions and separable convolutions for efficiency. DAS plugs\ninto existing CNNs and propagates relevant information using a gating\nmechanism. Compared to the O(n^2) computational complexity of transformer-style\nattention, DAS is O(n). Our claim is that DAS's ability to pay increased\nattention to relevant features results in performance improvements when added\nto popular CNNs for Image Classification and Object Detection. For example, DAS\nyields an improvement on Stanford Dogs (4.47%), ImageNet (1.91%), and COCO AP\n(3.3%) with base ResNet50 backbone. This outperforms other CNN attention\nmechanisms while using similar or less FLOPs. Our code will be publicly\navailable.\n","authors":["Farzad Salajegheh","Nader Asadi","Soroush Saryazdi","Sudhir Mudur"],"pdf_url":"https://arxiv.org/pdf/2311.12091v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2311.11824v1","updated":"2023-11-20T15:01:33Z","published":"2023-11-20T15:01:33Z","title":"Graph Variational Embedding Collaborative Filtering","summary":" The customization of recommended content to users holds significant\nimportance in enhancing user experiences across a wide spectrum of applications\nsuch as e-commerce, music, and shopping. Graph-based methods have achieved\nconsiderable performance by capturing user-item interactions. However, these\nmethods tend to utilize randomly constructed embeddings in the dataset used for\ntraining the recommender, which lacks any user preferences. 
Here, we propose\nthe concept of variational embeddings as a means of pre-training the\nrecommender system to improve the feature propagation through the layers of\ngraph convolutional networks (GCNs). The graph variational embedding\ncollaborative filtering (GVECF) is introduced as a novel framework to\nincorporate representations learned through a variational graph auto-encoder\nwhich are embedded into a GCN-based collaborative filtering. This approach\neffectively transforms latent high-order user-item interactions into more\ntrainable vectors, ultimately resulting in better performance in terms of\nrecall and normalized discounted cumulative gain(NDCG) metrics. The experiments\nconducted on benchmark datasets demonstrate that our proposed method achieves\nup to 13.78% improvement in the recall over the test data.\n","authors":["Narges Sadat Fazeli Dehkordi","Hadi Zare","Parham Moradi","Mahdi Jalili"],"pdf_url":"https://arxiv.org/pdf/2311.11824v1.pdf","comment":"Submitted for PAKDD2024 conference,12 pages"},{"id":"http://arxiv.org/abs/2311.11701v1","updated":"2023-11-20T12:08:32Z","published":"2023-11-20T12:08:32Z","title":"Control in Hybrid Chatbots","summary":" Customer data typically is held in database systems, which can be seen as\nrule-based knowledge base, whereas businesses increasingly want to benefit from\nthe capabilities of large, pre-trained language models.\n In this technical report, we describe a case study of how a commercial rule\nengine and an integrated neural chatbot may be integrated, and what level of\ncontrol that particular integration mode leads to. We also discuss alternative\nways (including past ways realized in other systems) how researchers strive to\nmaintain control and avoid what has recently been called model \"hallucination\".\n","authors":["Thomas Rüdel","Jochen L. Leidner"],"pdf_url":"https://arxiv.org/pdf/2311.11701v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.11691v1","updated":"2023-11-20T11:44:01Z","published":"2023-11-20T11:44:01Z","title":"Towards Robust Text Retrieval with Progressive Learning","summary":" Retrieval augmentation has become an effective solution to empower large\nlanguage models (LLMs) with external and verified knowledge sources from the\ndatabase, which overcomes the limitations and hallucinations of LLMs in\nhandling up-to-date and domain-specific information. However, existing\nembedding models for text retrieval usually have three non-negligible\nlimitations. First, the number and diversity of samples in a batch are too\nrestricted to supervise the modeling of textual nuances at scale. Second, the\nhigh proportional noise are detrimental to the semantic correctness and\nconsistency of embeddings. Third, the equal treatment to easy and difficult\nsamples would cause sub-optimum convergence of embeddings with poorer\ngeneralization. In this paper, we propose the PEG, a progressively learned\nembeddings for robust text retrieval. Specifically, we increase the training\nin-batch negative samples to 80,000, and for each query, we extracted five hard\nnegatives. Concurrently, we incorporated a progressive learning mechanism,\nenabling the model to dynamically modulate its attention to the samples\nthroughout the entire training process. Additionally, PEG is trained on more\nthan 100 million data, encompassing a wide range of domains (e.g., finance,\nmedicine, and tourism) and covering various tasks (e.g., question-answering,\nmachine reading comprehension, and similarity matching). 
Extensive experiments\nconducted on C-MTEB and DuReader demonstrate that PEG surpasses\nstate-of-the-art embeddings in retrieving true positives, highlighting its\nsignificant potential for applications in LLMs. Our model is publicly available\nat https://huggingface.co/TownsWu/PEG.\n","authors":["Tong Wu","Yulei Qin","Enwei Zhang","Zihan Xu","Yuting Gao","Ke Li","Xing Sun"],"pdf_url":"https://arxiv.org/pdf/2311.11691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08302v2","updated":"2023-11-20T23:52:58Z","published":"2023-11-14T16:46:10Z","title":"Inverse Learning with Extremely Sparse Feedback for Recommendation","summary":" Modern personalized recommendation services often rely on user feedback,\neither explicit or implicit, to improve the quality of services. Explicit\nfeedback refers to behaviors like ratings, while implicit feedback refers to\nbehaviors like user clicks. However, in the scenario of full-screen video\nviewing experiences like Tiktok and Reels, the click action is absent,\nresulting in unclear feedback from users, hence introducing noises in modeling\ntraining. Existing approaches on de-noising recommendation mainly focus on\npositive instances while ignoring the noise in a large amount of sampled\nnegative feedback. In this paper, we propose a meta-learning method to annotate\nthe unlabeled data from loss and gradient perspectives, which considers the\nnoises in both positive and negative instances. Specifically, we first propose\nan Inverse Dual Loss (IDL) to boost the true label learning and prevent the\nfalse label learning. Then we further propose an Inverse Gradient (IG) method\nto explore the correct updating gradient and adjust the updating based on\nmeta-learning. Finally, we conduct extensive experiments on both benchmark and\nindustrial datasets where our proposed method can significantly improve AUC by\n9.25% against state-of-the-art methods. Further analysis verifies the proposed\ninverse learning framework is model-agnostic and can improve a variety of\nrecommendation backbones. The source code, along with the best hyper-parameter\nsettings, is available at this link:\nhttps://github.com/Guanyu-Lin/InverseLearning.\n","authors":["Guanyu Lin","Chen Gao","Yu Zheng","Yinfeng Li","Jianxin Chang","Yanan Niu","Yang Song","Kun Gai","Zhiheng Li","Depeng Jin","Yong Li"],"pdf_url":"https://arxiv.org/pdf/2311.08302v2.pdf","comment":"WSDM 2024"},{"id":"http://arxiv.org/abs/2311.12159v1","updated":"2023-11-20T20:24:45Z","published":"2023-11-20T20:24:45Z","title":"Conditional Modeling Based Automatic Video Summarization","summary":" The aim of video summarization is to shorten videos automatically while\nretaining the key information necessary to convey the overall story. Video\nsummarization methods mainly rely on visual factors, such as visual\nconsecutiveness and diversity, which may not be sufficient to fully understand\nthe content of the video. There are other non-visual factors, such as\ninterestingness, representativeness, and storyline consistency that should also\nbe considered for generating high-quality video summaries. Current methods do\nnot adequately take into account these non-visual factors, resulting in\nsuboptimal performance. In this work, a new approach to video summarization is\nproposed based on insights gained from how humans create ground truth video\nsummaries. 
The method utilizes a conditional modeling perspective and\nintroduces multiple meaningful random variables and joint distributions to\ncharacterize the key components of video summarization. Helper distributions\nare employed to improve the training of the model. A conditional attention\nmodule is designed to mitigate potential performance degradation in the\npresence of multi-modal input. The proposed video summarization method\nincorporates the above innovative design choices that aim to narrow the gap\nbetween human-generated and machine-generated video summaries. Extensive\nexperiments show that the proposed approach outperforms existing methods and\nachieves state-of-the-art performance on commonly used video summarization\ndatasets.\n","authors":["Jia-Hong Huang","Chao-Han Huck Yang","Pin-Yu Chen","Min-Hung Chen","Marcel Worring"],"pdf_url":"https://arxiv.org/pdf/2311.12159v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n arXiv admin note: substantial text overlap with arXiv:2305.00455"},{"id":"http://arxiv.org/abs/2311.12136v1","updated":"2023-11-20T19:37:57Z","published":"2023-11-20T19:37:57Z","title":"Multi-view Graph Convolution for Participant Recommendation","summary":" Social networks have become essential for people's lives. The proliferation\nof web services further expands social networks at an unprecedented scale,\nleading to immeasurable commercial value for online platforms. Recently, the\ngroup buying (GB) business mode is prevalent and also becoming more popular in\nE-commerce. GB explicitly forms groups of users with similar interests to\nsecure better discounts from the merchants, often operating within social\nnetworks. It is a novel way to further unlock the commercial value by\nexplicitly utilizing the online social network in E-commerce. Participant\nrecommendation, a fundamental problem emerging together with GB, aims to find\nthe participants for a launched group buying process with an initiator and a\ntarget item to increase the GB success rate. This paper proposes Multi-View\nGraph Convolution for Participant Recommendation (MVPRec) to tackle this\nproblem. To differentiate the roles of users (Initiator/Participant) within the\nGB process, we explicitly reconstruct historical GB data into initiator-view\nand participant-view graphs. Together with the social graph, we obtain a\nmulti-view user representation with graph encoders. Then MVPRec fuses the GB\nand social representation with an attention module to obtain the user\nrepresentation and learns a matching score with the initiator's social friends\nvia a multi-head attention mechanism. Social friends with the Top-k matching\nscore are recommended for the corresponding GB process. Experiments on three\ndatasets justify the effectiveness of MVPRec in the emerging participant\nrecommendation problem.\n","authors":["Xiaolong Liu","Liangwei Yang","Chen Wang","Mingdai Yang","Zhiwei Liu","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2311.12136v1.pdf","comment":"10 pages, 5 figures, 2023 IEEE International Conference on Big Data"},{"id":"http://arxiv.org/abs/2311.14729v1","updated":"2023-11-20T18:03:08Z","published":"2023-11-20T18:03:08Z","title":"App for Resume-Based Job Matching with Speech Interviews and Grammar\n Analysis: A Review","summary":" Through the advancement in natural language processing (NLP), specifically in\nspeech recognition, fully automated complex systems functioning on voice input\nhave started proliferating in areas such as home automation. 
These systems have\nbeen termed Automatic Speech Recognition Systems (ASR). In this review paper,\nwe explore the feasibility of an end-to-end system providing speech and text\nbased natural language processing for job interview preparation as well as\nrecommendation of relevant job postings. We also explore existing\nrecommender-based systems and note their limitations. This literature review\nwould help us identify the approaches and limitations of the various similar\nuse-cases of NLP technology for our upcoming project.\n","authors":["Tanmay Kulkarni","Yuvraj Pardeshi","Yash Shah","Vaishnvi Sakat","Sapana Bhirud"],"pdf_url":"https://arxiv.org/pdf/2311.14729v1.pdf","comment":"4 pages, 2 figures, literature review"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2311.12028v1","updated":"2023-11-20T18:59:51Z","published":"2023-11-20T18:59:51Z","title":"Hourglass Tokenizer for Efficient Transformer-Based 3D Human Pose\n Estimation","summary":" Transformers have been successfully applied in the field of video-based 3D\nhuman pose estimation. However, the high computational costs of these video\npose transformers (VPTs) make them impractical on resource-constrained devices.\nIn this paper, we present a plug-and-play pruning-and-recovering framework,\ncalled Hourglass Tokenizer (HoT), for efficient transformer-based 3D human pose\nestimation from videos. Our HoT begins with pruning pose tokens of redundant\nframes and ends with recovering full-length tokens, resulting in a few pose\ntokens in the intermediate transformer blocks and thus improving the model\nefficiency. To effectively achieve this, we propose a token pruning cluster\n(TPC) that dynamically selects a few representative tokens with high semantic\ndiversity while eliminating the redundancy of video frames. In addition, we\ndevelop a token recovering attention (TRA) to restore the detailed\nspatio-temporal information based on the selected tokens, thereby expanding the\nnetwork output to the original full-length temporal resolution for fast\ninference. Extensive experiments on two benchmark datasets (i.e., Human3.6M and\nMPI-INF-3DHP) demonstrate that our method can achieve both high efficiency and\nestimation accuracy compared to the original VPT models. For instance, applying\nto MotionBERT and MixSTE on Human3.6M, our HoT can save nearly 50% FLOPs\nwithout sacrificing accuracy and nearly 40% FLOPs with only 0.2% accuracy drop,\nrespectively. Our source code will be open-sourced.\n","authors":["Wenhao Li","Mengyuan Liu","Hong Liu","Pichao Wang","Jialun Cai","Nicu Sebe"],"pdf_url":"https://arxiv.org/pdf/2311.12028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12023v1","updated":"2023-11-20T18:57:41Z","published":"2023-11-20T18:57:41Z","title":"LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient\n Language Model Finetuning","summary":" We propose a simple approach for memory-efficient adaptation of pretrained\nlanguage models. Our approach uses an iterative algorithm to decompose each\npretrained matrix into a high-precision low-rank component and a\nmemory-efficient quantized component. During finetuning, the quantized\ncomponent remains fixed and only the low-rank component is updated. We present\nan integer linear programming formulation of the quantization component which\nenables dynamic configuration of quantization parameters (e.g., bit-width,\nblock size) for each matrix given an overall target memory budget. 
We further\nexplore a data-aware version of the algorithm which uses an approximation of\nthe Fisher information matrix to weight the reconstruction objective during\nmatrix decomposition. Experiments on adapting RoBERTa and LLaMA-2 (7B and 70B)\ndemonstrate that our low-rank plus quantized matrix decomposition approach\n(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and moreover enables\nmore aggressive quantization. For example, on the OpenAssistant benchmark\nLQ-LoRA is able to learn a 2.5-bit LLaMA-2 model that is competitive with a\nmodel finetuned with 4-bit QLoRA. When finetuned on a language modeling\ncalibration dataset, LQ-LoRA can also be used for model compression; in this\nsetting our 2.75-bit LLaMA-2-70B model (which has 2.85 bits on average when\nincluding the low-rank components and requires 27GB of GPU memory) is\ncompetitive with the original model in full precision.\n","authors":["Han Guo","Philip Greengard","Eric P. Xing","Yoon Kim"],"pdf_url":"https://arxiv.org/pdf/2311.12023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15127v2","updated":"2023-11-20T18:51:29Z","published":"2023-10-23T17:31:55Z","title":"Open-Ended Instructable Embodied Agents with Memory-Augmented Large\n Language Models","summary":" Pre-trained and frozen large language models (LLMs) can effectively map\nsimple scene rearrangement instructions to programs over a robot's visuomotor\nfunctions through appropriate few-shot example prompting. To parse open-domain\nnatural language and adapt to a user's idiosyncratic procedures, not known\nduring prompt engineering time, fixed prompts fall short. In this paper, we\nintroduce HELPER, an embodied agent equipped with an external memory of\nlanguage-program pairs that parses free-form human-robot dialogue into action\nprograms through retrieval-augmented LLM prompting: relevant memories are\nretrieved based on the current dialogue, instruction, correction, or VLM\ndescription, and used as in-context prompt examples for LLM querying. The\nmemory is expanded during deployment to include pairs of user's language and\naction plans, to assist future inferences and personalize them to the user's\nlanguage and routines. HELPER sets a new state-of-the-art in the TEACh\nbenchmark in both Execution from Dialog History (EDH) and Trajectory from\nDialogue (TfD), with a 1.7x improvement over the previous state-of-the-art for\nTfD. Our models, code, and video results can be found in our project's website:\nhttps://helper-agent-llm.github.io.\n","authors":["Gabriel Sarch","Yue Wu","Michael J. Tarr","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2310.15127v2.pdf","comment":"Project page with code & videos: https://helper-agent-llm.github.io"},{"id":"http://arxiv.org/abs/2311.12004v1","updated":"2023-11-20T18:36:10Z","published":"2023-11-20T18:36:10Z","title":"Risk-averse Batch Active Inverse Reward Design","summary":" Designing a perfect reward function that depicts all the aspects of the\nintended behavior is almost impossible, especially generalizing it outside of\nthe training environments. Active Inverse Reward Design (AIRD) proposed the use\nof a series of queries, comparing possible reward functions in a single\ntraining environment. This allows the human to give information to the agent\nabout suboptimal behaviors, in order to compute a probability distribution over\nthe intended reward function. 
However, it ignores the possibility of unknown\nfeatures appearing in real-world environments, and the safety measures needed\nuntil the agent completely learns the reward function. I improved this method\nand created Risk-averse Batch Active Inverse Reward Design (RBAIRD), which\nconstructs batches, sets of environments the agent encounters when being used\nin the real world, processes them sequentially, and, for a predetermined number\nof iterations, asks queries that the human needs to answer for each environment\nof the batch. After this process is completed in one batch, the probabilities\nhave been improved and are transferred to the next batch. This makes it capable\nof adapting to real-world scenarios and learning how to treat unknown features\nit encounters for the first time. I also integrated a risk-averse planner,\nsimilar to that of Inverse Reward Design (IRD), which samples a set of reward\nfunctions from the probability distribution and computes a trajectory that\ntakes the most certain rewards possible. This ensures safety while the agent is\nstill learning the reward function, and enables the use of this approach in\nsituations where cautiousness is vital. RBAIRD outperformed the previous\napproaches in terms of efficiency, accuracy, and action certainty, demonstrated\nquick adaptability to new, unknown features, and can be more widely used for\nthe alignment of crucial, powerful AI models.\n","authors":["Panagiotis Liampas"],"pdf_url":"https://arxiv.org/pdf/2311.12004v1.pdf","comment":"14 pages, 12 figures"},{"id":"http://arxiv.org/abs/2311.11995v1","updated":"2023-11-20T18:26:01Z","published":"2023-11-20T18:26:01Z","title":"BrainWash: A Poisoning Attack to Forget in Continual Learning","summary":" Continual learning has gained substantial attention within the deep learning\ncommunity, offering promising solutions to the challenging problem of\nsequential learning. Yet, a largely unexplored facet of this paradigm is its\nsusceptibility to adversarial attacks, especially with the aim of inducing\nforgetting. In this paper, we introduce \"BrainWash,\" a novel data poisoning\nmethod tailored to impose forgetting on a continual learner. By adding the\nBrainWash noise to a variety of baselines, we demonstrate how a trained\ncontinual learner can be induced to forget its previously learned tasks\ncatastrophically, even when using these continual learning baselines. An\nimportant feature of our approach is that the attacker requires no access to\nprevious tasks' data and is armed merely with the model's current parameters\nand the data belonging to the most recent task. Our extensive experiments\nhighlight the efficacy of BrainWash, showcasing degradation in performance\nacross various regularization-based continual learning methods.\n","authors":["Ali Abbasi","Parsa Nooralinejad","Hamed Pirsiavash","Soheil Kolouri"],"pdf_url":"https://arxiv.org/pdf/2311.11995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11992v1","updated":"2023-11-20T18:23:41Z","published":"2023-11-20T18:23:41Z","title":"Exploring Lip Segmentation Techniques in Computer Vision: A Comparative\n Analysis","summary":" Lip segmentation is crucial in computer vision, especially for lip reading.\nDespite extensive face segmentation research, lip segmentation has received\nlimited attention. The aim of this study is to compare state-of-the-art lip\nsegmentation models using a standardized setting and a publicly available\ndataset. 
Five techniques, namely EHANet, Mask2Former, BiSeNet V2, PIDNet, and\nSTDC1, are qualitatively selected based on their reported performance,\ninference time, code availability, recency, and popularity. The CelebAMask-HQ\ndataset, comprising manually annotated face images, is used to fairly assess\nthe lip segmentation performance of the selected models. Inference experiments\nare conducted on a Raspberry Pi4 to emulate limited computational resources.\nThe results show that Mask2Former and EHANet have the best performances in\nterms of mIoU score. BiSeNet V2 demonstrates competitive performance, while\nPIDNet excels in recall but has lower precision. Most models present inference\ntimes ranging from 1000 to around 3000 milliseconds on a Raspberry Pi4, with\nPIDNet having the lowest mean inference time. This study provides a\ncomprehensive evaluation of lip segmentation models, highlighting their\nperformance and inference times. The findings contribute to the development of\nlightweight techniques and establish benchmarks for future advances in lip\nsegmentation, especially in IoT and edge computing scenarios.\n","authors":["Pietro B. S. Masur","Francisco Braulio Oliveira","Lucas Moreira Medino","Emanuel Huber","Milene Haraguchi Padilha","Cassio de Alcantara","Renata Sellaro"],"pdf_url":"https://arxiv.org/pdf/2311.11992v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11990v1","updated":"2023-11-20T18:21:53Z","published":"2023-11-20T18:21:53Z","title":"Machine-Learned Atomic Cluster Expansion Potentials for Fast and\n Quantum-Accurate Thermal Simulations of Wurtzite AlN","summary":" Using the atomic cluster expansion (ACE) framework, we develop a machine\nlearning interatomic potential for fast and accurate modelling of the phonon\ntransport properties of wurtzite aluminum nitride. The predictive power of the\nACE potential against density functional theory (DFT) is demonstrated across a\nbroad range of properties of w-AlN, including ground-state lattice parameters,\nspecific heat capacity, coefficients of thermal expansion, bulk modulus, and\nharmonic phonon dispersions. Validation of lattice thermal conductivity is\nfurther carried out by comparing the ACE-predicted values to the DFT\ncalculations and experiments, exhibiting the overall capability of our ACE\npotential in sufficiently describing anharmonic phonon interactions. As a\npractical application, we perform a lattice dynamics analysis using the\npotential to unravel the effects of biaxial strains on thermal conductivity and\nphonon properties of w-AlN, which is identified as a significant tuning factor\nfor near-junction thermal design of w-AlN-based electronics.\n","authors":["Guang Yang","Yuan-Bin Liu","Lei Yang","Bing-Yang Cao"],"pdf_url":"https://arxiv.org/pdf/2311.11990v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11980v1","updated":"2023-11-20T18:14:53Z","published":"2023-11-20T18:14:53Z","title":"Leveraging Previous Facial Action Units Knowledge for Emotion\n Recognition on Faces","summary":" People naturally understand emotions; thus, permitting a machine to do the\nsame could open new paths for human-computer interaction. Facial expressions\ncan be very useful for emotion recognition techniques, as these are the biggest\ntransmitters of non-verbal cues capable of being correlated with emotions.\nSeveral techniques are based on Convolutional Neural Networks (CNNs) to extract\ninformation in a machine learning process.
However, simple CNNs are not always\nsufficient to locate points of interest on the face that can be correlated with\nemotions. In this work, we intend to expand the capacity of emotion recognition\ntechniques by proposing the usage of Facial Action Units (AUs) recognition\ntechniques to recognize emotions. This recognition will be based on the Facial\nAction Coding System (FACS) and computed by a machine learning system. In\nparticular, our method expands over EmotiRAM, an approach for multi-cue emotion\nrecognition, in which we improve over their facial encoding module.\n","authors":["Pietro B. S. Masur","Willams Costa","Lucas S. Figueredo","Veronica Teichrieb"],"pdf_url":"https://arxiv.org/pdf/2311.11980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11974v1","updated":"2023-11-20T18:02:20Z","published":"2023-11-20T18:02:20Z","title":"Evaluating Supervision Levels Trade-Offs for Infrared-Based People\n Counting","summary":" Object detection models are commonly used for people counting (and\nlocalization) in many applications but require a dataset with costly bounding\nbox annotations for training. Given the importance of privacy in people\ncounting, these models rely more and more on infrared images, making the task\neven harder. In this paper, we explore how weaker levels of supervision can\naffect the performance of deep person counting architectures for image\nclassification and point-level localization. Our experiments indicate that\ncounting people using a CNN Image-Level model achieves competitive results with\nYOLO detectors and point-level models, yet provides a higher frame rate and a\nsimilar amount of model parameters.\n","authors":["David Latortue","Moetez Kdayem","Fidel A Guerrero Peña","Eric Granger","Marco Pedersoli"],"pdf_url":"https://arxiv.org/pdf/2311.11974v1.pdf","comment":"Accepted in IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV) 2024"},{"id":"http://arxiv.org/abs/2311.11973v1","updated":"2023-11-20T18:01:29Z","published":"2023-11-20T18:01:29Z","title":"Adaptive Training Distributions with Scalable Online Bilevel\n Optimization","summary":" Large neural networks pretrained on web-scale corpora are central to modern\nmachine learning. In this paradigm, the distribution of the large,\nheterogeneous pretraining data rarely matches that of the application domain.\nThis work considers modifying the pretraining distribution in the case where\none has a small sample of data reflecting the targeted test conditions. We\npropose an algorithm motivated by a recent formulation of this setting as an\nonline, bilevel optimization problem. With scalability in mind, our algorithm\nprioritizes computing gradients at training points which are likely to most\nimprove the loss on the targeted distribution. Empirically, we show that in\nsome cases this approach is beneficial over existing strategies from the domain\nadaptation literature but may not succeed in other cases. 
We propose a simple\ntest to evaluate when our approach can be expected to work well and point\ntowards further research to address current limitations.\n","authors":["David Grangier","Pierre Ablin","Awni Hannun"],"pdf_url":"https://arxiv.org/pdf/2311.11973v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04292v2","updated":"2023-11-20T17:45:37Z","published":"2023-03-07T23:54:35Z","title":"ERUDITE: Human-in-the-Loop IoT for an Adaptive Personalized Learning\n System","summary":" Thanks to the rapid growth in wearable technologies and recent advancement in\nmachine learning and signal processing, monitoring complex human contexts\nbecomes feasible, paving the way to develop human-in-the-loop IoT systems that\nnaturally evolve to adapt to the human and environment state autonomously.\nNevertheless, a central challenge in designing many of these IoT systems arises\nfrom the requirement to infer the human mental state, such as intention,\nstress, cognition load, or learning ability. While different human contexts can\nbe inferred from the fusion of different sensor modalities that can correlate\nto a particular mental state, the human brain provides a richer sensor modality\nthat gives us more insights into the required human context. This paper\nproposes ERUDITE, a human-in-the-loop IoT system for the learning environment\nthat exploits recent wearable neurotechnology to decode brain signals. Through\ninsights from concept learning theory, ERUDITE can infer the human state of\nlearning and understand when human learning increases or declines. By\nquantifying human learning as an input sensory signal, ERUDITE can provide\nadequate personalized feedback to humans in a learning environment to enhance\ntheir learning experience. ERUDITE is evaluated across $15$ participants and\nshowed that by using the brain signals as a sensor modality to infer the human\nlearning state and providing personalized adaptation to the learning\nenvironment, the participants' learning performance increased on average by\n$26\\%$. Furthermore, we showed that ERUDITE can be deployed on an edge-based\nprototype to evaluate its practicality and scalability.\n","authors":["Mojtaba Taherisadr","Mohammad Abdullah Al Faruque","Salma Elmalaki"],"pdf_url":"https://arxiv.org/pdf/2303.04292v2.pdf","comment":"It is under review in the IEEE IoT journal"},{"id":"http://arxiv.org/abs/2311.11965v1","updated":"2023-11-20T17:44:40Z","published":"2023-11-20T17:44:40Z","title":"Provably Efficient CVaR RL in Low-rank MDPs","summary":" We study risk-sensitive Reinforcement Learning (RL), where we aim to maximize\nthe Conditional Value at Risk (CVaR) with a fixed risk tolerance $\\tau$. Prior\ntheoretical work studying risk-sensitive RL focuses on the tabular Markov\nDecision Processes (MDPs) setting. To extend CVaR RL to settings where state\nspace is large, function approximation must be deployed. We study CVaR RL in\nlow-rank MDPs with nonlinear function approximation. Low-rank MDPs assume the\nunderlying transition kernel admits a low-rank decomposition, but unlike prior\nlinear models, low-rank MDPs do not assume the feature or state-action\nrepresentation is known. We propose a novel Upper Confidence Bound (UCB)\nbonus-driven algorithm to carefully balance the interplay between exploration,\nexploitation, and representation learning in CVaR RL. 
We prove that our\nalgorithm achieves a sample complexity of $\\tilde{O}\\left(\\frac{H^7 A^2\nd^4}{\\tau^2 \\epsilon^2}\\right)$ to yield an $\\epsilon$-optimal CVaR, where $H$\nis the length of each episode, $A$ is the capacity of action space, and $d$ is\nthe dimension of representations. Computational-wise, we design a novel\ndiscretized Least-Squares Value Iteration (LSVI) algorithm for the CVaR\nobjective as the planning oracle and show that we can find the near-optimal\npolicy in a polynomial running time with a Maximum Likelihood Estimation\noracle. To our knowledge, this is the first provably efficient CVaR RL\nalgorithm in low-rank MDPs.\n","authors":["Yulai Zhao","Wenhao Zhan","Xiaoyan Hu","Ho-fung Leung","Farzan Farnia","Wen Sun","Jason D. Lee"],"pdf_url":"https://arxiv.org/pdf/2311.11965v1.pdf","comment":"The first three authors contribute equally and are ordered randomly"},{"id":"http://arxiv.org/abs/2311.11963v1","updated":"2023-11-20T17:43:09Z","published":"2023-11-20T17:43:09Z","title":"What Can AutoML Do For Continual Learning?","summary":" This position paper outlines the potential of AutoML for incremental\n(continual) learning to encourage more research in this direction. Incremental\nlearning involves incorporating new data from a stream of tasks and\ndistributions to learn enhanced deep representations and adapt better to new\ntasks. However, a significant limitation of incremental learners is that most\ncurrent techniques freeze the backbone architecture, hyperparameters, and the\norder & structure of the learning tasks throughout the learning and adaptation\nprocess. We strongly believe that AutoML offers promising solutions to address\nthese limitations, enabling incremental learning to adapt to more diverse\nreal-world tasks. Therefore, instead of directly proposing a new method, this\npaper takes a step back by posing the question: \"What can AutoML do for\nincremental learning?\" We outline three key areas of research that can\ncontribute to making incremental learners more dynamic, highlighting concrete\nopportunities to apply AutoML methods in novel ways as well as entirely new\nchallenges for AutoML research.\n","authors":["Mert Kilickaya","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2311.11963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.02249v2","updated":"2023-11-20T17:40:06Z","published":"2022-07-05T18:23:20Z","title":"Learning Task Embeddings for Teamwork Adaptation in Multi-Agent\n Reinforcement Learning","summary":" Successful deployment of multi-agent reinforcement learning often requires\nagents to adapt their behaviour. In this work, we discuss the problem of\nteamwork adaptation in which a team of agents needs to adapt their policies to\nsolve novel tasks with limited fine-tuning. Motivated by the intuition that\nagents need to be able to identify and distinguish tasks in order to adapt\ntheir behaviour to the current task, we propose to learn multi-agent task\nembeddings (MATE). These task embeddings are trained using an encoder-decoder\narchitecture optimised for reconstruction of the transition and reward\nfunctions which uniquely identify tasks. We show that a team of agents is able\nto adapt to novel tasks when provided with task embeddings. We propose three\nMATE training paradigms: independent MATE, centralised MATE, and mixed MATE\nwhich vary in the information used for the task encoding. 
We show that the\nembeddings learned by MATE identify tasks and provide useful information which\nagents leverage during adaptation to novel tasks.\n","authors":["Lukas Schäfer","Filippos Christianos","Amos Storkey","Stefano V. Albrecht"],"pdf_url":"https://arxiv.org/pdf/2207.02249v2.pdf","comment":"To be presented at the Seventh Workshop on Generalization in Planning\n at the NeurIPS 2023 conference"},{"id":"http://arxiv.org/abs/2311.11961v1","updated":"2023-11-20T17:38:35Z","published":"2023-11-20T17:38:35Z","title":"NNG-Mix: Improving Semi-supervised Anomaly Detection with Pseudo-anomaly\n Generation","summary":" Anomaly detection (AD) is essential in identifying rare and often critical\nevents in complex systems, finding applications in fields such as network\nintrusion detection, financial fraud detection, and fault detection in\ninfrastructure and industrial systems. While AD is typically treated as an\nunsupervised learning task due to the high cost of label annotation, it is more\npractical to assume access to a small set of labeled anomaly samples from\ndomain experts, as is the case for semi-supervised anomaly detection.\nSemi-supervised and supervised approaches can leverage such labeled data,\nresulting in improved performance. In this paper, rather than proposing a new\nsemi-supervised or supervised approach for AD, we introduce a novel algorithm\nfor generating additional pseudo-anomalies on the basis of the limited labeled\nanomalies and a large volume of unlabeled data. This serves as an augmentation\nto facilitate the detection of new anomalies. Our proposed algorithm, named\nNearest Neighbor Gaussian Mixup (NNG-Mix), efficiently integrates information\nfrom both labeled and unlabeled data to generate pseudo-anomalies. We compare\nthe performance of this novel algorithm with commonly applied augmentation\ntechniques, such as Mixup and Cutout. We evaluate NNG-Mix by training various\nexisting semi-supervised and supervised anomaly detection algorithms on the\noriginal training data along with the generated pseudo-anomalies. Through\nextensive experiments on 57 benchmark datasets in ADBench, reflecting different\ndata types, we demonstrate that NNG-Mix outperforms other data augmentation\nmethods. It yields significant performance improvements compared to the\nbaselines trained exclusively on the original training data. Notably, NNG-Mix\nyields up to 16.4%, 8.8%, and 8.0% improvements on Classical, CV, and NLP\ndatasets in ADBench. Our source code will be available at\nhttps://github.com/donghao51/NNG-Mix.\n","authors":["Hao Dong","Gaëtan Frusque","Yue Zhao","Eleni Chatzi","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2311.11961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11959v1","updated":"2023-11-20T17:35:44Z","published":"2023-11-20T17:35:44Z","title":"Correlated Attention in Transformers for Multivariate Time Series","summary":" Multivariate time series (MTS) analysis prevails in real-world applications\nsuch as finance, climate science and healthcare. The various self-attention\nmechanisms, the backbone of the state-of-the-art Transformer-based models,\nefficiently discover the temporal dependencies, yet cannot well capture the\nintricate cross-correlation between different features of MTS data, which\ninherently stems from complex dynamical systems in practice. 
To this end, we\npropose a novel correlated attention mechanism, which not only efficiently\ncaptures feature-wise dependencies, but can also be seamlessly integrated\nwithin the encoder blocks of existing well-known Transformers to gain\nefficiency improvement. In particular, correlated attention operates across\nfeature channels to compute cross-covariance matrices between queries and keys\nwith different lag values, and selectively aggregate representations at the\nsub-series level. This architecture facilitates automated discovery and\nrepresentation learning of not only instantaneous but also lagged\ncross-correlations, while inherently capturing time series auto-correlation.\nWhen combined with prevalent Transformer baselines, correlated attention\nmechanism constitutes a better alternative for encoder-only architectures,\nwhich are suitable for a wide range of tasks including imputation, anomaly\ndetection and classification. Extensive experiments on the aforementioned tasks\nconsistently underscore the advantages of correlated attention mechanism in\nenhancing base Transformer models, and demonstrate our state-of-the-art results\nin imputation, anomaly detection and classification.\n","authors":["Quang Minh Nguyen","Lam M. Nguyen","Subhro Das"],"pdf_url":"https://arxiv.org/pdf/2311.11959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17113v2","updated":"2023-11-20T17:31:20Z","published":"2023-09-29T10:12:30Z","title":"Meta-Path Learning for Multi-relational Graph Neural Networks","summary":" Existing multi-relational graph neural networks use one of two strategies for\nidentifying informative relations: either they reduce this problem to low-level\nweight learning, or they rely on handcrafted chains of relational dependencies,\ncalled meta-paths. However, the former approach faces challenges in the\npresence of many relations (e.g., knowledge graphs), while the latter requires\nsubstantial domain expertise to identify relevant meta-paths. In this work we\npropose a novel approach to learn meta-paths and meta-path GNNs that are highly\naccurate based on a small number of informative meta-paths. Key element of our\napproach is a scoring function for measuring the potential informativeness of a\nrelation in the incremental construction of the meta-path. Our experimental\nevaluation shows that the approach manages to correctly identify relevant\nmeta-paths even with a large number of relations, and substantially outperforms\nexisting multi-relational GNNs on synthetic and real-world experiments.\n","authors":["Francesco Ferrini","Antonio Longa","Andrea Passerini","Manfred Jaeger"],"pdf_url":"https://arxiv.org/pdf/2309.17113v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11934v1","updated":"2023-11-20T17:18:21Z","published":"2023-11-20T17:18:21Z","title":"Estimation of entropy-regularized optimal transport maps between\n non-compactly supported measures","summary":" This paper addresses the problem of estimating entropy-regularized optimal\ntransport (EOT) maps with squared-Euclidean cost between source and target\nmeasures that are subGaussian. In the case that the target measure is compactly\nsupported or strongly log-concave, we show that for a recently proposed\nin-sample estimator, the expected squared $L^2$-error decays at least as fast\nas $O(n^{-1/3})$ where $n$ is the sample size. 
For the general subGaussian case\nwe show that the expected $L^1$-error decays at least as fast as $O(n^{-1/6})$,\nand in both cases we have polynomial dependence on the regularization\nparameter. While these results are suboptimal compared to known results in the\ncase of compactness of both the source and target measures (squared $L^2$-error\nconverging at a rate $O(n^{-1})$) and for when the source is subGaussian while\nthe target is compactly supported (squared $L^2$-error converging at a rate\n$O(n^{-1/2})$), their importance lies in eliminating the compact support\nrequirements. The proof technique makes use of a bias-variance decomposition\nwhere the variance is controlled using standard concentration of measure\nresults and the bias is handled by T1-transport inequalities along with sample\ncomplexity results in estimation of EOT cost under subGaussian assumptions. Our\nexperimental results point to a looseness in controlling the variance terms and\nwe conclude by posing several open problems.\n","authors":["Matthew Werenski","James M. Murphy","Shuchin Aeron"],"pdf_url":"https://arxiv.org/pdf/2311.11934v1.pdf","comment":"30 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.11932v1","updated":"2023-11-20T17:17:29Z","published":"2023-11-20T17:17:29Z","title":"Ovarian Cancer Data Analysis using Deep Learning: A Systematic Review\n from the Perspectives of Key Features of Data Analysis and AI Assurance","summary":" Background and objectives: Machine or Deep\nLearning (ML/DL)-based autonomous data analysis tools can assist clinicians and\ncancer researchers in discovering patterns and relationships from complex data\nsets. Many DL-based analyses on ovarian cancer (OC) data have recently been\npublished. These analyses are highly diverse in various aspects of cancer\n(e.g., subdomain(s) and cancer type they address) and data analysis features.\nHowever, a comprehensive understanding of these analyses in terms of these\nfeatures and AI assurance (AIA) is currently lacking. This systematic review\naims to fill this gap by examining the existing literature and identifying\nimportant aspects of OC data analysis using DL, explicitly focusing on the key\nfeatures and AI assurance perspectives. Methods: The PRISMA framework was used\nto conduct comprehensive searches in three journal databases. Only studies\npublished between 2015 and 2023 in peer-reviewed journals were included in the\nanalysis. Results: In the review, a total of 96 DL-driven analyses were\nexamined. The findings reveal several important insights regarding DL-driven\novarian cancer data analysis: - Most studies, 71% (68 out of 96), focused on\ndetection and diagnosis, while no study addressed the prediction and prevention\nof OC. - The analyses were predominantly based on samples from a non-diverse\npopulation (75% (72/96 studies)), limited to a geographic location or country.\n- Only a small proportion of studies (only 33% (32/96)) performed integrated\nanalyses, most of which used homogeneous data (clinical or omics). - Notably, a\nmere 8.3% (8/96) of the studies validated their models using external and\ndiverse data sets, highlighting the need for enhanced model validation, and -\nThe inclusion of AIA in cancer data analysis is in a very early stage; only\n2.1% (2/96) explicitly addressed AIA through explainability.\n","authors":["Muta Tah Hira","Mohammad A.
Razzaque","Mosharraf Sarker"],"pdf_url":"https://arxiv.org/pdf/2311.11932v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08176v4","updated":"2023-11-20T17:10:22Z","published":"2023-10-12T10:01:39Z","title":"Infinite Width Graph Neural Networks for Node Regression/ Classification","summary":" This work analyzes Graph Neural Networks, a generalization of Fully-Connected\nDeep Neural Nets on Graph structured data, when their width, that is the number\nof nodes in each fullyconnected layer is increasing to infinity. Infinite Width\nNeural Networks are connecting Deep Learning to Gaussian Processes and Kernels,\nboth Machine Learning Frameworks with long traditions and extensive theoretical\nfoundations. Gaussian Processes and Kernels have much less hyperparameters then\nNeural Networks and can be used for uncertainty estimation, making them more\nuser friendly for applications. This works extends the increasing amount of\nresearch connecting Gaussian Processes and Kernels to Neural Networks. The\nKernel and Gaussian Process closed forms are derived for a variety of\narchitectures, namely the standard Graph Neural Network, the Graph Neural\nNetwork with Skip-Concatenate Connections and the Graph Attention Neural\nNetwork. All architectures are evaluated on a variety of datasets on the task\nof transductive Node Regression and Classification. Additionally, a Spectral\nSparsification method known as Effective Resistance is used to improve runtime\nand memory requirements. Extending the setting to inductive graph learning\ntasks (Graph Regression/ Classification) is straightforward and is briefly\ndiscussed in 3.5.\n","authors":["Yunus Cobanoglu"],"pdf_url":"https://arxiv.org/pdf/2310.08176v4.pdf","comment":"49 Pages, 2 Figures (with subfigures), multiple tables, v2: made\n table of contents fit to one page and added derivatives on GAT*NTK and GAT*GP\n in A.4, v3: shorten parts of introduction and fixed typos, added numberings\n to equations and discussion section, v4: fix two missing citations on page 10"},{"id":"http://arxiv.org/abs/2305.17010v3","updated":"2023-11-20T16:57:12Z","published":"2023-05-26T15:13:09Z","title":"Let the Flows Tell: Solving Graph Combinatorial Optimization Problems\n with GFlowNets","summary":" Combinatorial optimization (CO) problems are often NP-hard and thus out of\nreach for exact algorithms, making them a tempting domain to apply machine\nlearning methods. The highly structured constraints in these problems can\nhinder either optimization or sampling directly in the solution space. On the\nother hand, GFlowNets have recently emerged as a powerful machinery to\nefficiently sample from composite unnormalized densities sequentially and have\nthe potential to amortize such solution-searching processes in CO, as well as\ngenerate diverse solution candidates. In this paper, we design Markov decision\nprocesses (MDPs) for different combinatorial problems and propose to train\nconditional GFlowNets to sample from the solution space. Efficient training\ntechniques are also developed to benefit long-range credit assignment. Through\nextensive experiments on a variety of different CO tasks with synthetic and\nrealistic data, we demonstrate that GFlowNet policies can efficiently find\nhigh-quality solutions. 
Our implementation is open-sourced at\nhttps://github.com/zdhNarsil/GFlowNet-CombOpt.\n","authors":["Dinghuai Zhang","Hanjun Dai","Nikolay Malkin","Aaron Courville","Yoshua Bengio","Ling Pan"],"pdf_url":"https://arxiv.org/pdf/2305.17010v3.pdf","comment":"Accepted by NeurIPS 2023 as spotlight"},{"id":"http://arxiv.org/abs/2311.11913v1","updated":"2023-11-20T16:44:18Z","published":"2023-11-20T16:44:18Z","title":"Deep Calibration of Market Simulations using Neural Density Estimators\n and Embedding Networks","summary":" The ability to construct a realistic simulator of financial exchanges,\nincluding reproducing the dynamics of the limit order book, can give insight\ninto many counterfactual scenarios, such as a flash crash, a margin call, or\nchanges in macroeconomic outlook. In recent years, agent-based models have been\ndeveloped that reproduce many features of an exchange, as summarised by a set\nof stylised facts and statistics. However, the ability to calibrate simulators\nto a specific period of trading remains an open challenge. In this work, we\ndevelop a novel approach to the calibration of market simulators by leveraging\nrecent advances in deep learning, specifically using neural density estimators\nand embedding networks. We demonstrate that our approach is able to correctly\nidentify high probability parameter sets, both when applied to synthetic and\nhistorical data, and without reliance on manually selected or weighted\nensembles of stylised facts.\n","authors":["Namid R. Stillman","Rory Baggott","Justin Lyon","Jianfei Zhang","Dingqiu Zhu","Tao Chen","Perukrishnen Vytelingum"],"pdf_url":"https://arxiv.org/pdf/2311.11913v1.pdf","comment":"4th ACM International Conference on AI in Finance (ICAIF 2023)"},{"id":"http://arxiv.org/abs/2311.11911v1","updated":"2023-11-20T16:41:54Z","published":"2023-11-20T16:41:54Z","title":"Certification of Distributional Individual Fairness","summary":" Providing formal guarantees of algorithmic fairness is of paramount\nimportance to socially responsible deployment of machine learning algorithms.\nIn this work, we study formal guarantees, i.e., certificates, for individual\nfairness (IF) of neural networks. We start by introducing a novel convex\napproximation of IF constraints that exponentially decreases the computational\ncost of providing formal guarantees of local individual fairness. We highlight\nthat prior methods are constrained by their focus on global IF certification\nand can therefore only scale to models with a few dozen hidden neurons, thus\nlimiting their practical impact. We propose to certify distributional\nindividual fairness which ensures that for a given empirical distribution and\nall distributions within a $\\gamma$-Wasserstein ball, the neural network has\nguaranteed individually fair predictions. Leveraging developments in\nquasi-convex optimization, we provide novel and efficient certified bounds on\ndistributional individual fairness and show that our method allows us to\ncertify and regularize neural networks that are several orders of magnitude\nlarger than those considered by prior works. 
Moreover, we study real-world\ndistribution shifts and find our bounds to be a scalable, practical, and sound\nsource of IF guarantees.\n","authors":["Matthew Wicker","Vihari Piratia","Adrian Weller"],"pdf_url":"https://arxiv.org/pdf/2311.11911v1.pdf","comment":"21 Pages, Neural Information Processing Systems 2023"},{"id":"http://arxiv.org/abs/2311.11908v1","updated":"2023-11-20T16:40:29Z","published":"2023-11-20T16:40:29Z","title":"Continual Learning: Applications and the Road Forward","summary":" Continual learning is a sub-field of machine learning, which aims to allow\nmachine learning models to continuously learn on new data, by accumulating\nknowledge without forgetting what was learned in the past. In this work, we\ntake a step back, and ask: \"Why should one care about continual learning in the\nfirst place?\". We set the stage by surveying recent continual learning papers\npublished at three major machine learning conferences, and show that\nmemory-constrained settings dominate the field. Then, we discuss five open\nproblems in machine learning, and even though they seem unrelated to continual\nlearning at first sight, we show that continual learning will inevitably be\npart of their solution. These problems are model-editing, personalization,\non-device learning, faster (re-)training and reinforcement learning. Finally,\nby comparing the desiderata from these unsolved problems and the current\nassumptions in continual learning, we highlight and discuss four future\ndirections for continual learning research. We hope that this work offers an\ninteresting perspective on the future of continual learning, while displaying\nits potential value and the paths we have to pursue in order to make it\nsuccessful. This work is the result of the many discussions the authors had at\nthe Dagstuhl seminar on Deep Continual Learning, in March 2023.\n","authors":["Eli Verwimp","Shai Ben-David","Matthias Bethge","Andrea Cossu","Alexander Gepperth","Tyler L. Hayes","Eyke Hüllermeier","Christopher Kanan","Dhireesha Kudithipudi","Christoph H. Lampert","Martin Mundt","Razvan Pascanu","Adrian Popescu","Andreas S. Tolias","Joost van de Weijer","Bing Liu","Vincenzo Lomonaco","Tinne Tuytelaars","Gido M. van de Ven"],"pdf_url":"https://arxiv.org/pdf/2311.11908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11905v1","updated":"2023-11-20T16:38:45Z","published":"2023-11-20T16:38:45Z","title":"Real-Time Surface-to-Air Missile Engagement Zone Prediction Using\n Simulation and Machine Learning","summary":" Surface-to-Air Missiles (SAMs) are crucial in modern air defense systems. A\ncritical aspect of their effectiveness is the Engagement Zone (EZ), the spatial\nregion within which a SAM can effectively engage and neutralize a target.\nNotably, the EZ is intrinsically related to the missile's maximum range; it\ndefines the furthest distance at which a missile can intercept a target. The\naccurate computation of this EZ is essential but challenging due to the dynamic\nand complex factors involved, which often lead to high computational costs and\nextended processing times when using conventional simulation methods. In light\nof these challenges, our study investigates the potential of machine learning\ntechniques, proposing an approach that integrates machine learning with a\ncustom-designed simulation tool to train supervised algorithms. We leverage a\ncomprehensive dataset of pre-computed SAM EZ simulations, enabling our model to\naccurately predict the SAM EZ for new input parameters. 
It accelerates SAM EZ\nsimulations, enhances air defense strategic planning, and provides real-time\ninsights, improving SAM system performance. The study also includes a\ncomparative analysis of machine learning algorithms, illuminating their\ncapabilities and performance metrics and suggesting areas for future research,\nhighlighting the transformative potential of machine learning in SAM EZ\nsimulations.\n","authors":["Joao P. A. Dantas","Diego Geraldo","Felipe L. L. Medeiros","Marcos R. O. A. Maximo","Takashi Yoneyama"],"pdf_url":"https://arxiv.org/pdf/2311.11905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11904v1","updated":"2023-11-20T16:37:45Z","published":"2023-11-20T16:37:45Z","title":"LLMs as Visual Explainers: Advancing Image Classification with Evolving\n Visual Descriptions","summary":" Vision-language models (VLMs) offer a promising paradigm for image\nclassification by comparing the similarity between images and class embeddings.\nA critical challenge lies in crafting precise textual representations for class\nnames. While previous studies have leveraged recent advancements in large\nlanguage models (LLMs) to enhance these descriptors, their outputs often suffer\nfrom ambiguity and inaccuracy. We identify two primary causes: 1) The prevalent\nreliance on textual interactions with LLMs, leading to a mismatch between the\ngenerated text and the visual content in VLMs' latent space - a phenomenon we\nterm the \"explain without seeing\" dilemma. 2) The oversight of the inter-class\nrelationships, resulting in descriptors that fail to differentiate similar\nclasses effectively. To address these issues, we propose a novel image\nclassification framework combining VLMs with LLMs, named Iterative Optimization\nwith Visual Feedback. In particular, our method develops an LLM-based agent,\nemploying an evolutionary optimization strategy to refine class descriptors.\nCrucially, we incorporate visual feedback from VLM classification metrics,\nthereby guiding the optimization process with concrete visual data. Our method\nleads to improving accuracy on a wide range of image classification benchmarks,\nwith 3.47\\% average gains over state-of-the-art methods. We also highlight the\nresulting descriptions serve as explainable and robust features that can\nconsistently improve the performance across various backbone models.\n","authors":["Songhao Han","Le Zhuo","Yue Liao","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2311.11904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11900v1","updated":"2023-11-20T16:34:48Z","published":"2023-11-20T16:34:48Z","title":"Measuring and Mitigating Biases in Motor Insurance Pricing","summary":" The non-life insurance sector operates within a highly competitive and\ntightly regulated framework, confronting a pivotal juncture in the formulation\nof pricing strategies. Insurers are compelled to harness a range of statistical\nmethodologies and available data to construct optimal pricing structures that\nalign with the overarching corporate strategy while accommodating the dynamics\nof market competition. Given the fundamental societal role played by insurance,\npremium rates are subject to rigorous scrutiny by regulatory authorities. These\nrates must conform to principles of transparency, explainability, and ethical\nconsiderations. Consequently, the act of pricing transcends mere statistical\ncalculations and carries the weight of strategic and societal factors. 
These\nmultifaceted concerns may drive insurers to establish equitable premiums,\ntaking into account various variables. For instance, regulations mandate the\nprovision of equitable premiums, considering factors such as policyholder\ngender or mutualist group dynamics in accordance with respective corporate\nstrategies. Age-based premium fairness is also mandated. In certain insurance\ndomains, variables such as the presence of serious illnesses or disabilities\nare emerging as new dimensions for evaluating fairness. Regardless of the\nmotivating factor prompting an insurer to adopt fairer pricing strategies for a\nspecific variable, the insurer must possess the capability to define, measure,\nand ultimately mitigate any ethical biases inherent in its pricing practices\nwhile upholding standards of consistency and performance. This study seeks to\nprovide a comprehensive set of tools for these endeavors and assess their\neffectiveness through practical application in the context of automobile\ninsurance.\n","authors":["Mulah Moriah","Franck Vermet","Arthur Charpentier"],"pdf_url":"https://arxiv.org/pdf/2311.11900v1.pdf","comment":"37 pages"},{"id":"http://arxiv.org/abs/2311.11891v1","updated":"2023-11-20T16:24:23Z","published":"2023-11-20T16:24:23Z","title":"AMES: A Differentiable Embedding Space Selection Framework for Latent\n Graph Inference","summary":" In real-world scenarios, although data entities may possess inherent\nrelationships, the specific graph illustrating their connections might not be\ndirectly accessible. Latent graph inference addresses this issue by enabling\nGraph Neural Networks (GNNs) to operate on point cloud data, dynamically\nlearning the necessary graph structure. These graphs are often derived from a\nlatent embedding space, which can be modeled using Euclidean, hyperbolic,\nspherical, or product spaces. However, currently, there is no principled\ndifferentiable method for determining the optimal embedding space. In this\nwork, we introduce the Attentional Multi-Embedding Selection (AMES) framework,\na differentiable method for selecting the best embedding space for latent graph\ninference through backpropagation, considering a downstream task. Our framework\nconsistently achieves comparable or superior results compared to previous\nmethods for latent graph inference across five benchmark datasets. Importantly,\nour approach eliminates the need for conducting multiple experiments to\nidentify the optimal embedding space. Furthermore, we explore interpretability\ntechniques that track the gradient contributions of different latent graphs,\nshedding light on how our attention-based, fully differentiable approach learns\nto choose the appropriate latent space. In line with previous works, our\nexperiments emphasize the advantages of hyperbolic spaces in enhancing\nperformance. 
More importantly, our interpretability framework provides a\ngeneral approach for quantitatively comparing embedding spaces across different\ntasks based on their contributions, a dimension that has been overlooked in\nprevious literature on latent graph inference.\n","authors":["Yuan Lu","Haitz Sáez de Ocáriz Borde","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2311.11891v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11883v1","updated":"2023-11-20T16:20:13Z","published":"2023-11-20T16:20:13Z","title":"Efficient Neural Networks for Tiny Machine Learning: A Comprehensive\n Review","summary":" The field of Tiny Machine Learning (TinyML) has gained significant attention\ndue to its potential to enable intelligent applications on resource-constrained\ndevices. This review provides an in-depth analysis of the advancements in\nefficient neural networks and the deployment of deep learning models on\nultra-low power microcontrollers (MCUs) for TinyML applications. It begins by\nintroducing neural networks and discussing their architectures and resource\nrequirements. It then explores MEMS-based applications on ultra-low power MCUs,\nhighlighting their potential for enabling TinyML on resource-constrained\ndevices. The core of the review centres on efficient neural networks for\nTinyML. It covers techniques such as model compression, quantization, and\nlow-rank factorization, which optimize neural network architectures for minimal\nresource utilization on MCUs. The paper then delves into the deployment of deep\nlearning models on ultra-low power MCUs, addressing challenges such as limited\ncomputational capabilities and memory resources. Techniques like model pruning,\nhardware acceleration, and algorithm-architecture co-design are discussed as\nstrategies to enable efficient deployment. Lastly, the review provides an\noverview of current limitations in the field, including the trade-off between\nmodel complexity and resource constraints. Overall, this review paper presents\na comprehensive analysis of efficient neural networks and deployment strategies\nfor TinyML on ultra-low-power MCUs. It identifies future research directions\nfor unlocking the full potential of TinyML applications on resource-constrained\ndevices.\n","authors":["Minh Tri Lê","Pierre Wolinski","Julyan Arbel"],"pdf_url":"https://arxiv.org/pdf/2311.11883v1.pdf","comment":"39 pages, 9 figures, 5 tables"},{"id":"http://arxiv.org/abs/2311.11882v1","updated":"2023-11-20T16:19:46Z","published":"2023-11-20T16:19:46Z","title":"Multi-Task Faces (MTF) Data Set: A Legally and Ethically Compliant\n Collection of Face Images for Various Classification Tasks","summary":" Human facial data hold tremendous potential to address a variety of\nclassification problems, including face recognition, age estimation, gender\nidentification, emotion analysis, and race classification. However, recent\nprivacy regulations, such as the EU General Data Protection Regulation and\nothers, have restricted the ways in which human images may be collected and\nused for research. As a result, several previously published data sets\ncontaining human faces have been removed from the internet due to inadequate\ndata collection methods that failed to meet privacy regulations. Data sets\nconsisting of synthetic data have been proposed as an alternative, but they\nfall short of accurately representing the real data distribution. On the other\nhand, most available data sets are labeled for just a single task, which limits\ntheir applicability. 
To address these issues, we present the Multi-Task Faces\n(MTF) image data set, a meticulously curated collection of face images designed\nfor various classification tasks, including face recognition, as well as race,\ngender, and age classification. The MTF data set has been ethically gathered by\nleveraging publicly available images of celebrities and strictly adhering to\ncopyright regulations. In this paper, we present this data set and provide\ndetailed descriptions of the followed data collection and processing\nprocedures. Furthermore, we evaluate the performance of five deep learning (DL)\nmodels on the MTF data set across the aforementioned classification tasks.\nAdditionally, we compare the performance of DL models over the processed MTF\ndata and over raw data crawled from the internet. The reported results\nconstitute a baseline for further research employing these data. The MTF data\nset can be accessed through the following link (please cite the present paper\nif you use the data set): https://github.com/RamiHaf/MTF_data_set\n","authors":["Rami Haffar","David Sánchez","Josep Domingo-Ferrer"],"pdf_url":"https://arxiv.org/pdf/2311.11882v1.pdf","comment":"21 pages, 2 figures, 9 Tables,"},{"id":"http://arxiv.org/abs/2310.18288v3","updated":"2023-11-20T16:17:38Z","published":"2023-10-27T17:25:12Z","title":"Sustainable Concrete via Bayesian Optimization","summary":" Eight percent of global carbon dioxide emissions can be attributed to the\nproduction of cement, the main component of concrete, which is also the\ndominant source of CO2 emissions in the construction of data centers. The\ndiscovery of lower-carbon concrete formulae is therefore of high significance\nfor sustainability. However, experimenting with new concrete formulae is time\nconsuming and labor intensive, as one usually has to wait to record the\nconcrete's 28-day compressive strength, a quantity whose measurement can by its\ndefinition not be accelerated. This provides an opportunity for experimental\ndesign methodology like Bayesian Optimization (BO) to accelerate the search for\nstrong and sustainable concrete formulae. Herein, we 1) propose modeling steps\nthat make concrete strength amenable to be predicted accurately by a Gaussian\nprocess model with relatively few measurements, 2) formulate the search for\nsustainable concrete as a multi-objective optimization problem, and 3) leverage\nthe proposed model to carry out multi-objective BO with real-world strength\nmeasurements of the algorithmically proposed mixes. Our experimental results\nshow improved trade-offs between the mixtures' global warming potential (GWP)\nand their associated compressive strengths, compared to mixes based on current\nindustry practices. Our methods are open-sourced at\ngithub.com/facebookresearch/SustainableConcrete.\n","authors":["Sebastian Ament","Andrew Witte","Nishant Garg","Julius Kusuma"],"pdf_url":"https://arxiv.org/pdf/2310.18288v3.pdf","comment":"NeurIPS 2023 Workshop on Adaptive Experimental Design and Active\n Learning in the Real World"},{"id":"http://arxiv.org/abs/2311.11876v1","updated":"2023-11-20T16:12:34Z","published":"2023-11-20T16:12:34Z","title":"Forward Gradients for Data-Driven CFD Wall Modeling","summary":" Computational Fluid Dynamics (CFD) is used in the design and optimization of\ngas turbines and many other industrial/ scientific applications. However, the\npractical use is often limited by the high computational cost, and the accurate\nresolution of near-wall flow is a significant contributor to this cost. 
Machine\nlearning (ML) and other data-driven methods can complement existing wall\nmodels. Nevertheless, training these models is bottlenecked by the large\ncomputational effort and memory footprint demanded by back-propagation. Recent\nwork has presented alternatives for computing gradients of neural networks\nwhere a separate forward and backward sweep is not needed and storage of\nintermediate results between sweeps is not required because an unbiased\nestimator for the gradient is computed in a single forward sweep. In this\npaper, we discuss the application of this approach for training a subgrid wall\nmodel that could potentially be used as a surrogate in wall-bounded flow CFD\nsimulations to reduce the computational overhead while preserving predictive\naccuracy.\n","authors":["Jan Hückelheim","Tadbhagya Kumar","Krishnan Raghavan","Pinaki Pal"],"pdf_url":"https://arxiv.org/pdf/2311.11876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04741v3","updated":"2023-11-20T16:09:07Z","published":"2023-10-07T08:54:43Z","title":"Balancing stability and plasticity in continual learning: the\n readout-decomposition of activation change (RDAC) framework","summary":" Continual learning (CL) algorithms strive to acquire new knowledge while\npreserving prior information. However, this stability-plasticity trade-off\nremains a central challenge. This paper introduces a framework that dissects\nthis trade-off, offering valuable insights into CL algorithms. The\nReadout-Decomposition of Activation Change (RDAC) framework first addresses the\nstability-plasticity dilemma and its relation to catastrophic forgetting. It\nrelates learning-induced activation changes in the range of prior readouts to\nthe degree of stability and changes in the null space to the degree of\nplasticity. In deep non-linear networks tackling split-CIFAR-110 tasks, the\nframework clarifies the stability-plasticity trade-offs of the popular\nregularization algorithms Synaptic intelligence (SI), Elastic-weight\nconsolidation (EWC), and learning without Forgetting (LwF), and replay-based\nalgorithms Gradient episodic memory (GEM), and data replay. GEM and data replay\npreserved stability and plasticity, while SI, EWC, and LwF traded off\nplasticity for stability. The inability of the regularization algorithms to\nmaintain plasticity was linked to them restricting the change of activations in\nthe null space of the prior readout. Additionally, for one-hidden-layer linear\nneural networks, we derived a gradient decomposition algorithm to restrict\nactivation change only in the range of the prior readouts, to maintain high\nstability while not further sacrificing plasticity. Results demonstrate that\nthe algorithm maintained stability without significant plasticity loss. The\nRDAC framework informs the behavior of existing CL algorithms and paves the way\nfor novel CL approaches. Finally, it sheds light on the connection between\nlearning-induced activation/representation changes and the stability-plasticity\ndilemma, also offering insights into representational drift in biological\nsystems.\n","authors":["Daniel Anthes","Sushrut Thorat","Peter König","Tim C. 
Kietzmann"],"pdf_url":"https://arxiv.org/pdf/2310.04741v3.pdf","comment":"15 pages, 5 figures, Revision"},{"id":"http://arxiv.org/abs/2311.11871v1","updated":"2023-11-20T16:06:35Z","published":"2023-11-20T16:06:35Z","title":"Training robust and generalizable quantum models","summary":" Adversarial robustness and generalization are both crucial properties of\nreliable machine learning models. In this paper, we study these properties in\nthe context of quantum machine learning based on Lipschitz bounds. We derive\ntailored, parameter-dependent Lipschitz bounds for quantum models with\ntrainable encoding, showing that the norm of the data encoding has a crucial\nimpact on the robustness against perturbations in the input data. Further, we\nderive a bound on the generalization error which explicitly depends on the\nparameters of the data encoding. Our theoretical findings give rise to a\npractical strategy for training robust and generalizable quantum models by\nregularizing the Lipschitz bound in the cost. Further, we show that, for fixed\nand non-trainable encodings as frequently employed in quantum machine learning,\nthe Lipschitz bound cannot be influenced by tuning the parameters. Thus,\ntrainable encodings are crucial for systematically adapting robustness and\ngeneralization during training. With numerical results, we demonstrate that,\nindeed, Lipschitz bound regularization leads to substantially more robust and\ngeneralizable quantum models.\n","authors":["Julian Berberich","Daniel Fink","Daniel Pranjić","Christian Tutschku","Christian Holm"],"pdf_url":"https://arxiv.org/pdf/2311.11871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11862v1","updated":"2023-11-20T15:57:49Z","published":"2023-11-20T15:57:49Z","title":"Establishing Central Sensitization Inventory Cut-off Values in patients\n with Chronic Low Back Pain by Unsupervised Machine Learning","summary":" Human Assumed Central Sensitization is involved in the development and\nmaintenance of chronic low back pain (CLBP). The Central Sensitization\nInventory (CSI) was developed to evaluate the presence of HACS, with a cut-off\nvalue of 40/100 based on patients with chronic pain. However, various factors\nincluding pain conditions (e.g., CLBP), and gender may influence this cut-off\nvalue. For chronic pain condition such as CLBP, unsupervised clustering\napproaches can take these factors into consideration and automatically learn\nthe HACS-related patterns. Therefore, this study aimed to determine the cut-off\nvalues for a Dutch-speaking population with CLBP, considering the total group\nand stratified by gender based on unsupervised machine learning. In this study,\nquestionnaire data covering pain, physical, and psychological aspects were\ncollected from patients with CLBP and aged-matched pain-free adults (referred\nto as healthy controls, HC). Four clustering approaches were applied to\nidentify HACS-related clusters based on the questionnaire data and gender. The\nclustering performance was assessed using internal and external indicators.\nSubsequently, receiver operating characteristic analysis was conducted on the\nbest clustering results to determine the optimal cut-off values. 
The study\nincluded 151 subjects, consisting of 63 HCs and 88 patients with CLBP.\nHierarchical clustering yielded the best results, identifying three clusters: a\nhealthy group, a CLBP group with a low HACS level, and a CLBP group with a high\nHACS level. Based on the low HACS level group (including HC and CLBP with low\nHACS level) and the high HACS level group, the cut-off value was 35 for the\noverall group, 34 for females, and 35 for males. The findings suggest that the\noptimal cut-off value for CLBP is 35. The gender-related cut-off values should\nbe interpreted with caution due to the unbalanced gender distribution in the\nsample.\n","authors":["Xiaoping Zheng","Claudine JC Lamoth","Hans Timmerman","Ebert Otten","Michiel F Reneman"],"pdf_url":"https://arxiv.org/pdf/2311.11862v1.pdf","comment":"31 pages, 5 tables, 3 figures"},{"id":"http://arxiv.org/abs/2311.10090v3","updated":"2023-11-20T15:51:07Z","published":"2023-11-16T18:58:43Z","title":"JaxMARL: Multi-Agent RL Environments in JAX","summary":" Benchmarks play an important role in the development of machine learning\nalgorithms. For example, research in reinforcement learning (RL) has been\nheavily influenced by available environments and benchmarks. However, RL\nenvironments are traditionally run on the CPU, limiting their scalability with\ntypical academic compute. Recent advancements in JAX have enabled the wider use\nof hardware acceleration to overcome these computational hurdles, enabling\nmassively parallel RL training pipelines and environments. This is particularly\nuseful for multi-agent reinforcement learning (MARL) research. First of all,\nmultiple agents must be considered at each environment step, adding\ncomputational burden, and secondly, the sample complexity is increased due to\nnon-stationarity, decentralised partial observability, or other MARL\nchallenges. In this paper, we present JaxMARL, the first open-source code base\nthat combines ease-of-use with GPU-enabled efficiency, and supports a large\nnumber of commonly used MARL environments as well as popular baseline\nalgorithms. When considering wall clock time, our experiments show that per-run\nour JAX-based training pipeline is up to 12500x faster than existing\napproaches. This enables efficient and thorough evaluations, with the potential\nto alleviate the evaluation crisis of the field. We also introduce and\nbenchmark SMAX, a vectorised, simplified version of the popular StarCraft\nMulti-Agent Challenge, which removes the need to run the StarCraft II game\nengine. This not only enables GPU acceleration, but also provides a more\nflexible MARL environment, unlocking the potential for self-play,\nmeta-learning, and other future applications in MARL.
We provide code at\nhttps://github.com/flairox/jaxmarl.\n","authors":["Alexander Rutherford","Benjamin Ellis","Matteo Gallici","Jonathan Cook","Andrei Lupu","Gardar Ingvarsson","Timon Willi","Akbir Khan","Christian Schroeder de Witt","Alexandra Souly","Saptarashmi Bandyopadhyay","Mikayel Samvelyan","Minqi Jiang","Robert Tjarko Lange","Shimon Whiteson","Bruno Lacerda","Nick Hawes","Tim Rocktaschel","Chris Lu","Jakob Nicolaus Foerster"],"pdf_url":"https://arxiv.org/pdf/2311.10090v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11847v1","updated":"2023-11-20T15:37:39Z","published":"2023-11-20T15:37:39Z","title":"Deep learning complete intersection Calabi-Yau manifolds","summary":" We review advancements in deep learning techniques for complete intersection\nCalabi-Yau (CICY) 3- and 4-folds, with the aim of understanding better how to\nhandle algebraic topological data with machine learning. We first discuss\nmethodological aspects and data analysis, before describing neural networks\narchitectures. Then, we describe the state-of-the art accuracy in predicting\nHodge numbers. We include new results on extrapolating predictions from low to\nhigh Hodge numbers, and conversely.\n","authors":["Harold Erbin","Riccardo Finotello"],"pdf_url":"https://arxiv.org/pdf/2311.11847v1.pdf","comment":"19 pages; match version published in \"Machine Learning in Pure\n Mathematics and Theoretical Physics\" (edited by Y.-H. He, World Scientific\n Press)"},{"id":"http://arxiv.org/abs/2305.02441v2","updated":"2023-11-20T15:27:37Z","published":"2023-05-03T22:01:10Z","title":"Reward Teaching for Federated Multi-armed Bandits","summary":" Most of the existing federated multi-armed bandits (FMAB) designs are based\non the presumption that clients will implement the specified design to\ncollaborate with the server. In reality, however, it may not be possible to\nmodify the clients' existing protocols. To address this challenge, this work\nfocuses on clients who always maximize their individual cumulative rewards, and\nintroduces a novel idea of ``reward teaching'', where the server guides the\nclients towards global optimality through implicit local reward adjustments.\nUnder this framework, the server faces two tightly coupled tasks of bandit\nlearning and target teaching, whose combination is non-trivial and challenging.\nA phased approach, called Teaching-After-Learning (TAL), is first designed to\nencourage and discourage clients' explorations separately. General performance\nanalyses of TAL are established when the clients' strategies satisfy certain\nmild requirements. With novel technical approaches developed to analyze the\nwarm-start behaviors of bandit algorithms, particularized guarantees of TAL\nwith clients running UCB or epsilon-greedy strategies are then obtained. These\nresults demonstrate that TAL achieves logarithmic regrets while only incurring\nlogarithmic adjustment costs, which is order-optimal w.r.t. a natural lower\nbound. As a further extension, the Teaching-While-Learning (TWL) algorithm is\ndeveloped with the idea of successive arm elimination to break the non-adaptive\nphase separation in TAL. Rigorous analyses demonstrate that when facing clients\nwith UCB1, TWL outperforms TAL in terms of the dependencies on sub-optimality\ngaps thanks to its adaptive design. 
Experimental results demonstrate the\neffectiveness and generality of the proposed algorithms.\n","authors":["Chengshuai Shi","Wei Xiong","Cong Shen","Jing Yang"],"pdf_url":"https://arxiv.org/pdf/2305.02441v2.pdf","comment":"Accepted to IEEE Transactions on Signal Processing"},{"id":"http://arxiv.org/abs/2311.11841v1","updated":"2023-11-20T15:17:20Z","published":"2023-11-20T15:17:20Z","title":"High Probability Guarantees for Random Reshuffling","summary":" We consider the stochastic gradient method with random reshuffling\n($\\mathsf{RR}$) for tackling smooth nonconvex optimization problems.\n$\\mathsf{RR}$ finds broad applications in practice, notably in training neural\nnetworks. In this work, we first investigate the concentration property of\n$\\mathsf{RR}$'s sampling procedure and establish a new high probability sample\ncomplexity guarantee for driving the gradient (without expectation) below\n$\\varepsilon$, which effectively characterizes the efficiency of a single\n$\\mathsf{RR}$ execution. Our derived complexity matches the best existing\nin-expectation one up to a logarithmic term while imposing no additional\nassumptions nor changing $\\mathsf{RR}$'s updating rule. Furthermore, by\nleveraging our derived high probability descent property and bound on the\nstochastic error, we propose a simple and computable stopping criterion for\n$\\mathsf{RR}$ (denoted as $\\mathsf{RR}$-$\\mathsf{sc}$). This criterion is\nguaranteed to be triggered after a finite number of iterations, and then\n$\\mathsf{RR}$-$\\mathsf{sc}$ returns an iterate with its gradient below\n$\\varepsilon$ with high probability. Moreover, building on the proposed\nstopping criterion, we design a perturbed random reshuffling method\n($\\mathsf{p}$-$\\mathsf{RR}$) that involves an additional randomized\nperturbation procedure near stationary points. We derive that\n$\\mathsf{p}$-$\\mathsf{RR}$ provably escapes strict saddle points and\nefficiently returns a second-order stationary point with high probability,\nwithout making any sub-Gaussian tail-type assumptions on the stochastic\ngradient errors. Finally, we conduct numerical experiments on neural network\ntraining to support our theoretical findings.\n","authors":["Hengxu Yu","Xiao Li"],"pdf_url":"https://arxiv.org/pdf/2311.11841v1.pdf","comment":"21 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.20049v3","updated":"2023-11-20T15:16:59Z","published":"2023-10-30T22:12:35Z","title":"SURF: A Generalization Benchmark for GNNs Predicting Fluid Dynamics","summary":" Simulating fluid dynamics is crucial for the design and development process,\nranging from simple valves to complex turbomachinery. Accurately solving the\nunderlying physical equations is computationally expensive. Therefore,\nlearning-based solvers that model interactions on meshes have gained interest\ndue to their promising speed-ups. However, it is unknown to what extent these\nmodels truly understand the underlying physical principles and can generalize\nrather than interpolate. Generalization is a key requirement for a\ngeneral-purpose fluid simulator, which should adapt to different topologies,\nresolutions, or thermodynamic ranges. We propose SURF, a benchmark designed to\ntest the $\\textit{generalization}$ of learned graph-based fluid simulators.\nSURF comprises individual datasets and provides specific performance and\ngeneralization metrics for evaluating and comparing different models. 
We\nempirically demonstrate the applicability of SURF by thoroughly investigating\nthe two state-of-the-art graph-based models, yielding new insights into their\ngeneralization.\n","authors":["Stefan Künzli","Florian Grötschla","Joël Mathys","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2310.20049v3.pdf","comment":"Accepted at LoG 2023, Learning on Graphs Conference"},{"id":"http://arxiv.org/abs/2311.11837v1","updated":"2023-11-20T15:11:31Z","published":"2023-11-20T15:11:31Z","title":"Kandinsky Conformal Prediction: Efficient Calibration of Image\n Segmentation Algorithms","summary":" Image segmentation algorithms can be understood as a collection of pixel\nclassifiers, for which the outcomes of nearby pixels are correlated. Classifier\nmodels can be calibrated using Inductive Conformal Prediction, but this\nrequires holding back a sufficiently large calibration dataset for computing\nthe distribution of non-conformity scores of the model's predictions. If one\nonly requires only marginal calibration on the image level, this calibration\nset consists of all individual pixels in the images available for calibration.\nHowever, if the goal is to attain proper calibration for each individual pixel\nclassifier, the calibration set consists of individual images. In a scenario\nwhere data are scarce (such as the medical domain), it may not always be\npossible to set aside sufficiently many images for this pixel-level\ncalibration. The method we propose, dubbed ``Kandinsky calibration'', makes use\nof the spatial structure present in the distribution of natural images to\nsimultaneously calibrate the classifiers of ``similar'' pixels. This can be\nseen as an intermediate approach between marginal (imagewise) and conditional\n(pixelwise) calibration, where non-conformity scores are aggregated over\nsimilar image regions, thereby making more efficient use of the images\navailable for calibration. We run experiments on segmentation algorithms\ntrained and calibrated on subsets of the public MS-COCO and Medical Decathlon\ndatasets, demonstrating that Kandinsky calibration method can significantly\nimprove the coverage. When compared to both pixelwise and imagewise calibration\non little data, the Kandinsky method achieves much lower coverage errors,\nindicating the data efficiency of the Kandinsky calibration.\n","authors":["Joren Brunekreef","Eric Marcus","Ray Sheombarsing","Jan-Jakob Sonke","Jonas Teuwen"],"pdf_url":"https://arxiv.org/pdf/2311.11837v1.pdf","comment":"15 pages, 11 figures"},{"id":"http://arxiv.org/abs/2311.08427v2","updated":"2023-11-20T15:05:59Z","published":"2023-11-13T13:23:31Z","title":"Towards a Transportable Causal Network Model Based on Observational\n Healthcare Data","summary":" Over the last decades, many prognostic models based on artificial\nintelligence techniques have been used to provide detailed predictions in\nhealthcare. Unfortunately, the real-world observational data used to train and\nvalidate these models are almost always affected by biases that can strongly\nimpact the outcomes validity: two examples are values missing not-at-random and\nselection bias. 
Addressing them is a key element in achieving transportability\nand in studying the causal relationships that are critical in clinical decision\nmaking, going beyond simpler statistical approaches based on probabilistic\nassociation.\n In this context, we propose a novel approach that combines selection\ndiagrams, missingness graphs, causal discovery and prior knowledge into a\nsingle graphical model to estimate the cardiovascular risk of adolescent and\nyoung females who survived breast cancer. We learn this model from data\ncomprising two different cohorts of patients. The resulting causal network\nmodel is validated by expert clinicians in terms of risk assessment, accuracy\nand explainability, and provides a prognostic model that outperforms competing\nmachine learning methods.\n","authors":["Alice Bernasconi","Alessio Zanga","Peter J. F. Lucas","Marco Scutari","Fabio Stella"],"pdf_url":"https://arxiv.org/pdf/2311.08427v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11829v1","updated":"2023-11-20T15:04:50Z","published":"2023-11-20T15:04:50Z","title":"System 2 Attention (is something you might need too)","summary":" Soft attention in Transformer-based Large Language Models (LLMs) is\nsusceptible to incorporating irrelevant information from the context into its\nlatent representations, which adversely affects next token generations. To help\nrectify these issues, we introduce System 2 Attention (S2A), which leverages\nthe ability of LLMs to reason in natural language and follow instructions in\norder to decide what to attend to. S2A regenerates the input context to only\ninclude the relevant portions, before attending to the regenerated context to\nelicit the final response. In experiments, S2A outperforms standard\nattention-based LLMs on three tasks containing opinion or irrelevant\ninformation, QA, math word problems and longform generation, where S2A\nincreases factuality and objectivity, and decreases sycophancy.\n","authors":["Jason Weston","Sainbayar Sukhbaatar"],"pdf_url":"https://arxiv.org/pdf/2311.11829v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11827v1","updated":"2023-11-20T15:04:16Z","published":"2023-11-20T15:04:16Z","title":"Few-shot Multispectral Segmentation with Representations Generated by\n Reinforcement Learning","summary":" The task of multispectral image segmentation (segmentation of images with\nnumerous channels/bands, each capturing a specific range of wavelengths of\nelectromagnetic radiation) has been previously explored in contexts with large\namounts of labeled data. However, these models tend not to generalize well to\ndatasets of smaller size. In this paper, we propose a novel approach for\nimproving few-shot segmentation performance on multispectral images using\nreinforcement learning to generate representations. These representations are\ngenerated in the form of mathematical expressions between channels and are\ntailored to the specific class being segmented. Our methodology involves\ntraining an agent to identify the most informative expressions, updating the\ndataset using these expressions, and then using the updated dataset to perform\nsegmentation. Due to the limited length of the expressions, the model receives\nuseful representations without any added risk of overfitting. 
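The System 2 Attention abstract above describes a two-pass scheme: first regenerate the context so it contains only relevant material, then answer from the regenerated context. A rough sketch of that flow is given below; `complete` is a hypothetical placeholder for any text-completion call and the prompt wording is invented for illustration, not the paper's prompt.

```python
# Sketch of the two-pass S2A-style flow. `complete` is a hypothetical
# text-completion callable (prompt: str -> completion: str).
def s2a_answer(context: str, question: str, complete) -> str:
    regen_prompt = (
        "Rewrite the following text, keeping only the parts that are relevant and "
        "factual for answering the question. Remove opinions and irrelevant details.\n\n"
        f"Text: {context}\n\nQuestion: {question}\n\nRewritten text:"
    )
    cleaned_context = complete(regen_prompt)          # pass 1: regenerate the context
    answer_prompt = f"Context: {cleaned_context}\n\nQuestion: {question}\n\nAnswer:"
    return complete(answer_prompt)                    # pass 2: answer from the cleaned context
```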
We evaluate the\neffectiveness of our approach on several multispectral datasets and demonstrate\nits effectiveness in boosting the performance of segmentation algorithms.\n","authors":["Dilith Jayakody","Thanuja Ambegoda"],"pdf_url":"https://arxiv.org/pdf/2311.11827v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2308.15283v2","updated":"2023-11-20T15:03:58Z","published":"2023-08-29T13:14:53Z","title":"Structural Node Embeddings with Homomorphism Counts","summary":" Graph homomorphism counts, first explored by Lov\\'asz in 1967, have recently\ngarnered interest as a powerful tool in graph-based machine learning. Grohe\n(PODS 2020) proposed the theoretical foundations for using homomorphism counts\nin machine learning on graph level as well as node level tasks. By their very\nnature, these capture local structural information, which enables the creation\nof robust structural embeddings. While a first approach for graph level tasks\nhas been made by Nguyen and Maehara (ICML 2020), we experimentally show the\neffectiveness of homomorphism count based node embeddings. Enriched with node\nlabels, node weights, and edge weights, these offer an interpretable\nrepresentation of graph data, allowing for enhanced explainability of machine\nlearning models.\n We propose a theoretical framework for isomorphism-invariant homomorphism\ncount based embeddings which lend themselves to a wide variety of downstream\ntasks. Our approach capitalises on the efficient computability of graph\nhomomorphism counts for bounded treewidth graph classes, rendering it a\npractical solution for real-world applications. We demonstrate their\nexpressivity through experiments on benchmark datasets. Although our results do\nnot match the accuracy of state-of-the-art neural architectures, they are\ncomparable to other advanced graph learning models. Remarkably, our approach\ndemarcates itself by ensuring explainability for each individual feature. By\nintegrating interpretable machine learning algorithms like SVMs or Random\nForests, we establish a seamless, end-to-end explainable pipeline. Our study\ncontributes to the advancement of graph-based techniques that offer both\nperformance and interpretability.\n","authors":["Hinrikus Wolf","Luca Oeljeklaus","Pascal Kühner","Martin Grohe"],"pdf_url":"https://arxiv.org/pdf/2308.15283v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07750v2","updated":"2023-11-20T15:01:19Z","published":"2023-11-13T21:07:07Z","title":"SynthEnsemble: A Fusion of CNN, Vision Transformer, and Hybrid Models\n for Multi-Label Chest X-Ray Classification","summary":" Chest X-rays are widely used to diagnose thoracic diseases, but the lack of\ndetailed information about these abnormalities makes it challenging to develop\naccurate automated diagnosis systems, which is crucial for early detection and\neffective treatment. To address this challenge, we employed deep learning\ntechniques to identify patterns in chest X-rays that correspond to different\ndiseases. We conducted experiments on the \"ChestX-ray14\" dataset using various\npre-trained CNNs, transformers, hybrid(CNN+Transformer) models and classical\nmodels. The best individual model was the CoAtNet, which achieved an area under\nthe receiver operating characteristic curve (AUROC) of 84.2%. 
By combining the\npredictions of all trained models using a weighted average ensemble where the\nweight of each model was determined using differential evolution, we further\nimproved the AUROC to 85.4%, outperforming other state-of-the-art methods in\nthis field. Our findings demonstrate the potential of deep learning techniques,\nparticularly ensemble deep learning, for improving the accuracy of automatic\ndiagnosis of thoracic diseases from chest X-rays.\n","authors":["S. M. Nabil Ashraf","Md. Adyelullahil Mamun","Hasnat Md. Abdullah","Md. Golam Rabiul Alam"],"pdf_url":"https://arxiv.org/pdf/2311.07750v2.pdf","comment":"Accepted in International Conference on Computer and Information\n Technology (ICCIT) 2023"},{"id":"http://arxiv.org/abs/2311.11822v1","updated":"2023-11-20T14:58:56Z","published":"2023-11-20T14:58:56Z","title":"Zero redundancy distributed learning with differential privacy","summary":" Deep learning using large models have achieved great success in a wide range\nof domains. However, training these models on billions of parameters is very\nchallenging in terms of the training speed, memory cost, and communication\nefficiency, especially under the privacy-preserving regime with differential\nprivacy (DP). On the one hand, DP optimization has comparable efficiency to the\nstandard non-private optimization on a single GPU, but on multiple GPUs,\nexisting DP distributed learning (such as pipeline parallel) has suffered from\nsignificantly worse efficiency. On the other hand, the Zero Redundancy\nOptimizer (ZeRO) is a state-of-the-art solution to the standard distributed\nlearning, exhibiting excellent training efficiency on large models, but to work\ncompatibly with DP is technically complicated. In this work, we develop a new\nsystematic solution, DP-ZeRO, (I) to scale up the trainable DP model size, e.g.\nto GPT-100B, (II) to obtain the same computation and communication efficiency\nas the standard ZeRO, and (III) to enable mixed-precision DP training. Our\nDP-ZeRO, like the standard ZeRO, has the potential to train models with\narbitrary size and is evaluated on the world's largest DP models in terms of\nthe number of trainable parameters.\n","authors":["Zhiqi Bu","Justin Chiu","Ruixuan Liu","Sheng Zha","George Karypis"],"pdf_url":"https://arxiv.org/pdf/2311.11822v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11821v1","updated":"2023-11-20T14:58:47Z","published":"2023-11-20T14:58:47Z","title":"Cross-View Graph Consistency Learning for Invariant Graph\n Representations","summary":" Graph representation learning is fundamental for analyzing graph-structured\ndata. Exploring invariant graph representations remains a challenge for most\nexisting graph representation learning methods. In this paper, we propose a\ncross-view graph consistency learning (CGCL) method that learns invariant graph\nrepresentations for link prediction. First, two complementary augmented views\nare derived from an incomplete graph structure through a bidirectional graph\nstructure augmentation scheme. This augmentation scheme mitigates the potential\ninformation loss that is commonly associated with various data augmentation\ntechniques involving raw graph data, such as edge perturbation, node removal,\nand attribute masking. Second, we propose a CGCL model that can learn invariant\ngraph representations. A cross-view training scheme is proposed to train the\nproposed CGCL model. 
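The chest X-ray ensemble abstract above combines model predictions with a weighted average whose weights are found by differential evolution. The sketch below shows that pattern for a single binary label using SciPy and scikit-learn; the validation arrays and the simplification to one label (rather than the full multi-label setting) are assumptions.

```python
# Sketch: tune weighted-average ensemble weights with differential evolution
# to maximise AUROC on a held-out validation set (binary case).
import numpy as np
from scipy.optimize import differential_evolution
from sklearn.metrics import roc_auc_score

def fit_ensemble_weights(val_preds, val_labels):
    """val_preds: (n_models, n_samples) predicted probabilities; val_labels: (n_samples,) in {0,1}."""
    n_models = val_preds.shape[0]

    def neg_auroc(w):
        w = np.abs(w) / (np.abs(w).sum() + 1e-12)     # normalise to a convex combination
        return -roc_auc_score(val_labels, w @ val_preds)

    result = differential_evolution(neg_auroc, bounds=[(0.0, 1.0)] * n_models, seed=0)
    w = np.abs(result.x) / (np.abs(result.x).sum() + 1e-12)
    return w                                          # apply as `w @ test_preds` at inference
```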
This scheme attempts to maximize the consistency\ninformation between one augmented view and the graph structure reconstructed\nfrom the other augmented view. Furthermore, we offer a comprehensive\ntheoretical CGCL analysis. This paper empirically and experimentally\ndemonstrates the effectiveness of the proposed CGCL method, achieving\ncompetitive results on graph datasets in comparisons with several\nstate-of-the-art algorithms.\n","authors":["Jie Chen","Zhiming Li","Hua Mao","Wai Lok Woo","Xi Peng"],"pdf_url":"https://arxiv.org/pdf/2311.11821v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2311.11819v1","updated":"2023-11-20T14:55:40Z","published":"2023-11-20T14:55:40Z","title":"Generalized super-resolution 4D Flow MRI -- using ensemble learning to\n extend across the cardiovascular system","summary":" 4D Flow Magnetic Resonance Imaging (4D Flow MRI) is a non-invasive\nmeasurement technique capable of quantifying blood flow across the\ncardiovascular system. While practical use is limited by spatial resolution and\nimage noise, incorporation of trained super-resolution (SR) networks has\npotential to enhance image quality post-scan. However, these efforts have\npredominantly been restricted to narrowly defined cardiovascular domains, with\nlimited exploration of how SR performance extends across the cardiovascular\nsystem; a task aggravated by contrasting hemodynamic conditions apparent across\nthe cardiovasculature. The aim of our study was to explore the generalizability\nof SR 4D Flow MRI using a combination of heterogeneous training sets and\ndedicated ensemble learning. With synthetic training data generated across\nthree disparate domains (cardiac, aortic, cerebrovascular), varying\nconvolutional base and ensemble learners were evaluated as a function of domain\nand architecture, quantifying performance on both in-silico and acquired\nin-vivo data from the same three domains. Results show that both bagging and\nstacking ensembling enhance SR performance across domains, accurately\npredicting high-resolution velocities from low-resolution input data in-silico.\nLikewise, optimized networks successfully recover native resolution velocities\nfrom downsampled in-vivo data, as well as show qualitative potential in\ngenerating denoised SR-images from clinical level input data. In conclusion,\nour work presents a viable approach for generalized SR 4D Flow MRI, with\nensemble learning extending utility across various clinical areas of interest.\n","authors":["Leon Ericsson","Adam Hjalmarsson","Muhammad Usman Akbar","Edward Ferdian","Mia Bonini","Brandon Hardy","Jonas Schollenberger","Maria Aristova","Patrick Winter","Nicholas Burris","Alexander Fyrdahl","Andreas Sigfridsson","Susanne Schnell","C. Alberto Figueroa","David Nordsletten","Alistair A. Young","David Marlevi"],"pdf_url":"https://arxiv.org/pdf/2311.11819v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.08278v2","updated":"2023-11-20T14:54:09Z","published":"2023-10-12T12:29:32Z","title":"Lag-Llama: Towards Foundation Models for Time Series Forecasting","summary":" Aiming to build foundation models for time-series forecasting and study their\nscaling behavior, we present here our work-in-progress on Lag-Llama, a\ngeneral-purpose univariate probabilistic time-series forecasting model trained\non a large collection of time-series data. The model shows good zero-shot\nprediction capabilities on unseen \"out-of-distribution\" time-series datasets,\noutperforming supervised baselines. 
We use smoothly broken power-laws to fit\nand predict model scaling behavior. The open source code is made available at\nhttps://github.com/kashif/pytorch-transformer-ts.\n","authors":["Kashif Rasul","Arjun Ashok","Andrew Robert Williams","Arian Khorasani","George Adamopoulos","Rishika Bhagwatkar","Marin Biloš","Hena Ghonia","Nadhir Vincent Hassen","Anderson Schneider","Sahil Garg","Alexandre Drouin","Nicolas Chapados","Yuriy Nevmyvaka","Irina Rish"],"pdf_url":"https://arxiv.org/pdf/2310.08278v2.pdf","comment":"Preliminary Draft. Accepted at NeurIPS 2023 R0-FoMo Workshop. Full\n paper coming soon with comprehensive results and open-source model\n checkpoints"},{"id":"http://arxiv.org/abs/2311.11809v1","updated":"2023-11-20T14:42:13Z","published":"2023-11-20T14:42:13Z","title":"LogLead -- Fast and Integrated Log Loader, Enhancer, and Anomaly\n Detector","summary":" This paper introduces LogLead, a tool designed for efficient log analysis.\nLogLead combines three essential steps in log processing: loading, enhancing,\nand anomaly detection. The tool leverages Polars, a high-speed DataFrame\nlibrary. We currently have 7 Loaders out of which 4 is for public data sets\n(HDFS, Hadoop, BGL, and Thunderbird). We have multiple enhancers with three\nparsers (Drain, Spell, LenMa), Bert embedding creation and other log\nrepresentation techniques like bag-of-words. LogLead integrates to 5 supervised\nand 4 unsupervised machine learning algorithms for anomaly detection from\nSKLearn. By integrating diverse datasets, log representation methods and\nanomaly detectors, LogLead facilitates comprehensive benchmarking in log\nanalysis research. We demonstrate that log loading from raw file to dataframe\nis over 10x faster with LogLead is compared to past solutions. We demonstrate\nroughly 2x improvement in Drain parsing speed by off-loading log message\nnormalization to LogLead. We demonstrate a brief benchmarking on HDFS\nsuggesting that log representations beyond bag-of-words provide limited\nbenefits. Screencast demonstrating the tool: https://youtu.be/8stdbtTfJVo\n","authors":["Mika Mäntylä","Yuqing Wang","Jesse Nyyssölä"],"pdf_url":"https://arxiv.org/pdf/2311.11809v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11798v1","updated":"2023-11-20T14:31:18Z","published":"2023-11-20T14:31:18Z","title":"Operator Learning for Continuous Spatial-Temporal Model with A Hybrid\n Optimization Scheme","summary":" Partial differential equations are often used in the spatial-temporal\nmodeling of complex dynamical systems in many engineering applications. In this\nwork, we build on the recent progress of operator learning and present a\ndata-driven modeling framework that is continuous in both space and time. A key\nfeature of the proposed model is the resolution-invariance with respect to both\nspatial and temporal discretizations. To improve the long-term performance of\nthe calibrated model, we further propose a hybrid optimization scheme that\nleverages both gradient-based and derivative-free optimization methods and\nefficiently trains on both short-term time series and long-term statistics. We\ninvestigate the performance of the spatial-temporal continuous learning\nframework with three numerical examples, including the viscous Burgers'\nequation, the Navier-Stokes equations, and the Kuramoto-Sivashinsky equation.\nThe results confirm the resolution-invariance of the proposed modeling\nframework and also demonstrate stable long-term simulations with only\nshort-term time series data. 
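The LogLead abstract above separates log processing into loading raw files into a DataFrame, enhancing with representations, and anomaly detection. The sketch below mimics only the first of those steps with Polars on an assumed `<date> <time> <message>` line format; the format, column names, and token-count "enhancer" are illustrative assumptions, not LogLead's loader API.

```python
# Sketch: parse raw log lines into a Polars DataFrame with a crude token-count column.
import polars as pl

def load_logs(path: str) -> pl.DataFrame:
    """Parse '<date> <time> <message>' lines; malformed lines are skipped."""
    records = []
    with open(path, encoding="utf-8", errors="replace") as f:
        for line in f:
            parts = line.rstrip("\n").split(" ", 2)
            if len(parts) == 3:
                records.append({"date": parts[0], "time": parts[1],
                                "message": parts[2], "n_tokens": len(parts[2].split())})
    return pl.DataFrame(records)

# usage sketch: df = load_logs("hdfs.log"); long_msgs = df.filter(pl.col("n_tokens") > 50)
```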
In addition, we show that the proposed model can\nbetter predict long-term statistics via the hybrid optimization scheme with a\ncombined use of short-term and long-term data.\n","authors":["Chuanqi Chen","Jin-Long Wu"],"pdf_url":"https://arxiv.org/pdf/2311.11798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16854v2","updated":"2023-11-20T14:27:39Z","published":"2023-05-26T12:04:59Z","title":"Channel and Gradient-Importance Aware Device Scheduling for Over-the-Air\n Federated Learning","summary":" Federated learning (FL) is a popular privacy-preserving distributed training\nscheme, where multiple devices collaborate to train machine learning models by\nuploading local model updates. To improve communication efficiency,\nover-the-air computation (AirComp) has been applied to FL, which leverages\nanalog modulation to harness the superposition property of radio waves such\nthat numerous devices can upload their model updates concurrently for\naggregation. However, the uplink channel noise incurs considerable model\naggregation distortion, which is critically determined by the device scheduling\nand compromises the learned model performance. In this paper, we propose a\nprobabilistic device scheduling framework for over-the-air FL, named PO-FL, to\nmitigate the negative impact of channel noise, where each device is scheduled\naccording to a certain probability and its model update is reweighted using\nthis probability in aggregation. We prove the unbiasedness of this aggregation\nscheme and demonstrate the convergence of PO-FL on both convex and non-convex\nloss functions. Our convergence bounds unveil that the device scheduling\naffects the learning performance through the communication distortion and\nglobal update variance. Based on the convergence analysis, we further develop a\nchannel and gradient-importance aware algorithm to optimize the device\nscheduling probabilities in PO-FL. Extensive simulation results show that the\nproposed PO-FL framework with channel and gradient-importance awareness\nachieves faster convergence and produces better models than baseline methods.\n","authors":["Yuchang Sun","Zehong lin","Yuyi Mao","Shi Jin","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.16854v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11789v1","updated":"2023-11-20T14:14:13Z","published":"2023-11-20T14:14:13Z","title":"Approximate Linear Programming and Decentralized Policy Improvement in\n Cooperative Multi-agent Markov Decision Processes","summary":" In this work, we consider a `cooperative' multi-agent Markov decision process\n(MDP) involving m greater than 1 agents, where all agents are aware of the\nsystem model. At each decision epoch, all the m agents cooperatively select\nactions in order to maximize a common long-term objective. Since the number of\nactions grows exponentially in the number of agents, policy improvement is\ncomputationally expensive. Recent works have proposed using decentralized\npolicy improvement in which each agent assumes that the decisions of the other\nagents are fixed and it improves its decisions unilaterally. Yet, in these\nworks, exact values are computed. In our work, for cooperative multi-agent\nfinite and infinite horizon discounted MDPs, we propose suitable approximate\npolicy iteration algorithms, wherein we use approximate linear programming to\ncompute the approximate value function and use decentralized policy\nimprovement. Thus our algorithms can handle both large number of states as well\nas multiple agents. 
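The PO-FL abstract above schedules each device with some probability and reweights its update by that probability so the aggregate stays unbiased. A toy numpy sketch of that aggregation step is shown below; the update and probability arrays are synthetic placeholders and channel effects are ignored.

```python
# Sketch: probabilistic device scheduling with inverse-probability reweighting.
# E[weights[k] * updates[k]] = updates[k], so the aggregate is unbiased.
import numpy as np

def po_fl_aggregate(updates, probs, rng=None):
    """updates: (K, d) local model updates; probs: (K,) scheduling probabilities in (0, 1]."""
    if rng is None:
        rng = np.random.default_rng(0)
    K = updates.shape[0]
    scheduled = rng.random(K) < probs                  # each device included with prob p_k
    weights = np.where(scheduled, 1.0 / probs, 0.0)    # reweight selected updates by 1/p_k
    return (weights[:, None] * updates).sum(axis=0) / K
```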
We provide theoretical guarantees for our algorithms and\nalso demonstrate the performance of our algorithms on some numerical examples.\n","authors":["Lakshmi Mandal","Chandrashekar Lakshminarayanan","Shalabh Bhatnagar"],"pdf_url":"https://arxiv.org/pdf/2311.11789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11782v1","updated":"2023-11-20T14:07:38Z","published":"2023-11-20T14:07:38Z","title":"Robust Tumor Segmentation with Hyperspectral Imaging and Graph Neural\n Networks","summary":" Segmenting the boundary between tumor and healthy tissue during surgical\ncancer resection poses a significant challenge. In recent years, Hyperspectral\nImaging (HSI) combined with Machine Learning (ML) has emerged as a promising\nsolution. However, due to the extensive information contained within the\nspectral domain, most ML approaches primarily classify individual HSI\n(super-)pixels, or tiles, without taking into account their spatial context. In\nthis paper, we propose an improved methodology that leverages the spatial\ncontext of tiles for more robust and smoother segmentation. To address the\nirregular shapes of tiles, we utilize Graph Neural Networks (GNNs) to propagate\ncontext information across neighboring regions. The features for each tile\nwithin the graph are extracted using a Convolutional Neural Network (CNN),\nwhich is trained simultaneously with the subsequent GNN. Moreover, we\nincorporate local image quality metrics into the loss function to enhance the\ntraining procedure's robustness against low-quality regions in the training\nimages. We demonstrate the superiority of our proposed method using a clinical\nex vivo dataset consisting of 51 HSI images from 30 patients. Despite the\nlimited dataset, the GNN-based model significantly outperforms context-agnostic\napproaches, accurately distinguishing between healthy and tumor tissues, even\nin images from previously unseen patients. Furthermore, we show that our\ncarefully designed loss function, accounting for local image quality, results\nin additional improvements. Our findings demonstrate that context-aware GNN\nalgorithms can robustly find tumor demarcations on HSI images, ultimately\ncontributing to better surgery success and patient outcome.\n","authors":["Mayar Lotfy","Anna Alperovich","Tommaso Giannantonio","Bjorn Barz","Xiaohan Zhang","Felix Holm","Nassir Navab","Felix Boehm","Carolin Schwamborn","Thomas K. Hoffmann","Patrick J. Schuler"],"pdf_url":"https://arxiv.org/pdf/2311.11782v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.11777v1","updated":"2023-11-20T14:02:50Z","published":"2023-11-20T14:02:50Z","title":"Multimodal deep learning for mapping forest dominant height by fusing\n GEDI with earth observation data","summary":" The integration of multisource remote sensing data and deep learning models\noffers new possibilities for accurately mapping high spatial resolution forest\nheight. We found that GEDI relative heights (RH) metrics exhibited strong\ncorrelation with the mean of the top 10 highest trees (dominant height)\nmeasured in situ at the corresponding footprint locations. Consequently, we\nproposed a novel deep learning framework termed the multi-modal attention\nremote sensing network (MARSNet) to estimate forest dominant height by\nextrapolating dominant height derived from GEDI, using Setinel-1 data, ALOS-2\nPALSAR-2 data, Sentinel-2 optical data and ancillary data. 
MARSNet comprises\nseparate encoders for each remote sensing data modality to extract multi-scale\nfeatures, and a shared decoder to fuse the features and estimate height. Using\nindividual encoders for each remote sensing imagery avoids interference across\nmodalities and extracts distinct representations. To focus on the efficacious\ninformation from each dataset, we reduced the prevalent spatial and band\nredundancies in each remote sensing data by incorporating the extended spatial\nand band reconstruction convolution modules in the encoders. MARSNet achieved\ncommendable performance in estimating dominant height, with an R2 of 0.62 and\nRMSE of 2.82 m, outperforming the widely used random forest approach which\nattained an R2 of 0.55 and RMSE of 3.05 m. Finally, we applied the trained\nMARSNet model to generate wall-to-wall maps at 10 m resolution for Jilin,\nChina. Through independent validation using field measurements, MARSNet\ndemonstrated an R2 of 0.58 and RMSE of 3.76 m, compared to 0.41 and 4.37 m for\nthe random forest baseline. Our research demonstrates the effectiveness of a\nmultimodal deep learning approach fusing GEDI with SAR and passive optical\nimagery for enhancing the accuracy of high resolution dominant height\nestimation.\n","authors":["Man Chen","Wenquan Dong","Hao Yu","Iain Woodhouse","Casey M. Ryan","Haoyu Liu","Selena Georgiou","Edward T. A. Mitchard"],"pdf_url":"https://arxiv.org/pdf/2311.11777v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15363v4","updated":"2023-11-20T13:59:16Z","published":"2023-08-29T14:59:54Z","title":"Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation","summary":" Large language models (LLMs) have emerged as a new paradigm for Text-to-SQL\ntask. However, the absence of a systematical benchmark inhibits the development\nof designing effective, efficient and economic LLM-based Text-to-SQL solutions.\nTo address this challenge, in this paper, we first conduct a systematical and\nextensive comparison over existing prompt engineering methods, including\nquestion representation, example selection and example organization, and with\nthese experimental results, we elaborate their pros and cons. Based on these\nfindings, we propose a new integrated solution, named DAIL-SQL, which refreshes\nthe Spider leaderboard with 86.6% execution accuracy and sets a new bar. To\nexplore the potential of open-source LLM, we investigate them in various\nscenarios, and further enhance their performance with supervised fine-tuning.\nOur explorations highlight open-source LLMs' potential in Text-to-SQL, as well\nas the advantages and disadvantages of the supervised fine-tuning.\nAdditionally, towards an efficient and economic LLM-based Text-to-SQL solution,\nwe emphasize the token efficiency in prompt engineering and compare the prior\nstudies under this metric. 
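The MARSNet abstract above uses one encoder per remote-sensing modality and a shared decoder that fuses their features to regress dominant height. Below is a very small PyTorch sketch of that separate-encoders / shared-decoder pattern; the channel counts and layer sizes are illustrative and this is not the MARSNet architecture.

```python
# Sketch: per-modality encoders, feature concatenation, shared regression decoder.
import torch
import torch.nn as nn

class MultiModalHeightNet(nn.Module):
    def __init__(self, modality_channels=(2, 2, 10)):   # e.g. two SAR inputs + optical bands
        super().__init__()
        self.encoders = nn.ModuleList(
            nn.Sequential(nn.Conv2d(c, 16, 3, padding=1), nn.ReLU(),
                          nn.Conv2d(16, 16, 3, padding=1), nn.ReLU())
            for c in modality_channels
        )
        self.decoder = nn.Sequential(
            nn.Conv2d(16 * len(modality_channels), 32, 3, padding=1), nn.ReLU(),
            nn.Conv2d(32, 1, 1),                         # per-pixel height estimate
        )

    def forward(self, inputs):                           # inputs: list of (B, C_m, H, W) tensors
        feats = [enc(x) for enc, x in zip(self.encoders, inputs)]
        return self.decoder(torch.cat(feats, dim=1))
```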
We hope that our work provides a deeper\nunderstanding of Text-to-SQL with LLMs, and inspires further investigations and\nbroad applications.\n","authors":["Dawei Gao","Haibin Wang","Yaliang Li","Xiuyu Sun","Yichen Qian","Bolin Ding","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.15363v4.pdf","comment":"We have released code on https://github.com/BeachWang/DAIL-SQL"},{"id":"http://arxiv.org/abs/2311.11772v1","updated":"2023-11-20T13:58:26Z","published":"2023-11-20T13:58:26Z","title":"A Good Feature Extractor Is All You Need for Weakly Supervised Learning\n in Histopathology","summary":" Deep learning is revolutionising pathology, offering novel opportunities in\ndisease prognosis and personalised treatment. Historically, stain normalisation\nhas been a crucial preprocessing step in computational pathology pipelines, and\npersists into the deep learning era. Yet, with the emergence of feature\nextractors trained using self-supervised learning (SSL) on diverse pathology\ndatasets, we call this practice into question. In an empirical evaluation of\npublicly available feature extractors, we find that omitting stain\nnormalisation and image augmentations does not compromise downstream\nperformance, while incurring substantial savings in memory and compute.\nFurther, we show that the top-performing feature extractors are remarkably\nrobust to variations in stain and augmentations like rotation in their latent\nspace. Contrary to previous patch-level benchmarking studies, our approach\nemphasises clinical relevance by focusing on slide-level prediction tasks in a\nweakly supervised setting with external validation cohorts. This work\nrepresents the most comprehensive robustness evaluation of public pathology SSL\nfeature extractors to date, involving more than 6,000 training runs across nine\ntasks, five datasets, three downstream architectures, and various preprocessing\nsetups. Our findings stand to streamline digital pathology workflows by\nminimising preprocessing needs and informing the selection of feature\nextractors.\n","authors":["Georg Wölflein","Dyke Ferber","Asier Rabasco Meneghetti","Omar S. M. El Nahhas","Daniel Truhn","Zunamys I. Carrero","David J. Harrison","Ognjen Arandjelović","Jakob N. Kather"],"pdf_url":"https://arxiv.org/pdf/2311.11772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2204.09369v2","updated":"2023-11-20T13:55:08Z","published":"2022-04-20T10:18:39Z","title":"A Variational Autoencoder for Heterogeneous Temporal and Longitudinal\n Data","summary":" The variational autoencoder (VAE) is a popular deep latent variable model\nused to analyse high-dimensional datasets by learning a low-dimensional latent\nrepresentation of the data. It simultaneously learns a generative model and an\ninference network to perform approximate posterior inference. Recently proposed\nextensions to VAEs that can handle temporal and longitudinal data have\napplications in healthcare, behavioural modelling, and predictive maintenance.\nHowever, these extensions do not account for heterogeneous data (i.e., data\ncomprising of continuous and discrete attributes), which is common in many\nreal-life applications. In this work, we propose the heterogeneous longitudinal\nVAE (HL-VAE) that extends the existing temporal and longitudinal VAEs to\nheterogeneous data. HL-VAE provides efficient inference for high-dimensional\ndatasets and includes likelihood models for continuous, count, categorical, and\nordinal data while accounting for missing observations. 
We demonstrate our\nmodel's efficacy through simulated as well as clinical datasets, and show that\nour proposed model achieves competitive performance in missing value imputation\nand predictive accuracy.\n","authors":["Mine Öğretir","Siddharth Ramchandran","Dimitrios Papatheodorou","Harri Lähdesmäki"],"pdf_url":"https://arxiv.org/pdf/2204.09369v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09115v2","updated":"2023-11-20T13:55:04Z","published":"2023-11-15T17:06:26Z","title":"HEALNet -- Hybrid Multi-Modal Fusion for Heterogeneous Biomedical Data","summary":" Technological advances in medical data collection such as high-resolution\nhistopathology and high-throughput genomic sequencing have contributed to the\nrising requirement for multi-modal biomedical modelling, specifically for\nimage, tabular, and graph data. Most multi-modal deep learning approaches use\nmodality-specific architectures that are trained separately and cannot capture\nthe crucial cross-modal information that motivates the integration of different\ndata sources. This paper presents the Hybrid Early-fusion Attention Learning\nNetwork (HEALNet): a flexible multi-modal fusion architecture, which a)\npreserves modality-specific structural information, b) captures the cross-modal\ninteractions and structural information in a shared latent space, c) can\neffectively handle missing modalities during training and inference, and d)\nenables intuitive model inspection by learning on the raw data input instead of\nopaque embeddings. We conduct multi-modal survival analysis on Whole Slide\nImages and Multi-omic data on four cancer cohorts of The Cancer Genome Atlas\n(TCGA). HEALNet achieves state-of-the-art performance, substantially improving\nover both uni-modal and recent multi-modal baselines, whilst being robust in\nscenarios with missing modalities.\n","authors":["Konstantin Hemker","Nikola Simidjievski","Mateja Jamnik"],"pdf_url":"https://arxiv.org/pdf/2311.09115v2.pdf","comment":"7 pages body, 5 pages appendix"},{"id":"http://arxiv.org/abs/2209.07067v4","updated":"2023-11-20T13:49:25Z","published":"2022-09-15T05:56:36Z","title":"Efficient learning of nonlinear prediction models with time-series\n privileged information","summary":" In domains where sample sizes are limited, efficient learning algorithms are\ncritical. Learning using privileged information (LuPI) offers increased sample\nefficiency by allowing prediction models access to auxiliary information at\ntraining time which is unavailable when the models are used. In recent work, it\nwas shown that for prediction in linear-Gaussian dynamical systems, a LuPI\nlearner with access to intermediate time series data is never worse and often\nbetter in expectation than any unbiased classical learner. We provide new\ninsights into this analysis and generalize it to nonlinear prediction tasks in\nlatent dynamical systems, extending theoretical guarantees to the case where\nthe map connecting latent variables and observations is known up to a linear\ntransform. In addition, we propose algorithms based on random features and\nrepresentation learning for the case when this map is unknown. 
A suite of\nempirical results confirm theoretical findings and show the potential of using\nprivileged time-series information in nonlinear prediction.\n","authors":["Bastian Jung","Fredrik D Johansson"],"pdf_url":"https://arxiv.org/pdf/2209.07067v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2007.07606v2","updated":"2023-11-20T13:48:09Z","published":"2020-07-15T10:32:43Z","title":"timeXplain -- A Framework for Explaining the Predictions of Time Series\n Classifiers","summary":" Modern time series classifiers display impressive predictive capabilities,\nyet their decision-making processes mostly remain black boxes to the user. At\nthe same time, model-agnostic explainers, such as the recently proposed SHAP,\npromise to make the predictions of machine learning models interpretable,\nprovided there are well-designed domain mappings. We bring both worlds together\nin our timeXplain framework, extending the reach of explainable artificial\nintelligence to time series classification and value prediction. We present\nnovel domain mappings for the time domain, frequency domain, and time series\nstatistics and analyze their explicative power as well as their limits. We\nemploy a novel evaluation metric to experimentally compare timeXplain to\nseveral model-specific explanation approaches for state-of-the-art time series\nclassifiers.\n","authors":["Felix Mujkanovic","Vanja Doskoč","Martin Schirneck","Patrick Schäfer","Tobias Friedrich"],"pdf_url":"https://arxiv.org/pdf/2007.07606v2.pdf","comment":"9 pages; published code, added combined time slice and frequency band\n mapping, added quantitative evaluation and comparison to model-specific\n explainers"},{"id":"http://arxiv.org/abs/2306.08744v2","updated":"2023-11-20T13:42:20Z","published":"2023-06-14T21:01:35Z","title":"High-performance deep spiking neural networks with 0.3 spikes per neuron","summary":" Communication by rare, binary spikes is a key factor for the energy\nefficiency of biological brains. However, it is harder to train\nbiologically-inspired spiking neural networks (SNNs) than artificial neural\nnetworks (ANNs). This is puzzling given that theoretical results provide exact\nmapping algorithms from ANNs to SNNs with time-to-first-spike (TTFS) coding. In\nthis paper we analyze in theory and simulation the learning dynamics of\nTTFS-networks and identify a specific instance of the vanishing-or-exploding\ngradient problem. While two choices of SNN mappings solve this problem at\ninitialization, only the one with a constant slope of the neuron membrane\npotential at threshold guarantees the equivalence of the training trajectory\nbetween SNNs and ANNs with rectified linear units. We demonstrate that training\ndeep SNN models achieves the exact same performance as that of ANNs, surpassing\nprevious SNNs on image classification datasets such as MNIST/Fashion-MNIST,\nCIFAR10/CIFAR100 and PLACES365. Our SNN accomplishes high-performance\nclassification with less than 0.3 spikes per neuron, lending itself for an\nenergy-efficient implementation. 
We show that fine-tuning SNNs with our robust\ngradient descent algorithm enables their optimization for hardware\nimplementations with low latency and resilience to noise and quantization.\n","authors":["Ana Stanojevic","Stanisław Woźniak","Guillaume Bellec","Giovanni Cherubini","Angeliki Pantazi","Wulfram Gerstner"],"pdf_url":"https://arxiv.org/pdf/2306.08744v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11762v1","updated":"2023-11-20T13:40:40Z","published":"2023-11-20T13:40:40Z","title":"MUVO: A Multimodal Generative World Model for Autonomous Driving with\n Geometric Representations","summary":" Learning unsupervised world models for autonomous driving has the potential\nto improve the reasoning capabilities of today's systems dramatically. However,\nmost work neglects the physical attributes of the world and focuses on sensor\ndata alone. We propose MUVO, a MUltimodal World Model with Geometric VOxel\nRepresentations to address this challenge. We utilize raw camera and lidar data\nto learn a sensor-agnostic geometric representation of the world, which can\ndirectly be used by downstream tasks, such as planning. We demonstrate\nmultimodal future predictions and show that our geometric representation\nimproves the prediction quality of both camera images and lidar point clouds.\n","authors":["Daniel Bogdoll","Yitian Yang","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2311.11762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11759v1","updated":"2023-11-20T13:39:19Z","published":"2023-11-20T13:39:19Z","title":"Unveiling the Unseen Potential of Graph Learning through MLPs: Effective\n Graph Learners Using Propagation-Embracing MLPs","summary":" Recent studies attempted to utilize multilayer perceptrons (MLPs) to solve\nsemi-supervised node classification on graphs, by training a student MLP by\nknowledge distillation (KD) from a teacher graph neural network (GNN). While\nprevious studies have focused mostly on training the student MLP by matching\nthe output probability distributions between the teacher and student models\nduring KD, it has not been systematically studied how to inject the structural\ninformation in an explicit and interpretable manner. Inspired by GNNs that\nseparate feature transformation $T$ and propagation $\\Pi$, we re-frame the KD\nprocess as enabling the student MLP to explicitly learn both $T$ and $\\Pi$.\nAlthough this can be achieved by applying the inverse propagation $\\Pi^{-1}$\nbefore distillation from the teacher GNN, it still comes with a high\ncomputational cost from large matrix multiplications during training. To solve\nthis problem, we propose Propagate & Distill (P&D), which propagates the output\nof the teacher GNN before KD and can be interpreted as an approximate process\nof the inverse propagation $\\Pi^{-1}$. Through comprehensive evaluations using\nreal-world benchmark datasets, we demonstrate the effectiveness of P&D by\nshowing further performance boost of the student MLP.\n","authors":["Yong-Min Shin","Won-Yong Shin"],"pdf_url":"https://arxiv.org/pdf/2311.11759v1.pdf","comment":"35 pages, 5 figures, 8 tables"},{"id":"http://arxiv.org/abs/2311.10489v2","updated":"2023-11-20T13:37:58Z","published":"2023-11-17T12:41:07Z","title":"Handling Overlapping Asymmetric Datasets -- A Twice Penalized P-Spline\n Approach","summary":" Overlapping asymmetric datasets are common in data science and pose questions\nof how they can be incorporated together into a predictive analysis. 
In\nhealthcare datasets there is often a small amount of information that is\navailable for a larger number of patients such as an electronic health record,\nhowever a small number of patients may have had extensive further testing.\nCommon solutions such as missing imputation can often be unwise if the smaller\ncohort is significantly different in scale to the larger sample, therefore the\naim of this research is to develop a new method which can model the smaller\ncohort against a particular response, whilst considering the larger cohort\nalso. Motivated by non-parametric models, and specifically flexible smoothing\ntechniques via generalized additive models, we model a twice penalized P-Spline\napproximation method to firstly prevent over/under-fitting of the smaller\ncohort and secondly to consider the larger cohort. This second penalty is\ncreated through discrepancies in the marginal value of covariates that exist in\nboth the smaller and larger cohorts. Through data simulations, parameter\ntunings and model adaptations to consider a continuous and binary response, we\nfind our twice penalized approach offers an enhanced fit over a linear B-Spline\nand once penalized P-Spline approximation. Applying to a real-life dataset\nrelating to a person's risk of developing Non-Alcoholic Steatohepatitis, we see\nan improved model fit performance of over 65%. Areas for future work within\nthis space include adapting our method to not require dimensionality reduction\nand also consider parametric modelling methods. However, to our knowledge this\nis the first work to propose additional marginal penalties in a flexible\nregression of which we can report a vastly improved model fit that is able to\nconsider asymmetric datasets, without the need for missing data imputation.\n","authors":["Matthew McTeer","Robin Henderson","Quentin M Anstee","Paolo Missier"],"pdf_url":"https://arxiv.org/pdf/2311.10489v2.pdf","comment":"52 pages, 17 figures, 8 tables, 34 references"},{"id":"http://arxiv.org/abs/2309.10003v3","updated":"2023-11-20T13:31:47Z","published":"2023-09-17T16:50:07Z","title":"A novel approach to measuring patent claim scope based on probabilities\n obtained from (large) language models","summary":" This work proposes to measure the scope of a patent claim as the reciprocal\nof the self-information contained in this claim. A probability of occurrence of\nthe claim is obtained from a language model and this probability is used to\ncompute the self-information. Grounded in information theory, this approach is\nbased on the assumption that an unlikely concept is more informative than a\nusual concept, insofar as it is more surprising. In turn, the more surprising\nthe information required to defined the claim, the narrower its scope. Five\nlanguage models are considered, ranging from simplest models (each word or\ncharacter is assigned an identical probability) to intermediate models (using\naverage word or character frequencies), to a large language model (GPT2).\nInterestingly, the scope resulting from the simplest language models is\nproportional to the reciprocal of the number of words or characters involved in\nthe claim, a metric already used in previous works. Application is made to\nmultiple series of patent claims directed to distinct inventions, where each\nseries consists of claims devised to have a gradually decreasing scope. The\nperformance of the language models is assessed with respect to several ad hoc\ntests. The more sophisticated the model, the better the results. 
I.e., the GPT2\nprobability model outperforms models based on word and character frequencies,\nwhich themselves outdo the simplest models based on word or character counts.\nStill, the character count appears to be a more reliable indicator than the\nword count.\n","authors":["Sébastien Ragot"],"pdf_url":"https://arxiv.org/pdf/2309.10003v3.pdf","comment":"58 pages, 8 tables, 6 figures. Substantial changes made to version 2:\n New section 4.1 added (including a new table); Minor normalization issue\n corrected in values listed in Appendix B; Content of former appendix C now\n moved to Section 3; and new Appendix C added. Minor changes made to version 3\n (style, typos, language)"},{"id":"http://arxiv.org/abs/2311.11749v1","updated":"2023-11-20T13:21:10Z","published":"2023-11-20T13:21:10Z","title":"Revealing behavioral impact on mobility prediction networks through\n causal interventions","summary":" Deep neural networks are increasingly utilized in mobility prediction tasks,\nyet their intricate internal workings pose challenges for interpretability,\nespecially in comprehending how various aspects of mobility behavior affect\npredictions. In this study, we introduce a causal intervention framework to\nassess the impact of mobility-related factors on neural networks designed for\nnext location prediction -- a task focusing on predicting the immediate next\nlocation of an individual. To achieve this, we employ individual mobility\nmodels to generate synthetic location visit sequences and control behavior\ndynamics by intervening in their data generation process. We evaluate the\ninterventional location sequences using mobility metrics and input them into\nwell-trained networks to analyze performance variations. The results\ndemonstrate the effectiveness in producing location sequences with distinct\nmobility behaviors, thus facilitating the simulation of diverse spatial and\ntemporal changes. These changes result in performance fluctuations in next\nlocation prediction networks, revealing impacts of critical mobility behavior\nfactors, including sequential patterns in location transitions, proclivity for\nexploring new locations, and preferences in location choices at population and\nindividual levels. The gained insights hold significant value for the\nreal-world application of mobility prediction networks, and the framework is\nexpected to promote the use of causal inference for enhancing the\ninterpretability and robustness of neural networks in mobility applications.\n","authors":["Ye Hong","Yanan Xin","Simon Dirmeier","Fernando Perez-Cruz","Martin Raubal"],"pdf_url":"https://arxiv.org/pdf/2311.11749v1.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.18534v2","updated":"2023-11-20T13:11:01Z","published":"2023-10-27T23:18:44Z","title":"Multi Time Scale World Models","summary":" Intelligent agents use internal world models to reason and make predictions\nabout different courses of their actions at many scales. Devising learning\nparadigms and architectures that allow machines to learn world models that\noperate at multiple levels of temporal abstractions while dealing with complex\nuncertainty predictions is a major technical hurdle. In this work, we propose a\nprobabilistic formalism to learn multi-time scale world models which we call\nthe Multi Time Scale State Space (MTS3) model. 
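The patent-scope abstract above defines claim scope as the reciprocal of the claim's self-information under a language model, and notes that with the simplest model (identical probability per word) this reduces to the reciprocal of the word count. The sketch below computes exactly that simplest variant; the vocabulary size is an arbitrary placeholder.

```python
# Sketch: claim scope = 1 / self-information under a uniform per-word probability model.
import math

def claim_scope(claim: str, vocab_size: int = 50_000) -> float:
    words = claim.split()
    p_word = 1.0 / vocab_size                                   # identical probability per word
    self_information = -len(words) * math.log2(p_word)          # bits; grows with word count
    return 1.0 / self_information if self_information > 0 else float("inf")

# Broader (shorter, more generic) claims score higher; swapping in word-frequency
# or GPT-2 probabilities refines the same measure.
```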
Our model uses a computationally\nefficient inference scheme on multiple time scales for highly accurate\nlong-horizon predictions and uncertainty estimates over several seconds into\nthe future. Our experiments, which focus on action conditional long horizon\nfuture predictions, show that MTS3 outperforms recent methods on several system\nidentification benchmarks including complex simulated and real-world dynamical\nsystems.\n","authors":["Vaisakh Shaj","Saleh Gholam Zadeh","Ozan Demir","Luiz Ricardo Douat","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2310.18534v2.pdf","comment":"Accepted as spotlight at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.01310v2","updated":"2023-11-20T13:08:27Z","published":"2023-11-02T15:24:23Z","title":"Scattering Vision Transformer: Spectral Mixing Matters","summary":" Vision transformers have gained significant attention and achieved\nstate-of-the-art performance in various computer vision tasks, including image\nclassification, instance segmentation, and object detection. However,\nchallenges remain in addressing attention complexity and effectively capturing\nfine-grained information within images. Existing solutions often resort to\ndown-sampling operations, such as pooling, to reduce computational cost.\nUnfortunately, such operations are non-invertible and can result in information\nloss. In this paper, we present a novel approach called Scattering Vision\nTransformer (SVT) to tackle these challenges. SVT incorporates a spectrally\nscattering network that enables the capture of intricate image details. SVT\novercomes the invertibility issue associated with down-sampling operations by\nseparating low-frequency and high-frequency components. Furthermore, SVT\nintroduces a unique spectral gating network utilizing Einstein multiplication\nfor token and channel mixing, effectively reducing complexity. We show that SVT\nachieves state-of-the-art performance on the ImageNet dataset with a\nsignificant reduction in a number of parameters and FLOPS. SVT shows 2\\%\nimprovement over LiTv2 and iFormer. SVT-H-S reaches 84.2\\% top-1 accuracy,\nwhile SVT-H-B reaches 85.2\\% (state-of-art for base versions) and SVT-H-L\nreaches 85.7\\% (again state-of-art for large versions). SVT also shows\ncomparable results in other vision tasks such as instance segmentation. SVT\nalso outperforms other transformers in transfer learning on standard datasets\nsuch as CIFAR10, CIFAR100, Oxford Flower, and Stanford Car datasets. The\nproject page is available on this\nwebpage.\\url{https://badripatro.github.io/svt/}.\n","authors":["Badri N. Patro","Vijay Srinivas Agneeswaran"],"pdf_url":"https://arxiv.org/pdf/2311.01310v2.pdf","comment":"Accepted @NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.11723v1","updated":"2023-11-20T12:40:25Z","published":"2023-11-20T12:40:25Z","title":"Leveraging Uncertainty Estimates To Improve Classifier Performance","summary":" Binary classification involves predicting the label of an instance based on\nwhether the model score for the positive class exceeds a threshold chosen based\non the application requirements (e.g., maximizing recall for a precision\nbound). However, model scores are often not aligned with the true positivity\nrate. This is especially true when the training involves a differential\nsampling across classes or there is distributional drift between train and test\nsettings. 
In this paper, we provide theoretical analysis and empirical evidence\nof the dependence of model score estimation bias on both uncertainty and score\nitself. Further, we formulate the decision boundary selection in terms of both\nmodel score and uncertainty, prove that it is NP-hard, and present algorithms\nbased on dynamic programming and isotonic regression. Evaluation of the\nproposed algorithms on three real-world datasets yield 25%-40% gain in recall\nat high precision bounds over the traditional approach of using model score\nalone, highlighting the benefits of leveraging uncertainty.\n","authors":["Gundeep Arora","Srujana Merugu","Anoop Saladi","Rajeev Rastogi"],"pdf_url":"https://arxiv.org/pdf/2311.11723v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12253v2","updated":"2023-11-20T12:38:42Z","published":"2023-09-21T16:57:09Z","title":"SALSA-CLRS: A Sparse and Scalable Benchmark for Algorithmic Reasoning","summary":" We introduce an extension to the CLRS algorithmic learning benchmark,\nprioritizing scalability and the utilization of sparse representations. Many\nalgorithms in CLRS require global memory or information exchange, mirrored in\nits execution model, which constructs fully connected (not sparse) graphs based\non the underlying problem. Despite CLRS's aim of assessing how effectively\nlearned algorithms can generalize to larger instances, the existing execution\nmodel becomes a significant constraint due to its demanding memory requirements\nand runtime (hard to scale). However, many important algorithms do not demand a\nfully connected graph; these algorithms, primarily distributed in nature, align\nclosely with the message-passing paradigm employed by Graph Neural Networks.\nHence, we propose SALSA-CLRS, an extension of the current CLRS benchmark\nspecifically with scalability and sparseness in mind. Our approach includes\nadapted algorithms from the original CLRS benchmark and introduces new problems\nfrom distributed and randomized algorithms. Moreover, we perform a thorough\nempirical evaluation of our benchmark. Code is publicly available at\nhttps://github.com/jkminder/SALSA-CLRS.\n","authors":["Julian Minder","Florian Grötschla","Joël Mathys","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2309.12253v2.pdf","comment":"(Extended Abstract) Presented at the Second Learning on Graphs\n Conference (LoG 2023)"},{"id":"http://arxiv.org/abs/2310.18936v2","updated":"2023-11-20T12:35:55Z","published":"2023-10-29T08:50:27Z","title":"Adversarial Examples Are Not Real Features","summary":" The existence of adversarial examples has been a mystery for years and\nattracted much interest. A well-known theory by \\citet{ilyas2019adversarial}\nexplains adversarial vulnerability from a data perspective by showing that one\ncan extract non-robust features from adversarial examples and these features\nalone are useful for classification. However, the explanation remains quite\ncounter-intuitive since non-robust features are mostly noise features to\nhumans. In this paper, we re-examine the theory from a larger context by\nincorporating multiple learning paradigms. Notably, we find that contrary to\ntheir good usefulness under supervised learning, non-robust features attain\npoor usefulness when transferred to other self-supervised learning paradigms,\nsuch as contrastive learning, masked image modeling, and diffusion models. 
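The uncertainty-aware thresholding abstract above selects decision boundaries over both model score and uncertainty, with algorithms based on dynamic programming and isotonic regression. The sketch below is a simplified stand-in for that idea, not the paper's algorithms: it buckets examples by uncertainty, calibrates scores per bucket with isotonic regression, and picks the lowest per-bucket threshold that meets a precision bound on held-out data.

```python
# Sketch: per-uncertainty-bucket calibration and precision-constrained thresholds.
import numpy as np
from sklearn.isotonic import IsotonicRegression

def per_bucket_thresholds(scores, uncertainty, labels, n_buckets=5, min_precision=0.9):
    edges = np.quantile(uncertainty, np.linspace(0, 1, n_buckets + 1))
    buckets = np.clip(np.searchsorted(edges, uncertainty, side="right") - 1, 0, n_buckets - 1)
    thresholds = {}
    for b in range(n_buckets):
        m = buckets == b
        if m.sum() < 2 or labels[m].min() == labels[m].max():
            thresholds[b] = 0.5                      # degenerate bucket: fall back to default
            continue
        iso = IsotonicRegression(out_of_bounds="clip").fit(scores[m], labels[m])
        cal = iso.predict(scores[m])                 # calibrated positivity estimates
        best = 1.0
        for t in np.unique(cal):
            pred = cal >= t
            if pred.any() and labels[m][pred].mean() >= min_precision:
                best = min(best, t)                  # lowest threshold meeting the precision bound
        thresholds[b] = best
    return edges, thresholds
```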
It\nreveals that non-robust features are not really as useful as robust or natural\nfeatures that enjoy good transferability between these paradigms. Meanwhile,\nfor robustness, we also show that naturally trained encoders from robust\nfeatures are largely non-robust under AutoAttack. Our cross-paradigm\nexamination suggests that the non-robust features are not really useful but\nmore like paradigm-wise shortcuts, and robust features alone might be\ninsufficient to attain reliable model robustness. Code is available at\n\\url{https://github.com/PKU-ML/AdvNotRealFeatures}.\n","authors":["Ang Li","Yifei Wang","Yiwen Guo","Yisen Wang"],"pdf_url":"https://arxiv.org/pdf/2310.18936v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.11717v1","updated":"2023-11-20T12:35:11Z","published":"2023-11-20T12:35:11Z","title":"Can we infer the presence of Differential Privacy in Deep Learning\n models' weights? Towards more secure Deep Learning","summary":" Differential Privacy (DP) is a key property to protect data and models from\nintegrity attacks. In the Deep Learning (DL) field, it is commonly implemented\nthrough the Differentially Private Stochastic Gradient Descent (DP-SGD).\nHowever, when a model is shared or released, there is no way to check whether\nit is differentially private, that is, it required to trust the model provider.\nThis situation poses a problem when data privacy is mandatory, specially with\ncurrent data regulations, as the presence of DP can not be certificated\nconsistently by any third party. Thus, we face the challenge of determining\nwhether a DL model has been trained with DP, according to the title question:\nCan we infer the presence of Differential Privacy in Deep Learning models'\nweights? Since the DP-SGD significantly changes the training process of a DL\nmodel, we hypothesize that DP leaves an imprint in the weights of a DL model,\nwhich can be used to predict whether a model has been trained with DP\nregardless of its architecture and the training dataset. In this paper, we\npropose to employ the imprint in model weights of using DP to infer the\npresence of DP training in a DL model. To substantiate our hypothesis, we\ndeveloped an experimental methodology based on two datasets of weights of DL\nmodels, each with models with and without DP training and a meta-classifier to\ninfer whether DP was used in the training process of a DL model, by accessing\nits weights. We accomplish both, the removal of the requirement of a trusted\nmodel provider and a strong foundation for this interesting line of research.\nThus, our contribution is an additional layer of security on top of the strict\nprivate requirements of DP training in DL models, towards to DL models.\n","authors":[" Jiménez-López"," Daniel"," Rodríguez-Barroso"," Nuria"," Luzón","M. Victoria"," Herrera"," Francisco"],"pdf_url":"https://arxiv.org/pdf/2311.11717v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09191v3","updated":"2023-11-20T12:05:15Z","published":"2023-07-17T13:17:26Z","title":"A benchmark of categorical encoders for binary classification","summary":" Categorical encoders transform categorical features into numerical\nrepresentations that are indispensable for a wide range of machine learning\nmodels. Existing encoder benchmark studies lack generalizability because of\ntheir limited choice of (1) encoders, (2) experimental factors, and (3)\ndatasets. Additionally, inconsistencies arise from the adoption of varying\naggregation strategies. 
This paper is the most comprehensive benchmark of\ncategorical encoders to date, including an extensive evaluation of 32\nconfigurations of encoders from diverse families, with 36 combinations of\nexperimental factors, and on 50 datasets. The study shows the profound\ninfluence of dataset selection, experimental factors, and aggregation\nstrategies on the benchmark's conclusions -- aspects disregarded in previous\nencoder benchmarks.\n","authors":["Federico Matteucci","Vadim Arzamasov","Klemens Boehm"],"pdf_url":"https://arxiv.org/pdf/2307.09191v3.pdf","comment":"To be published in the 37th Conference on Neural Information\n Processing Systems (NeurIPS 2023) Track on Datasets and Benchmarks"},{"id":"http://arxiv.org/abs/2311.11696v1","updated":"2023-11-20T11:56:25Z","published":"2023-11-20T11:56:25Z","title":"Sparse Low-rank Adaptation of Pre-trained Language Models","summary":" Fine-tuning pre-trained large language models in a parameter-efficient manner\nis widely studied for its effectiveness and efficiency. The popular method of\nlow-rank adaptation (LoRA) offers a notable approach, hypothesizing that the\nadaptation process is intrinsically low-dimensional. Although LoRA has\ndemonstrated commendable performance, it is implemented with a fixed and\nunalterable intrinsic rank that might not always be the ideal choice.\nRecognizing the need for more flexible adaptation, we extend the methodology of\nLoRA to an innovative approach we call sparse low-rank adaptation (SoRA) that\nenables dynamic adjustments to the intrinsic rank during the adaptation\nprocess. We achieve this through the incorporation of a gate unit optimized\nwith proximal gradient method in the training stage, controlling the\ncardinality of rank under the sparsity of the gate. In the subsequent inference\nstage, we eliminate the parameter blocks corresponding to the zeroed-out ranks,\nto reduce each SoRA module back to a concise yet rank-optimal LoRA. Our\napproach strengthens the representation power of LoRA by initializing it with a\nhigher rank, while efficiently taming a temporarily increased number of\nparameters via updating in a sparse way. We further introduce a sparsifying\nscheduler for SoRA, aiming to examine the impact of the number of non-zero\nparameters on the model's memorization and generalization. Our experimental\nresults demonstrate that SoRA can outperform other baselines even with 70%\nretained parameters and 70% training time.\n","authors":["Ning Ding","Xingtai Lv","Qiaosen Wang","Yulin Chen","Bowen Zhou","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2311.11696v1.pdf","comment":"Accepted to EMNLP 2023 (Main Conference)"},{"id":"http://arxiv.org/abs/2205.04712v3","updated":"2023-11-20T11:54:28Z","published":"2022-05-10T07:25:32Z","title":"Knowledge Augmented Machine Learning with Applications in Autonomous\n Driving: A Survey","summary":" The availability of representative datasets is an essential prerequisite for\nmany successful artificial intelligence and machine learning models. However,\nin real life applications these models often encounter scenarios that are\ninadequately represented in the data used for training. There are various\nreasons for the absence of sufficient data, ranging from time and cost\nconstraints to ethical considerations. As a consequence, the reliable usage of\nthese models, especially in safety-critical applications, is still a tremendous\nchallenge. 
Leveraging additional, already existing sources of knowledge is key\nto overcome the limitations of purely data-driven approaches. Knowledge\naugmented machine learning approaches offer the possibility of compensating for\ndeficiencies, errors, or ambiguities in the data, thus increasing the\ngeneralization capability of the applied models. Even more, predictions that\nconform with knowledge are crucial for making trustworthy and safe decisions\neven in underrepresented scenarios. This work provides an overview of existing\ntechniques and methods in the literature that combine data-driven models with\nexisting knowledge. The identified approaches are structured according to the\ncategories knowledge integration, extraction and conformity. In particular, we\naddress the application of the presented methods in the field of autonomous\ndriving.\n","authors":["Julian Wörmann","Daniel Bogdoll","Christian Brunner","Etienne Bührle","Han Chen","Evaristus Fuh Chuo","Kostadin Cvejoski","Ludger van Elst","Philip Gottschall","Stefan Griesche","Christian Hellert","Christian Hesels","Sebastian Houben","Tim Joseph","Niklas Keil","Johann Kelsch","Mert Keser","Hendrik Königshof","Erwin Kraft","Leonie Kreuser","Kevin Krone","Tobias Latka","Denny Mattern","Stefan Matthes","Franz Motzkus","Mohsin Munir","Moritz Nekolla","Adrian Paschke","Stefan Pilar von Pilchau","Maximilian Alexander Pintz","Tianming Qiu","Faraz Qureishi","Syed Tahseen Raza Rizvi","Jörg Reichardt","Laura von Rueden","Alexander Sagel","Diogo Sasdelli","Tobias Scholl","Gerhard Schunk","Gesina Schwalbe","Hao Shen","Youssef Shoeb","Hendrik Stapelbroek","Vera Stehr","Gurucharan Srinivas","Anh Tuan Tran","Abhishek Vivekanandan","Ya Wang","Florian Wasserrab","Tino Werner","Christian Wirth","Stefan Zwicklbauer"],"pdf_url":"https://arxiv.org/pdf/2205.04712v3.pdf","comment":"111 pages, Added section on Run-time Network Verification"},{"id":"http://arxiv.org/abs/2311.11694v1","updated":"2023-11-20T11:48:50Z","published":"2023-11-20T11:48:50Z","title":"Unveiling the Power of Self-Attention for Shipping Cost Prediction: The\n Rate Card Transformer","summary":" Amazon ships billions of packages to its customers annually within the United\nStates. Shipping cost of these packages are used on the day of shipping (day 0)\nto estimate profitability of sales. Downstream systems utilize these days 0\nprofitability estimates to make financial decisions, such as pricing strategies\nand delisting loss-making products. However, obtaining accurate shipping cost\nestimates on day 0 is complex for reasons like delay in carrier invoicing or\nfixed cost components getting recorded at monthly cadence. Inaccurate shipping\ncost estimates can lead to bad decision, such as pricing items too low or high,\nor promoting the wrong product to the customers. Current solutions for\nestimating shipping costs on day 0 rely on tree-based models that require\nextensive manual engineering efforts. In this study, we propose a novel\narchitecture called the Rate Card Transformer (RCT) that uses self-attention to\nencode all package shipping information such as package attributes, carrier\ninformation and route plan. Unlike other transformer-based tabular models, RCT\nhas the ability to encode a variable list of one-to-many relations of a\nshipment, allowing it to capture more information about a shipment. For\nexample, RCT can encode properties of all products in a package. 
Our results\ndemonstrate that cost predictions made by the RCT have 28.82% less error\ncompared to tree-based GBDT model. Moreover, the RCT outperforms the\nstate-of-the-art transformer-based tabular model, FTTransformer, by 6.08%. We\nalso illustrate that the RCT learns a generalized manifold of the rate card\nthat can improve the performance of tree-based models.\n","authors":["P Aditya Sreekar","Sahil Verma","Varun Madhavan","Abhishek Persad"],"pdf_url":"https://arxiv.org/pdf/2311.11694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10348v2","updated":"2023-11-20T11:31:16Z","published":"2023-10-16T12:34:43Z","title":"Attribution Patching Outperforms Automated Circuit Discovery","summary":" Automated interpretability research has recently attracted attention as a\npotential research direction that could scale explanations of neural network\nbehavior to large models. Existing automated circuit discovery work applies\nactivation patching to identify subnetworks responsible for solving specific\ntasks (circuits). In this work, we show that a simple method based on\nattribution patching outperforms all existing methods while requiring just two\nforward passes and a backward pass. We apply a linear approximation to\nactivation patching to estimate the importance of each edge in the\ncomputational subgraph. Using this approximation, we prune the least important\nedges of the network. We survey the performance and limitations of this method,\nfinding that averaged over all tasks our method has greater AUC from circuit\nrecovery than other methods.\n","authors":["Aaquib Syed","Can Rager","Arthur Conmy"],"pdf_url":"https://arxiv.org/pdf/2310.10348v2.pdf","comment":"6 main paper pages, 6 additional pages. NeurIPS 2023 ATTRIB Workshop"},{"id":"http://arxiv.org/abs/2311.03197v2","updated":"2023-11-20T11:18:17Z","published":"2023-11-06T15:39:05Z","title":"Stable Linear Subspace Identification: A Machine Learning Approach","summary":" Machine Learning (ML) and linear System Identification (SI) have been\nhistorically developed independently. In this paper, we leverage\nwell-established ML tools - especially the automatic differentiation framework\n- to introduce SIMBa, a family of discrete linear multi-step-ahead state-space\nSI methods using backpropagation. SIMBa relies on a novel\nLinear-Matrix-Inequality-based free parametrization of Schur matrices to ensure\nthe stability of the identified model.\n We show how SIMBa generally outperforms traditional linear state-space SI\nmethods, and sometimes significantly, although at the price of a higher\ncomputational burden. This performance gap is particularly remarkable compared\nto other SI methods with stability guarantees, where the gain is frequently\nabove 25% in our investigations, hinting at SIMBa's ability to simultaneously\nachieve state-of-the-art fitting performance and enforce stability.\nInterestingly, these observations hold for a wide variety of input-output\nsystems and on both simulated and real-world data, showcasing the flexibility\nof the proposed approach. We postulate that this new SI paradigm presents a\ngreat extension potential to identify structured nonlinear models from data,\nand we hence open-source SIMBa on https://github.com/Cemempamoi/simba.\n","authors":["Loris Di Natale","Muhammad Zakwan","Bratislav Svetozarevic","Philipp Heer","Giancarlo Ferrari Trecate","Colin N. 
Jones"],"pdf_url":"https://arxiv.org/pdf/2311.03197v2.pdf","comment":"Submitted to ECC 2024"},{"id":"http://arxiv.org/abs/2307.13831v3","updated":"2023-11-20T10:57:01Z","published":"2023-07-25T21:59:17Z","title":"Relationship between Batch Size and Number of Steps Needed for Nonconvex\n Optimization of Stochastic Gradient Descent using Armijo Line Search","summary":" Stochastic gradient descent (SGD) is the simplest deep learning optimizer\nwith which to train deep neural networks. While SGD can use various learning\nrates, such as constant or diminishing rates, the previous numerical results\nshowed that SGD performs better than other deep learning optimizers using when\nit uses learning rates given by line search methods. In this paper, we perform\na convergence analysis on SGD with a learning rate given by an Armijo line\nsearch for nonconvex optimization. The analysis indicates that the upper bound\nof the expectation of the squared norm of the full gradient becomes small when\nthe number of steps and the batch size are large. Next, we show that, for SGD\nwith the Armijo-line-search learning rate, the number of steps needed for\nnonconvex optimization is a monotone decreasing convex function of the batch\nsize; that is, the number of steps needed for nonconvex optimization decreases\nas the batch size increases. Furthermore, we show that the stochastic\nfirst-order oracle (SFO) complexity, which is the stochastic gradient\ncomputation cost, is a convex function of the batch size; that is, there exists\na critical batch size that minimizes the SFO complexity. Finally, we provide\nnumerical results that support our theoretical results. The numerical results\nindicate that the number of steps needed for training deep neural networks\ndecreases as the batch size increases and that there exist the critical batch\nsizes that can be estimated from the theoretical results.\n","authors":["Yuki Tsukada","Hideaki Iiduka"],"pdf_url":"https://arxiv.org/pdf/2307.13831v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11644v1","updated":"2023-11-20T10:22:38Z","published":"2023-11-20T10:22:38Z","title":"Unraveling the Control Engineer's Craft with Neural Networks","summary":" Many industrial processes require suitable controllers to meet their\nperformance requirements. More often, a sophisticated digital twin is\navailable, which is a highly complex model that is a virtual representation of\na given physical process, whose parameters may not be properly tuned to capture\nthe variations in the physical process. In this paper, we present a sim2real,\ndirect data-driven controller tuning approach, where the digital twin is used\nto generate input-output data and suitable controllers for several\nperturbations in its parameters. State-of-the art neural-network architectures\nare then used to learn the controller tuning rule that maps input-output data\nonto the controller parameters, based on artificially generated data from\nperturbed versions of the digital twin. In this way, as far as we are aware, we\ntackle for the first time the problem of re-calibrating the controller by\nmeta-learning the tuning rule directly from data, thus practically replacing\nthe control engineer with a machine learning model. The benefits of this\nmethodology are illustrated via numerical simulations for several choices of\nneural-network architectures.\n","authors":["Braghadeesh Lakshminarayanan","Federico Dettù","Cristian R. 
Rojas","Simone Formentin"],"pdf_url":"https://arxiv.org/pdf/2311.11644v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2308.12634v2","updated":"2023-11-20T10:06:03Z","published":"2023-08-24T08:19:15Z","title":"Towards Hierarchical Regional Transformer-based Multiple Instance\n Learning","summary":" The classification of gigapixel histopathology images with deep multiple\ninstance learning models has become a critical task in digital pathology and\nprecision medicine. In this work, we propose a Transformer-based multiple\ninstance learning approach that replaces the traditional learned attention\nmechanism with a regional, Vision Transformer inspired self-attention\nmechanism. We present a method that fuses regional patch information to derive\nslide-level predictions and show how this regional aggregation can be stacked\nto hierarchically process features on different distance levels. To increase\npredictive accuracy, especially for datasets with small, local morphological\nfeatures, we introduce a method to focus the image processing on high attention\nregions during inference. Our approach is able to significantly improve\nperformance over the baseline on two histopathology datasets and points towards\npromising directions for further research.\n","authors":["Josef Cersovsky","Sadegh Mohammadi","Dagmar Kainmueller","Johannes Hoehne"],"pdf_url":"https://arxiv.org/pdf/2308.12634v2.pdf","comment":"8 pages, LaTeX; header update after published, fixed typos"},{"id":"http://arxiv.org/abs/2311.11629v1","updated":"2023-11-20T09:28:04Z","published":"2023-11-20T09:28:04Z","title":"Generating Realistic Counterfactuals for Retinal Fundus and OCT Images\n using Diffusion Models","summary":" Counterfactual reasoning is often used in a clinical setting to explain\ndecisions or weigh alternatives. Therefore, for imaging based modalities such\nas ophthalmology, it would be beneficial to be able to create counterfactual\nimages, illustrating the answer to the question: \"If the subject had had\ndiabetic retinopathy, how would the fundus image have looked?\" Here, we\ndemonstrate that using a diffusion model in combination with an adversarially\nrobust classifier trained on retinal disease classification tasks enables\ngeneration of highly realistic counterfactuals of retinal fundus images and\noptical coherence tomorgraphy (OCT) B-scans. Ideally, these classifiers encode\nthe salient features indicative for each disease class and can steer the\ndiffusion model to show realistic disease signs or remove disease-related\nlesions in a realistic way. Importantly, in a user study, domain experts found\nthe counterfactuals generated using our method significantly more realistic\nthan counterfactuals generated from a previous method, and even\nindistiguishable from realistic images.\n","authors":["Indu Ilanchezian","Valentyn Boreiko","Laura Kühlewein","Ziwei Huang","Murat Seçkin Ayhan","Matthias Hein","Lisa Koch","Philipp Berens"],"pdf_url":"https://arxiv.org/pdf/2311.11629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11628v1","updated":"2023-11-20T09:27:09Z","published":"2023-11-20T09:27:09Z","title":"Incorporating LLM Priors into Tabular Learners","summary":" We present a method to integrate Large Language Models (LLMs) and traditional\ntabular data classification techniques, addressing LLMs challenges like data\nserialization sensitivity and biases. 
We introduce two strategies utilizing\nLLMs for ranking categorical variables and generating priors on correlations\nbetween continuous variables and targets, enhancing performance in few-shot\nscenarios. We focus on Logistic Regression, introducing MonotonicLR that\nemploys a non-linear monotonic function for mapping ordinals to cardinals while\npreserving LLM-determined orders. Validation against baseline models reveals\nthe superior performance of our approach, especially in low-data scenarios,\nwhile remaining interpretable.\n","authors":["Max Zhu","Siniša Stanivuk","Andrija Petrovic","Mladen Nikolic","Pietro Lio"],"pdf_url":"https://arxiv.org/pdf/2311.11628v1.pdf","comment":"Table Representation Learning Workshop at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.11626v1","updated":"2023-11-20T09:20:26Z","published":"2023-11-20T09:20:26Z","title":"A novel transformer-based approach for soil temperature prediction","summary":" Soil temperature is one of the most significant parameters that plays a\ncrucial role in glacier energy, dynamics of mass balance, processes of surface\nhydrological, coaction of glacier-atmosphere, nutrient cycling, ecological\nstability, the management of soil, water, and field crop. In this work, we\nintroduce a novel approach using transformer models for the purpose of\nforecasting soil temperature prediction. To the best of our knowledge, the\nusage of transformer models in this work is the very first attempt to predict\nsoil temperature. Experiments are carried out using six different FLUXNET\nstations by modeling them with five different transformer models, namely,\nVanilla Transformer, Informer, Autoformer, Reformer, and ETSformer. To\ndemonstrate the effectiveness of the proposed model, experiment results are\ncompared with both deep learning approaches and literature studies. Experiment\nresults show that the utilization of transformer models ensures a significant\ncontribution to the literature, thence determining the new state-of-the-art.\n","authors":["Muhammet Mucahit Enes Yurtsever","Ayhan Kucukmanisa","Zeynep Hilal Kilimci"],"pdf_url":"https://arxiv.org/pdf/2311.11626v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2011.00789v2","updated":"2023-11-20T08:48:07Z","published":"2020-11-02T07:37:31Z","title":"Role Taxonomy of Units in Deep Neural Networks","summary":" Identifying the role of network units in deep neural networks (DNNs) is\ncritical in many aspects including giving understandings on the mechanisms of\nDNNs and building basic connections between deep learning and neuroscience.\nHowever, there remains unclear on which roles the units in DNNs with different\ngeneralization ability could present. To this end, we give role taxonomy of\nunits in DNNs via introducing the retrieval-of-function test, where units are\ncategorized into four types in terms of their functional preference on\nseparately the training set and testing set. We show that ratios of the four\ncategories are highly associated with the generalization ability of DNNs from\ntwo distinct perspectives, based on which we give signs of DNNs with well\ngeneralization.\n","authors":["Yang Zhao","Hao Zhang","Xiuyuan Hu"],"pdf_url":"https://arxiv.org/pdf/2011.00789v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18415v3","updated":"2023-11-20T08:31:51Z","published":"2023-05-28T18:48:50Z","title":"Geometric Algebra Transformer","summary":" Problems involving geometric data arise in physics, chemistry, robotics,\ncomputer vision, and many other fields. 
Such data can take numerous forms, for\ninstance points, direction vectors, translations, or rotations, but to date\nthere is no single architecture that can be applied to such a wide variety of\ngeometric types while respecting their symmetries. In this paper we introduce\nthe Geometric Algebra Transformer (GATr), a general-purpose architecture for\ngeometric data. GATr represents inputs, outputs, and hidden states in the\nprojective geometric (or Clifford) algebra, which offers an efficient\n16-dimensional vector-space representation of common geometric objects as well\nas operators acting on them. GATr is equivariant with respect to E(3), the\nsymmetry group of 3D Euclidean space. As a Transformer, GATr is versatile,\nefficient, and scalable. We demonstrate GATr in problems from n-body modeling\nto wall-shear-stress estimation on large arterial meshes to robotic motion\nplanning. GATr consistently outperforms both non-geometric and equivariant\nbaselines in terms of error, data efficiency, and scalability.\n","authors":["Johann Brehmer","Pim de Haan","Sönke Behrends","Taco Cohen"],"pdf_url":"https://arxiv.org/pdf/2305.18415v3.pdf","comment":"Published at NeurIPS 2023, implementation available at\n https://github.com/qualcomm-ai-research/geometric-algebra-transformer . v3:\n matches camera-ready version"},{"id":"http://arxiv.org/abs/2302.06335v2","updated":"2023-11-20T08:03:40Z","published":"2023-02-13T13:12:55Z","title":"Online Arbitrary Shaped Clustering through Correlated Gaussian Functions","summary":" There is no convincing evidence that backpropagation is a biologically\nplausible mechanism, and further studies of alternative learning methods are\nneeded. A novel online clustering algorithm is presented that can produce\narbitrary shaped clusters from inputs in an unsupervised manner, and requires\nno prior knowledge of the number of clusters in the input data. This is\nachieved by finding correlated outputs from functions that capture commonly\noccurring input patterns. The algorithm can be deemed more biologically\nplausible than model optimization through backpropagation, although practical\napplicability may require additional research. However, the method yields\nsatisfactory results on several toy datasets on a noteworthy range of\nhyperparameters.\n","authors":["Ole Christian Eidheim"],"pdf_url":"https://arxiv.org/pdf/2302.06335v2.pdf","comment":"Corrected uniform distribution range; removed \"average\" from last\n sentence in section 4"},{"id":"http://arxiv.org/abs/2301.01333v2","updated":"2023-11-20T07:49:11Z","published":"2023-01-03T19:52:17Z","title":"oneDNN Graph Compiler: A Hybrid Approach for High-Performance Deep\n Learning Compilation","summary":" With the rapid development of deep learning models and hardware support for\ndense computing, the deep learning workload characteristics changed\nsignificantly from a few hot spots on compute-intensive operations to a broad\nrange of operations scattered across the models. Accelerating a few\ncompute-intensive operations using the expert-tuned implementation of\nprimitives does not fully exploit the performance potential of AI hardware.\nVarious efforts have been made to compile a full deep neural network (DNN)\ngraph. 
One of the biggest challenges is to achieve high-performance tensor\ncompilation by generating expert level performance code for the dense\ncompute-intensive operations and applying compilation optimization at the scope\nof DNN computation graph across multiple compute-intensive operations.\n We present oneDNN Graph Compiler, a tensor compiler that employs a hybrid\napproach of using techniques from both compiler optimization and expert-tuned\nkernels for high performance code generation of the deep neural network graph.\noneDNN Graph Compiler addresses unique optimization challenges in the deep\nlearning domain, such as low-precision computation, aggressive fusion of graph\noperations, optimization for static tensor shapes and memory layout, constant\nweight optimization, and memory buffer reuse. Experimental results demonstrate\nsignificant performance gains over existing tensor compiler and primitives\nlibrary for performance-critical DNN computation graphs and end-to-end models\non Intel Xeon Scalable Processors.\n","authors":["Jianhui Li","Zhennan Qin","Yijie Mei","Jingze Cui","Yunfei Song","Ciyong Chen","Yifei Zhang","Longsheng Du","Xianhang Cheng","Baihui Jin","Yan Zhang","Igor Safonov","Jason Ye","Eric Lin","Dan Lavery"],"pdf_url":"https://arxiv.org/pdf/2301.01333v2.pdf","comment":"10 pages excluding reference, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2311.11575v1","updated":"2023-11-20T07:19:52Z","published":"2023-11-20T07:19:52Z","title":"Testing multivariate normality by testing independence","summary":" We propose a simple multivariate normality test based on Kac-Bernstein's\ncharacterization, which can be conducted by utilising existing statistical\nindependence tests for sums and differences of data samples. We also perform\nits empirical investigation, which reveals that for high-dimensional data, the\nproposed approach may be more efficient than the alternative ones. The\naccompanying code repository is provided at \\url{https://shorturl.at/rtuy5}.\n","authors":["Povilas Daniušis"],"pdf_url":"https://arxiv.org/pdf/2311.11575v1.pdf","comment":"6 pages, 1 figure"},{"id":"http://arxiv.org/abs/2311.11558v1","updated":"2023-11-20T06:35:23Z","published":"2023-11-20T06:35:23Z","title":"A Deep-Genetic Algorithm (Deep-GA) Approach for High-Dimensional\n Nonlinear Parabolic Partial Differential Equations","summary":" We propose a new method, called a deep-genetic algorithm (deep-GA), to\naccelerate the performance of the so-called deep-BSDE method, which is a deep\nlearning algorithm to solve high dimensional partial differential equations\nthrough their corresponding backward stochastic differential equations (BSDEs).\nRecognizing the sensitivity of the solver to the initial guess selection, we\nembed a genetic algorithm (GA) into the solver to optimize the selection. We\naim to achieve faster convergence for the nonlinear PDEs on a broader interval\nthan deep-BSDE. Our proposed method is applied to two nonlinear parabolic PDEs,\ni.e., the Black-Scholes (BS) equation with default risk and the\nHamilton-Jacobi-Bellman (HJB) equation. 
We compare the results of our method\nwith those of the deep-BSDE and show that our method provides comparable\naccuracy with significantly improved computational efficiency.\n","authors":["Endah Rokhmati Merdika Putri","Muhammad Luthfi Shahab","Mohammad Iqbal","Imam Mukhlash","Amirul Hakam","Lutfi Mardianto","Hadi Susanto"],"pdf_url":"https://arxiv.org/pdf/2311.11558v1.pdf","comment":"Accepted for publication in Computers and Mathematics with\n Applications, 19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.11557v1","updated":"2023-11-20T06:21:52Z","published":"2023-11-20T06:21:52Z","title":"Replay-enhanced Continual Reinforcement Learning","summary":" Replaying past experiences has proven to be a highly effective approach for\naverting catastrophic forgetting in supervised continual learning. However,\nsome crucial factors are still largely ignored, making it vulnerable to serious\nfailure, when used as a solution to forgetting in continual reinforcement\nlearning, even in the context of perfect memory where all data of previous\ntasks are accessible in the current task. On the one hand, since most\nreinforcement learning algorithms are not invariant to the reward scale, the\npreviously well-learned tasks (with high rewards) may appear to be more salient\nto the current learning process than the current task (with small initial\nrewards). This causes the agent to concentrate on those salient tasks at the\nexpense of generality on the current task. On the other hand, offline learning\non replayed tasks while learning a new task may induce a distributional shift\nbetween the dataset and the learned policy on old tasks, resulting in\nforgetting. In this paper, we introduce RECALL, a replay-enhanced method that\ngreatly improves the plasticity of existing replay-based methods on new tasks\nwhile effectively avoiding the recurrence of catastrophic forgetting in\ncontinual reinforcement learning. RECALL leverages adaptive normalization on\napproximate targets and policy distillation on old tasks to enhance generality\nand stability, respectively. Extensive experiments on the Continual World\nbenchmark show that RECALL performs significantly better than purely perfect\nmemory replay, and achieves comparable or better overall performance against\nstate-of-the-art continual learning methods.\n","authors":["Tiantian Zhang","Kevin Zehua Shen","Zichuan Lin","Bo Yuan","Xueqian Wang","Xiu Li","Deheng Ye"],"pdf_url":"https://arxiv.org/pdf/2311.11557v1.pdf","comment":"Accepted by Transactions on Machine Learning Research 2023"},{"id":"http://arxiv.org/abs/2310.03358v2","updated":"2023-11-20T06:08:28Z","published":"2023-10-05T07:29:29Z","title":"Enhancing Robust Representation in Adversarial Training: Alignment and\n Exclusion Criteria","summary":" Deep neural networks are vulnerable to adversarial noise. Adversarial\nTraining (AT) has been demonstrated to be the most effective defense strategy\nto protect neural networks from being fooled. However, we find AT omits to\nlearning robust features, resulting in poor performance of adversarial\nrobustness. To address this issue, we highlight two criteria of robust\nrepresentation: (1) Exclusion: \\emph{the feature of examples keeps away from\nthat of other classes}; (2) Alignment: \\emph{the feature of natural and\ncorresponding adversarial examples is close to each other}. These motivate us\nto propose a generic framework of AT to gain robust representation, by the\nasymmetric negative contrast and reverse attention. 
Specifically, we design an\nasymmetric negative contrast based on predicted probabilities, to push away\nexamples of different classes in the feature space. Moreover, we propose to\nweight feature by parameters of the linear classifier as the reverse attention,\nto obtain class-aware feature and pull close the feature of the same class.\nEmpirical evaluations on three benchmark datasets show our methods greatly\nadvance the robustness of AT and achieve state-of-the-art performance.\n","authors":["Nuoyan Zhou","Nannan Wang","Decheng Liu","Dawei Zhou","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2310.03358v2.pdf","comment":"10 pages, 9 figures, Submitted to TIFS"},{"id":"http://arxiv.org/abs/2311.11552v1","updated":"2023-11-20T06:06:22Z","published":"2023-11-20T06:06:22Z","title":"Exploring Prompting Large Language Models as Explainable Metrics","summary":" This paper describes the IUST NLP Lab submission to the Prompting Large\nLanguage Models as Explainable Metrics Shared Task at the Eval4NLP 2023\nWorkshop on Evaluation & Comparison of NLP Systems. We have proposed a\nzero-shot prompt-based strategy for explainable evaluation of the summarization\ntask using Large Language Models (LLMs). The conducted experiments demonstrate\nthe promising potential of LLMs as evaluation metrics in Natural Language\nProcessing (NLP), particularly in the field of summarization. Both few-shot and\nzero-shot approaches are employed in these experiments. The performance of our\nbest provided prompts achieved a Kendall correlation of 0.477 with human\nevaluations in the text summarization task on the test data. Code and results\nare publicly available on GitHub.\n","authors":["Ghazaleh Mahmoudi"],"pdf_url":"https://arxiv.org/pdf/2311.11552v1.pdf","comment":"9 pages, Eval4NLP 2023"},{"id":"http://arxiv.org/abs/2311.11544v1","updated":"2023-11-20T05:35:40Z","published":"2023-11-20T05:35:40Z","title":"Understanding Variation in Subpopulation Susceptibility to Poisoning\n Attacks","summary":" Machine learning is susceptible to poisoning attacks, in which an attacker\ncontrols a small fraction of the training data and chooses that data with the\ngoal of inducing some behavior unintended by the model developer in the trained\nmodel. We consider a realistic setting in which the adversary with the ability\nto insert a limited number of data points attempts to control the model's\nbehavior on a specific subpopulation. Inspired by previous observations on\ndisparate effectiveness of random label-flipping attacks on different\nsubpopulations, we investigate the properties that can impact the effectiveness\nof state-of-the-art poisoning attacks against different subpopulations. For a\nfamily of 2-dimensional synthetic datasets, we empirically find that dataset\nseparability plays a dominant role in subpopulation vulnerability for less\nseparable datasets. However, well-separated datasets exhibit more dependence on\nindividual subpopulation properties. We further discover that a crucial\nsubpopulation property is captured by the difference in loss on the clean\ndataset between the clean model and a target model that misclassifies the\nsubpopulation, and a subpopulation is much easier to attack if the loss\ndifference is small. This property also generalizes to high-dimensional\nbenchmark datasets. For the Adult benchmark dataset, we show that we can find\nsemantically-meaningful subpopulation properties that are related to the\nsusceptibilities of a selected group of subpopulations. 
The results in this\npaper are accompanied by a fully interactive web-based visualization of\nsubpopulation poisoning attacks found at\nhttps://uvasrg.github.io/visualizing-poisoning\n","authors":["Evan Rose","Fnu Suya","David Evans"],"pdf_url":"https://arxiv.org/pdf/2311.11544v1.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2211.10890v4","updated":"2023-11-20T05:25:23Z","published":"2022-11-20T07:18:56Z","title":"Single-Pass Contrastive Learning Can Work for Both Homophilic and\n Heterophilic Graph","summary":" Existing graph contrastive learning (GCL) techniques typically require two\nforward passes for a single instance to construct the contrastive loss, which\nis effective for capturing the low-frequency signals of node features. Such a\ndual-pass design has shown empirical success on homophilic graphs, but its\neffectiveness on heterophilic graphs, where directly connected nodes typically\nhave different labels, is unknown. In addition, existing GCL approaches fail to\nprovide strong performance guarantees. Coupled with the unpredictability of GCL\napproaches on heterophilic graphs, their applicability in real-world contexts\nis limited. Then, a natural question arises: Can we design a GCL method that\nworks for both homophilic and heterophilic graphs with a performance guarantee?\nTo answer this question, we theoretically study the concentration property of\nfeatures obtained by neighborhood aggregation on homophilic and heterophilic\ngraphs, introduce the single-pass augmentation-free graph contrastive learning\nloss based on the property, and provide performance guarantees for the\nminimizer of the loss on downstream tasks. As a direct consequence of our\nanalysis, we implement the Single-Pass Graph Contrastive Learning method\n(SP-GCL). Empirically, on 14 benchmark datasets with varying degrees of\nhomophily, the features learned by the SP-GCL can match or outperform existing\nstrong baselines with significantly less computational overhead, which\ndemonstrates the usefulness of our findings in real-world cases.\n","authors":["Haonan Wang","Jieyu Zhang","Qi Zhu","Wei Huang","Kenji Kawaguchi","Xiaokui Xiao"],"pdf_url":"https://arxiv.org/pdf/2211.10890v4.pdf","comment":"This article has been accepted for publication by the Transactions on\n Machine Learning Research. OpenReview at:\n https://openreview.net/forum?id=244KePn09i"},{"id":"http://arxiv.org/abs/2310.15516v2","updated":"2023-11-20T05:06:11Z","published":"2023-10-24T04:50:32Z","title":"Graph Attention-based Deep Reinforcement Learning for solving the\n Chinese Postman Problem with Load-dependent costs","summary":" Recently, Deep reinforcement learning (DRL) models have shown promising\nresults in solving routing problems. However, most DRL solvers are commonly\nproposed to solve node routing problems, such as the Traveling Salesman Problem\n(TSP). Meanwhile, there has been limited research on applying neural methods to\narc routing problems, such as the Chinese Postman Problem (CPP), since they\noften feature irregular and complex solution spaces compared to TSP. To fill\nthese gaps, this paper proposes a novel DRL framework to address the CPP with\nload-dependent costs (CPP-LC) (Corberan et al., 2018), which is a complex arc\nrouting problem with load constraints. The novelty of our method is two-fold.\nFirst, we formulate the CPP-LC as a Markov Decision Process (MDP) sequential\nmodel. 
Subsequently, we introduce an autoregressive model based on DRL, namely\nArc-DRL, consisting of an encoder and decoder to address the CPP-LC challenge\neffectively. Such a framework allows the DRL model to work efficiently and\nscalably to arc routing problems. Furthermore, we propose a new bio-inspired\nmeta-heuristic solution based on Evolutionary Algorithm (EA) for CPP-LC.\nExtensive experiments show that Arc-DRL outperforms existing meta-heuristic\nmethods such as Iterative Local Search (ILS) and Variable Neighborhood Search\n(VNS) proposed by (Corberan et al., 2018) on large benchmark datasets for\nCPP-LC regarding both solution quality and running time; while the EA gives the\nbest solution quality with much more running time. We release our C++\nimplementations for metaheuristics such as EA, ILS and VNS along with the code\nfor data generation and our generated data at\nhttps://github.com/HySonLab/Chinese_Postman_Problem\n","authors":["Truong Son Hy","Cong Dao Tran"],"pdf_url":"https://arxiv.org/pdf/2310.15516v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11537v1","updated":"2023-11-20T04:54:51Z","published":"2023-11-20T04:54:51Z","title":"ADAPTER-RL: Adaptation of Any Agent using Reinforcement Learning","summary":" Deep Reinforcement Learning (DRL) agents frequently face challenges in\nadapting to tasks outside their training distribution, including issues with\nover-fitting, catastrophic forgetting and sample inefficiency. Although the\napplication of adapters has proven effective in supervised learning contexts\nsuch as natural language processing and computer vision, their potential within\nthe DRL domain remains largely unexplored. This paper delves into the\nintegration of adapters in reinforcement learning, presenting an innovative\nadaptation strategy that demonstrates enhanced training efficiency and\nimprovement of the base-agent, experimentally in the nanoRTS environment, a\nreal-time strategy (RTS) game simulation. Our proposed universal approach is\nnot only compatible with pre-trained neural networks but also with rule-based\nagents, offering a means to integrate human expertise.\n","authors":["Yizhao Jin","Greg Slabaugh","Simon Lucas"],"pdf_url":"https://arxiv.org/pdf/2311.11537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13335v3","updated":"2023-11-20T04:52:36Z","published":"2023-02-26T15:40:09Z","title":"Diffusion Model-Augmented Behavioral Cloning","summary":" Imitation learning addresses the challenge of learning by observing an\nexpert's demonstrations without access to reward signals from environments.\nMost existing imitation learning methods that do not require interacting with\nenvironments either model the expert distribution as the conditional\nprobability p(a|s) (e.g., behavioral cloning, BC) or the joint probability p(s,\na). Despite its simplicity, modeling the conditional probability with BC\nusually struggles with generalization. While modeling the joint probability can\nlead to improved generalization performance, the inference procedure is often\ntime-consuming and the model can suffer from manifold overfitting. This work\nproposes an imitation learning framework that benefits from modeling both the\nconditional and joint probability of the expert distribution. Our proposed\ndiffusion model-augmented behavioral cloning (DBC) employs a diffusion model\ntrained to model expert behaviors and learns a policy to optimize both the BC\nloss (conditional) and our proposed diffusion model loss (joint). 
DBC\noutperforms baselines in various continuous control tasks in navigation, robot\narm manipulation, dexterous manipulation, and locomotion. We design additional\nexperiments to verify the limitations of modeling either the conditional\nprobability or the joint probability of the expert distribution as well as\ncompare different generative models. Ablation studies justify the effectiveness\nof our design choices.\n","authors":["Hsiang-Chun Wang","Shang-Fu Chen","Ming-Hao Hsu","Chun-Mao Lai","Shao-Hua Sun"],"pdf_url":"https://arxiv.org/pdf/2302.13335v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17713v2","updated":"2023-11-20T04:36:29Z","published":"2023-03-30T21:16:44Z","title":"Mitigating Source Bias for Fairer Weak Supervision","summary":" Weak supervision enables efficient development of training sets by reducing\nthe need for ground truth labels. However, the techniques that make weak\nsupervision attractive -- such as integrating any source of signal to estimate\nunknown labels -- also entail the danger that the produced pseudolabels are\nhighly biased. Surprisingly, given everyday use and the potential for increased\nbias, weak supervision has not been studied from the point of view of fairness.\nWe begin such a study, starting with the observation that even when a fair\nmodel can be built from a dataset with access to ground-truth labels, the\ncorresponding dataset labeled via weak supervision can be arbitrarily unfair.\nTo address this, we propose and empirically validate a model for source\nunfairness in weak supervision, then introduce a simple counterfactual\nfairness-based technique that can mitigate these biases. Theoretically, we show\nthat it is possible for our approach to simultaneously improve both accuracy\nand fairness -- in contrast to standard fairness approaches that suffer from\ntradeoffs. Empirically, we show that our technique improves accuracy on weak\nsupervision baselines by as much as 32\\% while reducing demographic parity gap\nby 82.5\\%. A simple extension of our method aimed at maximizing performance\nproduces state-of-the-art performance in five out of ten datasets in the WRENCH\nbenchmark.\n","authors":["Changho Shin","Sonia Cromp","Dyah Adila","Frederic Sala"],"pdf_url":"https://arxiv.org/pdf/2303.17713v2.pdf","comment":"42 pages"},{"id":"http://arxiv.org/abs/2311.11532v1","updated":"2023-11-20T04:34:19Z","published":"2023-11-20T04:34:19Z","title":"Optimal Hyperparameter $ε$ for Adaptive Stochastic Optimizers\n through Gradient Histograms","summary":" Optimizers are essential components for successfully training deep neural\nnetwork models. In order to achieve the best performance from such models,\ndesigners need to carefully choose the optimizer hyperparameters. However, this\ncan be a computationally expensive and time-consuming process. Although it is\nknown that all optimizer hyperparameters must be tuned for maximum performance,\nthere is still a lack of clarity regarding the individual influence of minor\npriority hyperparameters, including the safeguard factor $\\epsilon$ and\nmomentum factor $\\beta$, in leading adaptive optimizers (specifically, those\nbased on the Adam optimizers). In this manuscript, we introduce a new framework\nbased on gradient histograms to analyze and justify important attributes of\nadaptive optimizers, such as their optimal performance and the relationships\nand dependencies among hyperparameters. 
Furthermore, we propose a novel\ngradient histogram-based algorithm that automatically estimates a reduced and\naccurate search space for the safeguard hyperparameter $\\epsilon$, where the\noptimal value can be easily found.\n","authors":["Gustavo Silva","Paul Rodriguez"],"pdf_url":"https://arxiv.org/pdf/2311.11532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.15439v2","updated":"2023-11-20T04:31:13Z","published":"2022-05-30T21:34:40Z","title":"StyleTTS: A Style-Based Generative Model for Natural and Diverse\n Text-to-Speech Synthesis","summary":" Text-to-Speech (TTS) has recently seen great progress in synthesizing\nhigh-quality speech owing to the rapid development of parallel TTS systems, but\nproducing speech with naturalistic prosodic variations, speaking styles and\nemotional tones remains challenging. Moreover, since duration and speech are\ngenerated separately, parallel TTS models still have problems finding the best\nmonotonic alignments that are crucial for naturalistic speech synthesis. Here,\nwe propose StyleTTS, a style-based generative model for parallel TTS that can\nsynthesize diverse speech with natural prosody from a reference speech\nutterance. With novel Transferable Monotonic Aligner (TMA) and\nduration-invariant data augmentation schemes, our method significantly\noutperforms state-of-the-art models on both single and multi-speaker datasets\nin subjective tests of speech naturalness and speaker similarity. Through\nself-supervised learning of the speaking styles, our model can synthesize\nspeech with the same prosodic and emotional tone as any given reference speech\nwithout the need for explicitly labeling these categories.\n","authors":["Yinghao Aaron Li","Cong Han","Nima Mesgarani"],"pdf_url":"https://arxiv.org/pdf/2205.15439v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07691v2","updated":"2023-11-20T04:23:08Z","published":"2023-06-13T11:04:43Z","title":"StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion\n and Adversarial Training with Large Speech Language Models","summary":" In this paper, we present StyleTTS 2, a text-to-speech (TTS) model that\nleverages style diffusion and adversarial training with large speech language\nmodels (SLMs) to achieve human-level TTS synthesis. StyleTTS 2 differs from its\npredecessor by modeling styles as a latent random variable through diffusion\nmodels to generate the most suitable style for the text without requiring\nreference speech, achieving efficient latent diffusion while benefiting from\nthe diverse speech synthesis offered by diffusion models. Furthermore, we\nemploy large pre-trained SLMs, such as WavLM, as discriminators with our novel\ndifferentiable duration modeling for end-to-end training, resulting in improved\nspeech naturalness. StyleTTS 2 surpasses human recordings on the single-speaker\nLJSpeech dataset and matches it on the multispeaker VCTK dataset as judged by\nnative English speakers. Moreover, when trained on the LibriTTS dataset, our\nmodel outperforms previous publicly available models for zero-shot speaker\nadaptation. This work achieves the first human-level TTS on both single and\nmultispeaker datasets, showcasing the potential of style diffusion and\nadversarial training with large SLMs. The audio demos and source code are\navailable at https://styletts2.github.io/.\n","authors":["Yinghao Aaron Li","Cong Han","Vinay S. 
Raghavan","Gavin Mischler","Nima Mesgarani"],"pdf_url":"https://arxiv.org/pdf/2306.07691v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.11520v1","updated":"2023-11-20T03:51:39Z","published":"2023-11-20T03:51:39Z","title":"Liver Tumor Prediction with Advanced Attention Mechanisms Integrated\n into a Depth-Based Variant Search Algorithm","summary":" In recent days, Deep Learning (DL) techniques have become an emerging\ntransformation in the field of machine learning, artificial intelligence,\ncomputer vision, and so on. Subsequently, researchers and industries have been\nhighly endorsed in the medical field, predicting and controlling diverse\ndiseases at specific intervals. Liver tumor prediction is a vital chore in\nanalyzing and treating liver diseases. This paper proposes a novel approach for\npredicting liver tumors using Convolutional Neural Networks (CNN) and a\ndepth-based variant search algorithm with advanced attention mechanisms\n(CNN-DS-AM). The proposed work aims to improve accuracy and robustness in\ndiagnosing and treating liver diseases. The anticipated model is assessed on a\nComputed Tomography (CT) scan dataset containing both benign and malignant\nliver tumors. The proposed approach achieved high accuracy in predicting liver\ntumors, outperforming other state-of-the-art methods. Additionally, advanced\nattention mechanisms were incorporated into the CNN model to enable the\nidentification and highlighting of regions of the CT scans most relevant to\npredicting liver tumors. The results suggest that incorporating attention\nmechanisms and a depth-based variant search algorithm into the CNN model is a\npromising approach for improving the accuracy and robustness of liver tumor\nprediction. It can assist radiologists in their diagnosis and treatment\nplanning. The proposed system achieved a high accuracy of 95.5% in predicting\nliver tumors, outperforming other state-of-the-art methods.\n","authors":["P. Kalaiselvi","S. Anusuya"],"pdf_url":"https://arxiv.org/pdf/2311.11520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11518v1","updated":"2023-11-20T03:44:32Z","published":"2023-11-20T03:44:32Z","title":"Multi-teacher Distillation for Multilingual Spelling Correction","summary":" Accurate spelling correction is a critical step in modern search interfaces,\nespecially in an era of mobile devices and speech-to-text interfaces. For\nservices that are deployed around the world, this poses a significant challenge\nfor multilingual NLP: spelling errors need to be caught and corrected in all\nlanguages, and even in queries that use multiple languages. In this paper, we\ntackle this challenge using multi-teacher distillation. On our approach, a\nmonolingual teacher model is trained for each language/locale, and these\nindividual models are distilled into a single multilingual student model\nintended to serve all languages/locales. 
In experiments using open-source data\nas well as user data from a worldwide search service, we show that this leads\nto highly effective spelling correction models that can meet the tight latency\nrequirements of deployed services.\n","authors":["Jingfen Zhang","Xuan Guo","Sravan Bodapati","Christopher Potts"],"pdf_url":"https://arxiv.org/pdf/2311.11518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20258v3","updated":"2023-11-20T03:43:31Z","published":"2023-10-31T08:24:41Z","title":"Advancing Bayesian Optimization via Learning Correlated Latent Space","summary":" Bayesian optimization is a powerful method for optimizing black-box functions\nwith limited function evaluations. Recent works have shown that optimization in\na latent space through deep generative models such as variational autoencoders\nleads to effective and efficient Bayesian optimization for structured or\ndiscrete data. However, as the optimization does not take place in the input\nspace, it leads to an inherent gap that results in potentially suboptimal\nsolutions. To alleviate the discrepancy, we propose Correlated latent space\nBayesian Optimization (CoBO), which focuses on learning correlated latent\nspaces characterized by a strong correlation between the distances in the\nlatent space and the distances within the objective function. Specifically, our\nmethod introduces Lipschitz regularization, loss weighting, and trust region\nrecoordination to minimize the inherent gap around the promising areas. We\ndemonstrate the effectiveness of our approach on several optimization tasks in\ndiscrete data, such as molecule design and arithmetic expression fitting, and\nachieve high performance within a small budget.\n","authors":["Seunghun Lee","Jaewon Chu","Sihyeon Kim","Juyeon Ko","Hyunwoo J. Kim"],"pdf_url":"https://arxiv.org/pdf/2310.20258v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06657v3","updated":"2023-11-20T03:39:38Z","published":"2023-05-11T08:52:09Z","title":"On Practical Robust Reinforcement Learning: Practical Uncertainty Set\n and Double-Agent Algorithm","summary":" Robust reinforcement learning (RRL) aims at seeking a robust policy to\noptimize the worst case performance over an uncertainty set of Markov decision\nprocesses (MDPs). This set contains some perturbed MDPs from a nominal MDP\n(N-MDP) that generate samples for training, which reflects some potential\nmismatches between training (i.e., N-MDP) and true environments. In this paper\nwe present an elaborated uncertainty set by excluding some implausible MDPs\nfrom the existing sets. Under this uncertainty set, we develop a sample-based\nRRL algorithm (named ARQ-Learning) for tabular setting and characterize its\nfinite-time error bound. Also, it is proved that ARQ-Learning converges as fast\nas the standard Q-Learning and robust Q-Learning while ensuring better\nrobustness. We introduce an additional pessimistic agent which can tackle the\nmajor bottleneck for the extension of ARQ-Learning into the cases with larger\nor continuous state spaces. Incorporating this idea into RL algorithms, we\npropose double-agent algorithms for model-free RRL. 
Via experiments, we\ndemonstrate the effectiveness of the proposed algorithms.\n","authors":["Ukjo Hwang","Songnam Hong"],"pdf_url":"https://arxiv.org/pdf/2305.06657v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11509v1","updated":"2023-11-20T03:17:21Z","published":"2023-11-20T03:17:21Z","title":"Token-Level Adversarial Prompt Detection Based on Perplexity Measures\n and Contextual Information","summary":" In recent years, Large Language Models (LLM) have emerged as pivotal tools in\nvarious applications. However, these models are susceptible to adversarial\nprompt attacks, where attackers can carefully curate input strings that lead to\nundesirable outputs. The inherent vulnerability of LLMs stems from their\ninput-output mechanisms, especially when presented with intensely\nout-of-distribution (OOD) inputs. This paper proposes a token-level detection\nmethod to identify adversarial prompts, leveraging the LLM's capability to\npredict the next token's probability. We measure the degree of the model's\nperplexity and incorporate neighboring token information to encourage the\ndetection of contiguous adversarial prompt sequences. As a result, we propose\ntwo methods: one that identifies each token as either being part of an\nadversarial prompt or not, and another that estimates the probability of each\ntoken being part of an adversarial prompt.\n","authors":["Zhengmian Hu","Gang Wu","Saayan Mitra","Ruiyi Zhang","Tong Sun","Heng Huang","Vishy Swaminathan"],"pdf_url":"https://arxiv.org/pdf/2311.11509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19909v2","updated":"2023-11-20T03:05:50Z","published":"2023-10-30T18:23:58Z","title":"Battle of the Backbones: A Large-Scale Comparison of Pretrained Models\n across Computer Vision Tasks","summary":" Neural network based computer vision systems are typically built on a\nbackbone, a pretrained or randomly initialized feature extractor. Several years\nago, the default option was an ImageNet-trained convolutional neural network.\nHowever, the recent past has seen the emergence of countless backbones\npretrained using various algorithms and datasets. While this abundance of\nchoice has led to performance increases for a range of systems, it is difficult\nfor practitioners to make informed decisions about which backbone to choose.\nBattle of the Backbones (BoB) makes this choice easier by benchmarking a\ndiverse suite of pretrained models, including vision-language models, those\ntrained via self-supervised learning, and the Stable Diffusion backbone, across\na diverse set of computer vision tasks ranging from classification to object\ndetection to OOD generalization and more. Furthermore, BoB sheds light on\npromising directions for the research community to advance computer vision by\nilluminating strengths and weakness of existing approaches through a\ncomprehensive analysis conducted on more than 1500 training runs. While vision\ntransformers (ViTs) and self-supervised learning (SSL) are increasingly\npopular, we find that convolutional neural networks pretrained in a supervised\nfashion on large training sets still perform best on most tasks among the\nmodels we consider. Moreover, in apples-to-apples comparisons on the same\narchitectures and similarly sized pretraining datasets, we find that SSL\nbackbones are highly competitive, indicating that future works should perform\nSSL pretraining with advanced architectures and larger pretraining datasets. 
We\nrelease the raw results of our experiments along with code that allows\nresearchers to put their own backbones through the gauntlet here:\nhttps://github.com/hsouri/Battle-of-the-Backbones\n","authors":["Micah Goldblum","Hossein Souri","Renkun Ni","Manli Shu","Viraj Prabhu","Gowthami Somepalli","Prithvijit Chattopadhyay","Mark Ibrahim","Adrien Bardes","Judy Hoffman","Rama Chellappa","Andrew Gordon Wilson","Tom Goldstein"],"pdf_url":"https://arxiv.org/pdf/2310.19909v2.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.11501v1","updated":"2023-11-20T02:59:18Z","published":"2023-11-20T02:59:18Z","title":"MultiLoRA: Democratizing LoRA for Better Multi-Task Learning","summary":" LoRA achieves remarkable resource efficiency and comparable performance when\nadapting LLMs for specific tasks. Since ChatGPT demonstrated superior\nperformance on various tasks, there has been a growing desire to adapt one\nmodel for all tasks. However, the explicit low-rank of LoRA limits the\nadaptation performance in complex multi-task scenarios. LoRA is dominated by a\nsmall number of top singular vectors while fine-tuning decomposes into a set of\nless important unitary transforms. In this paper, we propose MultiLoRA for\nbetter multi-task adaptation by reducing the dominance of top singular vectors\nobserved in LoRA. MultiLoRA scales LoRA modules horizontally and changes\nparameter initialization of adaptation matrices to reduce parameter dependency,\nthus yielding more balanced unitary subspaces. We unprecedentedly construct\nspecialized training data by mixing datasets of instruction following, natural\nlanguage understanding, and world knowledge to cover semantically and\nsyntactically different samples. With only 2.5% of additional parameters,\nMultiLoRA outperforms single LoRA counterparts and fine-tuning on multiple\nbenchmarks and model scales. Further investigation into weight update matrices\nof MultiLoRA exhibits reduced dependency on top singular vectors and more\ndemocratic unitary transform contributions.\n","authors":["Yiming Wang","Yu Lin","Xiaodong Zeng","Guannan Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.11501v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11491v1","updated":"2023-11-20T02:31:08Z","published":"2023-11-20T02:31:08Z","title":"Interpretability in Machine Learning: on the Interplay with\n Explainability, Predictive Performances and Models","summary":" Interpretability has recently gained attention in the field of machine\nlearning, for it is crucial when it comes to high-stakes decisions or\ntroubleshooting. This abstract concept is hard to grasp and has been\nassociated, over time, with many labels and preconceived ideas. In this\nposition paper, in order to clarify some misunderstandings regarding\ninterpretability, we discuss its relationship with significant concepts in\nmachine learning: explainability, predictive performances, and machine learning\nmodels. 
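As a rough sketch of the "horizontally scaled LoRA" idea in the MultiLoRA abstract above -- several parallel low-rank adapters added on top of a frozen base linear layer -- the following minimal PyTorch module may be useful. The class name, rank, branch count, scaling factor, and initialization are illustrative assumptions, not the paper's actual design.

import torch
import torch.nn as nn

class ParallelLoRALinear(nn.Module):
    """Frozen base linear layer plus n_branches parallel low-rank adapters
    whose outputs are summed. A loose sketch, not the paper's exact module."""
    def __init__(self, in_features, out_features, rank=8, n_branches=3, alpha=16.0):
        super().__init__()
        self.base = nn.Linear(in_features, out_features)
        self.base.weight.requires_grad_(False)   # pretrained weight stays fixed
        self.base.bias.requires_grad_(False)
        self.scaling = alpha / rank
        # Each branch: a down-projection A and an up-projection B (B starts at zero,
        # so the adapters initially leave the base layer's output unchanged).
        self.A = nn.ParameterList([nn.Parameter(torch.randn(rank, in_features) * 0.01)
                                   for _ in range(n_branches)])
        self.B = nn.ParameterList([nn.Parameter(torch.zeros(out_features, rank))
                                   for _ in range(n_branches)])

    def forward(self, x):
        out = self.base(x)
        for A, B in zip(self.A, self.B):
            out = out + self.scaling * (x @ A.t() @ B.t())
        return out

layer = ParallelLoRALinear(128, 64)
y = layer(torch.randn(4, 128))   # shape (4, 64)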
For instance, we challenge the idea that interpretability and\nexplainability are substitutes to one another, or that a fixed degree of\ninterpretability can be associated with a given machine learning model.\n","authors":["Benjamin Leblanc","Pascal Germain"],"pdf_url":"https://arxiv.org/pdf/2311.11491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09952v2","updated":"2023-11-20T02:14:51Z","published":"2023-08-19T09:12:47Z","title":"Finding emergence in data: causal emergence inspired dynamics learning","summary":" Modelling complex dynamical systems in a data-driven manner is challenging\ndue to the presence of emergent behaviors and properties that cannot be\ndirectly captured by micro-level observational data. Therefore, it is crucial\nto develop a model that can effectively capture emergent dynamics at the\nmacro-level and quantify emergence based on the available data. Drawing\ninspiration from the theory of causal emergence, this paper introduces a\nmachine learning framework aimed at learning macro-dynamics within an emergent\nlatent space. The framework achieves this by maximizing the effective\ninformation (EI) to obtain a macro-dynamics model with stronger causal effects.\nExperimental results on both simulated and real data demonstrate the\neffectiveness of the proposed framework. Not only does it successfully capture\nemergent patterns, but it also learns the coarse-graining strategy and\nquantifies the degree of causal emergence in the data. Furthermore, experiments\nconducted on environments different from the training dataset highlight the\nsuperior generalization ability of our model.\n","authors":["Mingzhe Yang","Zhipeng Wang","Kaiwei Liu","Yingqi Rong","Bing Yuan","Jiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.09952v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09574v3","updated":"2023-11-20T02:01:33Z","published":"2023-11-16T05:17:14Z","title":"LymphoML: An interpretable artificial intelligence-based method\n identifies morphologic features that correlate with lymphoma subtype","summary":" The accurate classification of lymphoma subtypes using hematoxylin and eosin\n(H&E)-stained tissue is complicated by the wide range of morphological features\nthese cancers can exhibit. We present LymphoML - an interpretable machine\nlearning method that identifies morphologic features that correlate with\nlymphoma subtypes. Our method applies steps to process H&E-stained tissue\nmicroarray cores, segment nuclei and cells, compute features encompassing\nmorphology, texture, and architecture, and train gradient-boosted models to\nmake diagnostic predictions. LymphoML's interpretable models, developed on a\nlimited volume of H&E-stained tissue, achieve non-inferior diagnostic accuracy\nto pathologists using whole-slide images and outperform black box deep-learning\non a dataset of 670 cases from Guatemala spanning 8 lymphoma subtypes. Using\nSHapley Additive exPlanation (SHAP) analysis, we assess the impact of each\nfeature on model prediction and find that nuclear shape features are most\ndiscriminative for DLBCL (F1-score: 78.7%) and classical Hodgkin lymphoma\n(F1-score: 74.5%). 
Finally, we provide the first demonstration that a model\ncombining features from H&E-stained tissue with features from a standardized\npanel of 6 immunostains results in a similar diagnostic accuracy (85.3%) to a\n46-stain panel (86.1%).\n","authors":["Vivek Shankar","Xiaoli Yang","Vrishab Krishna","Brent Tan","Oscar Silva","Rebecca Rojansky","Andrew Ng","Fabiola Valvert","Edward Briercheck","David Weinstock","Yasodha Natkunam","Sebastian Fernandez-Pol","Pranav Rajpurkar"],"pdf_url":"https://arxiv.org/pdf/2311.09574v3.pdf","comment":"To be published in Proceedings of the 3rd Machine Learning for Health\n symposium, Proceedings of Machine Learning Research (PMLR)"},{"id":"http://arxiv.org/abs/2311.11485v1","updated":"2023-11-20T02:00:33Z","published":"2023-11-20T02:00:33Z","title":"An NMF-Based Building Block for Interpretable Neural Networks With\n Continual Learning","summary":" Existing learning methods often struggle to balance interpretability and\npredictive performance. While models like nearest neighbors and non-negative\nmatrix factorization (NMF) offer high interpretability, their predictive\nperformance on supervised learning tasks is often limited. In contrast, neural\nnetworks based on the multi-layer perceptron (MLP) support the modular\nconstruction of expressive architectures and tend to have better recognition\naccuracy but are often regarded as black boxes in terms of interpretability.\nOur approach aims to strike a better balance between these two aspects through\nthe use of a building block based on NMF that incorporates supervised neural\nnetwork training methods to achieve high predictive performance while retaining\nthe desirable interpretability properties of NMF. We evaluate our Predictive\nFactorized Coupling (PFC) block on small datasets and show that it achieves\ncompetitive predictive performance with MLPs while also offering improved\ninterpretability. We demonstrate the benefits of this approach in various\nscenarios, such as continual learning, training on non-i.i.d. data, and\nknowledge removal after training. Additionally, we show examples of using the\nPFC block to build more expressive architectures, including a fully-connected\nresidual network as well as a factorized recurrent neural network (RNN) that\nperforms competitively with vanilla RNNs while providing improved\ninterpretability. The PFC block uses an iterative inference algorithm that\nconverges to a fixed point, making it possible to trade off accuracy vs\ncomputation after training but also currently preventing its use as a general\nMLP replacement in some scenarios such as training on very large datasets. We\nprovide source code at https://github.com/bkvogel/pfc\n","authors":["Brian K. Vogel"],"pdf_url":"https://arxiv.org/pdf/2311.11485v1.pdf","comment":"42 pages, 13 figures"},{"id":"http://arxiv.org/abs/2311.10251v2","updated":"2023-11-20T01:59:11Z","published":"2023-11-17T00:44:56Z","title":"UniMOS: A Universal Framework For Multi-Organ Segmentation Over\n Label-Constrained Datasets","summary":" Machine learning models for medical images can help physicians diagnose and\nmanage diseases. However, due to the fact that medical image annotation\nrequires a great deal of manpower and expertise, as well as the fact that\nclinical departments perform image annotation based on task orientation, there\nis the problem of having fewer medical image annotation data with more\nunlabeled data and having many datasets that annotate only a single organ. 
In\nthis paper, we present UniMOS, the first universal framework for achieving the\nutilization of fully and partially labeled images as well as unlabeled images.\nSpecifically, we construct a Multi-Organ Segmentation (MOS) module over\nfully/partially labeled data as the basenet and designed a new target adaptive\nloss. Furthermore, we incorporate a semi-supervised training module that\ncombines consistent regularization and pseudolabeling techniques on unlabeled\ndata, which significantly improves the segmentation of unlabeled data.\nExperiments show that the framework exhibits excellent performance in several\nmedical image segmentation tasks compared to other advanced methods, and also\nsignificantly improves data utilization and reduces annotation cost. Code and\nmodels are available at: https://github.com/lw8807001/UniMOS.\n","authors":["Can Li","Sheng Shao","Junyi Qu","Shuchao Pang","Mehmet A. Orgun"],"pdf_url":"https://arxiv.org/pdf/2311.10251v2.pdf","comment":"Accepted by BIBM2023"},{"id":"http://arxiv.org/abs/2311.11483v1","updated":"2023-11-20T01:58:27Z","published":"2023-11-20T01:58:27Z","title":"A Multi-Center Study on the Adaptability of a Shared Foundation Model\n for Electronic Health Records","summary":" Foundation models hold promise for transforming AI in healthcare by providing\nmodular components that are easily adaptable to downstream healthcare tasks,\nmaking AI development more scalable and cost-effective. Structured EHR\nfoundation models, trained on coded medical records from millions of patients,\ndemonstrated benefits including increased performance with fewer training\nlabels, and improved robustness to distribution shifts. However, questions\nremain on the feasibility of sharing these models across different hospitals\nand their performance for local task adaptation. This multi-center study\nexamined the adaptability of a recently released structured EHR foundation\nmodel ($FM_{SM}$), trained on longitudinal medical record data from 2.57M\nStanford Medicine patients. Experiments were conducted using EHR data at The\nHospital for Sick Children and MIMIC-IV. We assessed both adaptability via\ncontinued pretraining on local data, and task adaptability compared to\nbaselines of training models from scratch at each site, including a local\nfoundation model. We evaluated the performance of these models on 8 clinical\nprediction tasks. In both datasets, adapting the off-the-shelf $FM_{SM}$\nmatched the performance of GBM models locally trained on all data while\nproviding a 13% improvement in settings with few task-specific training labels.\nWith continued pretraining on local data, label efficiency substantially\nimproved, such that $FM_{SM}$ required fewer than 1% of training examples to\nmatch the fully trained GBM's performance. 
Continued pretraining was also 60 to\n90% more sample-efficient than training local foundation models from scratch.\nOur findings show that adapting shared EHR foundation models across hospitals\nprovides improved prediction performance at less cost, underscoring the utility\nof base foundation models as modular components to streamline the development\nof healthcare AI.\n","authors":["Lin Lawrence Guo","Jason Fries","Ethan Steinberg","Scott Lanyon Fleming","Keith Morse","Catherine Aftandilian","Jose Posada","Nigam Shah","Lillian Sung"],"pdf_url":"https://arxiv.org/pdf/2311.11483v1.pdf","comment":"41 pages, 3 figures, 2 tables, 16 appendices"},{"id":"http://arxiv.org/abs/2305.16300v2","updated":"2023-11-20T01:16:17Z","published":"2023-05-25T17:53:42Z","title":"Landmark Attention: Random-Access Infinite Context Length for\n Transformers","summary":" While Transformers have shown remarkable success in natural language\nprocessing, their attention mechanism's large memory requirements have limited\ntheir ability to handle longer contexts. Prior approaches, such as recurrent\nmemory or retrieval-based augmentation, have either compromised the\nrandom-access flexibility of attention (i.e., the capability to select any\ntoken in the entire context) or relied on separate mechanisms for relevant\ncontext retrieval, which may not be compatible with the model's attention. In\nthis paper, we present a novel approach that allows access to the complete\ncontext while retaining random-access flexibility, closely resembling running\nattention on the entire context. Our method uses a landmark token to represent\neach block of the input and trains the attention to use it for selecting\nrelevant blocks, enabling retrieval of blocks directly through the attention\nmechanism instead of by relying on a separate mechanism. Our approach\nseamlessly integrates with specialized data structures and the system's memory\nhierarchy, enabling processing of arbitrarily long context lengths. We\ndemonstrate that our method can obtain comparable performance with\nTransformer-XL while significantly reducing the number of retrieved tokens in\neach step. Finally, we show that fine-tuning LLaMA 7B with our method\nsuccessfully extends its context length capacity to over 32k tokens, allowing\nfor inference at the context lengths of GPT-4. We release the implementation of\nlandmark attention and the code to reproduce our experiments at\nhttps://github.com/epfml/landmark-attention/.\n","authors":["Amirkeivan Mohtashami","Martin Jaggi"],"pdf_url":"https://arxiv.org/pdf/2305.16300v2.pdf","comment":"Published as a conference paper at NeurIPS 2023 - 37th Conference on\n Neural Information Processing Systems"},{"id":"http://arxiv.org/abs/2311.11475v1","updated":"2023-11-20T00:59:20Z","published":"2023-11-20T00:59:20Z","title":"Gaussian Interpolation Flows","summary":" Gaussian denoising has emerged as a powerful principle for constructing\nsimulation-free continuous normalizing flows for generative modeling. Despite\ntheir empirical successes, theoretical properties of these flows and the\nregularizing effect of Gaussian denoising have remained largely unexplored. 
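To illustrate the block-retrieval idea in the Landmark Attention abstract above, here is a loose NumPy sketch: each block of keys is summarized by a single landmark vector, the query selects the top-k most relevant blocks, and attention is computed only over tokens in the selected blocks. In the paper the landmark is a trained token and block selection happens inside the attention mechanism itself; the block mean used here is purely an illustrative stand-in, and block_size and top_k are made-up parameters.

import numpy as np

def landmark_block_attention(query, keys, values, block_size=4, top_k=2):
    """Sketch of retrieval-style attention: pick blocks via landmark vectors,
    then attend only within the chosen blocks."""
    n, d = keys.shape
    blocks = [slice(i, min(i + block_size, n)) for i in range(0, n, block_size)]
    landmarks = np.stack([keys[b].mean(axis=0) for b in blocks])   # one summary per block
    block_scores = landmarks @ query                               # relevance of each block
    chosen = np.argsort(block_scores)[-top_k:]                     # keep the top_k blocks
    idx = np.concatenate([np.arange(b.start, b.stop)
                          for i, b in enumerate(blocks) if i in chosen])
    scores = keys[idx] @ query / np.sqrt(d)
    weights = np.exp(scores - scores.max())
    weights /= weights.sum()
    return weights @ values[idx]                                   # attention output

rng = np.random.default_rng(0)
q = rng.normal(size=16)
K = rng.normal(size=(12, 16))
V = rng.normal(size=(12, 16))
print(landmark_block_attention(q, K, V).shape)   # (16,)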
In\nthis work, we aim to address this gap by investigating the well-posedness of\nsimulation-free continuous normalizing flows built on Gaussian denoising.\nThrough a unified framework termed Gaussian interpolation flow, we establish\nthe Lipschitz regularity of the flow velocity field, the existence and\nuniqueness of the flow, and the Lipschitz continuity of the flow map and the\ntime-reversed flow map for several rich classes of target distributions. This\nanalysis also sheds light on the auto-encoding and cycle-consistency properties\nof Gaussian interpolation flows. Additionally, we delve into the stability of\nthese flows in source distributions and perturbations of the velocity field,\nusing the quadratic Wasserstein distance as a metric. Our findings offer\nvaluable insights into the learning techniques employed in Gaussian\ninterpolation flows for generative modeling, providing a solid theoretical\nfoundation for end-to-end error analyses of learning GIFs with empirical\nobservations.\n","authors":["Yuan Gao","Jian Huang","Yuling Jiao"],"pdf_url":"https://arxiv.org/pdf/2311.11475v1.pdf","comment":"49 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.11473v1","updated":"2023-11-20T00:57:30Z","published":"2023-11-20T00:57:30Z","title":"CSGNN: Conquering Noisy Node labels via Dynamic Class-wise Selection","summary":" Graph Neural Networks (GNNs) have emerged as a powerful tool for\nrepresentation learning on graphs, but they often suffer from overfitting and\nlabel noise issues, especially when the data is scarce or imbalanced. Different\nfrom the paradigm of previous methods that rely on single-node confidence, in\nthis paper, we introduce a novel Class-wise Selection for Graph Neural\nNetworks, dubbed CSGNN, which employs a neighbor-aggregated latent space to\nadaptively select reliable nodes across different classes. Specifically, 1) to\ntackle the class imbalance issue, we introduce a dynamic class-wise selection\nmechanism, leveraging the clustering technique to identify clean nodes based on\nthe neighbor-aggregated confidences. In this way, our approach can avoid the\npitfalls of biased sampling which is common with global threshold techniques.\n2) To alleviate the problem of noisy labels, built on the concept of the\nmemorization effect, CSGNN prioritizes learning from clean nodes before noisy\nones, thereby iteratively enhancing model performance while mitigating label\nnoise. Through extensive experiments, we demonstrate that CSGNN outperforms\nstate-of-the-art methods in terms of both effectiveness and robustness.\n","authors":["Yifan Li","Zhen Tan","Kai Shu","Zongsheng Cao","Yu Kong","Huan Liu"],"pdf_url":"https://arxiv.org/pdf/2311.11473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15308v2","updated":"2023-11-20T00:56:15Z","published":"2023-10-23T19:21:57Z","title":"SAM-CLIP: Merging Vision Foundation Models towards Semantic and Spatial\n Understanding","summary":" The landscape of publicly available vision foundation models (VFMs), such as\nCLIP and Segment Anything Model (SAM), is expanding rapidly. VFMs are endowed\nwith distinct capabilities stemming from their pre-training objectives. For\ninstance, CLIP excels in semantic understanding, while SAM specializes in\nspatial understanding for segmentation. In this work, we introduce a simple\nrecipe to efficiently merge VFMs into a unified model that absorbs their\nexpertise. Our method integrates techniques of multi-task learning, continual\nlearning, and distillation. 
Further, it demands significantly less\ncomputational cost compared to traditional multi-task training from scratch,\nand it only needs a small fraction of the pre-training datasets that were\ninitially used to train individual models. By applying our method to SAM and\nCLIP, we obtain SAM-CLIP: a unified model that combines the capabilities of SAM\nand CLIP into a single vision transformer. Compared with deploying SAM and CLIP\nindependently, our merged model, SAM-CLIP, reduces storage and compute costs\nfor inference, making it well-suited for edge device applications. We show that\nSAM-CLIP not only retains the foundational strengths of SAM and CLIP, but also\nintroduces synergistic functionalities, notably in zero-shot semantic\nsegmentation, where SAM-CLIP establishes new state-of-the-art results on 5\nbenchmarks. It outperforms previous models that are specifically designed for\nthis task by a large margin, including +6.8% and +5.9% mean IoU improvement on\nPascal-VOC and COCO-Stuff datasets, respectively.\n","authors":["Haoxiang Wang","Pavan Kumar Anasosalu Vasu","Fartash Faghri","Raviteja Vemulapalli","Mehrdad Farajtabar","Sachin Mehta","Mohammad Rastegari","Oncel Tuzel","Hadi Pouransari"],"pdf_url":"https://arxiv.org/pdf/2310.15308v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11463v1","updated":"2023-11-20T00:15:16Z","published":"2023-11-20T00:15:16Z","title":"Towards a Post-Market Monitoring Framework for Machine Learning-based\n Medical Devices: A case study","summary":" After a machine learning (ML)-based system is deployed in clinical practice,\nperformance monitoring is important to ensure the safety and effectiveness of\nthe algorithm over time. The goal of this work is to highlight the complexity\nof designing a monitoring strategy and the need for a systematic framework that\ncompares the multitude of monitoring options. One of the main decisions is\nchoosing between using real-world (observational) versus interventional data.\nAlthough the former is the most convenient source of monitoring data, it\nexhibits well-known biases, such as confounding, selection, and missingness. In\nfact, when the ML algorithm interacts with its environment, the algorithm\nitself may be a primary source of bias. On the other hand, a carefully designed\ninterventional study that randomizes individuals can explicitly eliminate such\nbiases, but the ethics, feasibility, and cost of such an approach must be\ncarefully considered. Beyond the decision of the data source, monitoring\nstrategies vary in the performance criteria they track, the interpretability of\nthe test statistics, the strength of their assumptions, and their speed at\ndetecting performance decay. As a first step towards developing a framework\nthat compares the various monitoring options, we consider a case study of an\nML-based risk prediction algorithm for postoperative nausea and vomiting\n(PONV). Bringing together tools from causal inference and statistical process\ncontrol, we walk through the basic steps of defining candidate monitoring\ncriteria, describing potential sources of bias and the causal model, and\nspecifying and comparing candidate monitoring procedures. 
We hypothesize that\nthese steps can be applied more generally, as causal inference can address\nother sources of biases as well.\n","authors":["Jean Feng","Adarsh Subbaswamy","Alexej Gossmann","Harvineet Singh","Berkman Sahiner","Mi-Ok Kim","Gene Pennello","Nicholas Petrick","Romain Pirracchio","Fan Xia"],"pdf_url":"https://arxiv.org/pdf/2311.11463v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2311.11892v1","updated":"2023-11-20T16:25:23Z","published":"2023-11-20T16:25:23Z","title":"Multimodal Characterization of Emotion within Multimedia Space","summary":" Technological advancement and its omnipresent connection have pushed humans\npast the boundaries and limitations of a computer screen, physical state, or\ngeographical location. It has provided a depth of avenues that facilitate\nhuman-computer interaction that was once inconceivable such as audio and body\nlanguage detection. Given the complex modularities of emotions, it becomes\nvital to study human-computer interaction, as it is the commencement of a\nthorough understanding of the emotional state of users and, in the context of\nsocial networks, the producers of multimodal information. This study first\nacknowledges the accuracy of classification found within multimodal emotion\ndetection systems compared to unimodal solutions. Second, it explores the\ncharacterization of multimedia content produced based on their emotions and the\ncoherence of emotion in different modalities by utilizing deep learning models\nto classify emotion across different modalities.\n","authors":["Dayo Samuel Banjo","Connice Trimmingham","Niloofar Yousefi","Nitin Agarwal"],"pdf_url":"https://arxiv.org/pdf/2311.11892v1.pdf","comment":"8 pages, Published in International Conference on Computers and\n Computation (COMPUTE 2022), November 03-04, 2022, San Francisco, United\n States"},{"id":"http://arxiv.org/abs/2311.11783v1","updated":"2023-11-20T14:09:13Z","published":"2023-11-20T14:09:13Z","title":"CityScope: Enhanced Localozation and Synchronizing AR for Dynamic Urban\n Weather Visualization","summary":" CityScope uses augmented reality (AR) to change our interaction with weather\ndata. The main goal is to develop real-time 3D weather visualizations, with\nTaiwan as the model. It displays live weather data from the Central Weather\nBureau (CWB), projected onto a physical representation of Taiwan's landscape. A\npivotal advancement in our project is the integration of AprilTag with plane\ndetection technology. This innovative combination significantly enhances the\nprecision of the virtual visualizations within the physical world. By\naccurately aligning AR elements with real-world environments, CityScope\nachieves a seamless and realistic amalgamation of weather data and the physical\nterrain of Taiwan. This breakthrough in AR technology not only enhances the\naccuracy of weather visualizations but also enriches user experience, offering\nan immersive and interactive way to understand and engage with meteorological\ninformation. 
CityScope stands as a testament to the potential of AR in\ntransforming data visualization and public engagement in meteorology.\n","authors":["Tzu Hsin Hsieh"],"pdf_url":"https://arxiv.org/pdf/2311.11783v1.pdf","comment":"9 pages, 15 figures"},{"id":"http://arxiv.org/abs/2311.12159v1","updated":"2023-11-20T20:24:45Z","published":"2023-11-20T20:24:45Z","title":"Conditional Modeling Based Automatic Video Summarization","summary":" The aim of video summarization is to shorten videos automatically while\nretaining the key information necessary to convey the overall story. Video\nsummarization methods mainly rely on visual factors, such as visual\nconsecutiveness and diversity, which may not be sufficient to fully understand\nthe content of the video. There are other non-visual factors, such as\ninterestingness, representativeness, and storyline consistency that should also\nbe considered for generating high-quality video summaries. Current methods do\nnot adequately take into account these non-visual factors, resulting in\nsuboptimal performance. In this work, a new approach to video summarization is\nproposed based on insights gained from how humans create ground truth video\nsummaries. The method utilizes a conditional modeling perspective and\nintroduces multiple meaningful random variables and joint distributions to\ncharacterize the key components of video summarization. Helper distributions\nare employed to improve the training of the model. A conditional attention\nmodule is designed to mitigate potential performance degradation in the\npresence of multi-modal input. The proposed video summarization method\nincorporates the above innovative design choices that aim to narrow the gap\nbetween human-generated and machine-generated video summaries. Extensive\nexperiments show that the proposed approach outperforms existing methods and\nachieves state-of-the-art performance on commonly used video summarization\ndatasets.\n","authors":["Jia-Hong Huang","Chao-Han Huck Yang","Pin-Yu Chen","Min-Hung Chen","Marcel Worring"],"pdf_url":"https://arxiv.org/pdf/2311.12159v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n arXiv admin note: substantial text overlap with arXiv:2305.00455"},{"id":"http://arxiv.org/abs/2311.11642v1","updated":"2023-11-20T10:01:13Z","published":"2023-11-20T10:01:13Z","title":"Video Face Re-Aging: Toward Temporally Consistent Face Re-Aging","summary":" Video face re-aging deals with altering the apparent age of a person to the\ntarget age in videos. This problem is challenging due to the lack of paired\nvideo datasets maintaining temporal consistency in identity and age. Most\nre-aging methods process each image individually without considering the\ntemporal consistency of videos. While some existing works address the issue of\ntemporal coherence through video facial attribute manipulation in latent space,\nthey often fail to deliver satisfactory performance in age transformation. To\ntackle the issues, we propose (1) a novel synthetic video dataset that features\nsubjects across a diverse range of age groups; (2) a baseline architecture\ndesigned to validate the effectiveness of our proposed dataset, and (3) the\ndevelopment of three novel metrics tailored explicitly for evaluating the\ntemporal consistency of video re-aging techniques. 
Our comprehensive\nexperiments on public datasets, such as VFHQ and CelebV-HQ, show that our\nmethod outperforms the existing approaches in terms of both age transformation\nand temporal consistency.\n","authors":["Abdul Muqeet","Kyuchul Lee","Bumsoo Kim","Yohan Hong","Hyungrae Lee","Woonggon Kim","Kwang Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2311.11642v1.pdf","comment":"8 pages, 6 figures, 4 tables"}]},"2023-11-21T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.09387v2","updated":"2023-11-21T18:31:57Z","published":"2023-11-15T21:30:26Z","title":"Banach-Tarski Embeddings and Transformers","summary":" We introduce a new construction of embeddings of arbitrary recursive data\nstructures into high dimensional vectors. These embeddings provide an\ninterpretable model for the latent state vectors of transformers. We\ndemonstrate that these embeddings can be decoded to the original data structure\nwhen the embedding dimension is sufficiently large. This decoding algorithm has\na natural implementation as a transformer. We also show that these embedding\nvectors can be manipulated directly to perform computations on the underlying\ndata without decoding. As an example we present an algorithm that constructs\nthe embedded parse tree of an embedded token sequence using only vector\noperations in embedding space.\n","authors":["Joshua Maher"],"pdf_url":"https://arxiv.org/pdf/2311.09387v2.pdf","comment":"22 pages, 7 figures. v2: Fixed order of matrix multiplication in\n section 2.4"},{"id":"http://arxiv.org/abs/2310.02168v2","updated":"2023-11-21T18:18:49Z","published":"2023-10-03T16:02:36Z","title":"Editing Personality for LLMs","summary":" This paper introduces an innovative task focused on editing the personality\ntraits of Large Language Models (LLMs). This task seeks to adjust the models'\nresponses to opinion-related questions on specified topics since an\nindividual's personality often manifests in the form of their expressed\nopinions, thereby showcasing different personality traits. Specifically, we\nconstruct a new benchmark dataset PersonalityEdit to address this task. Drawing\non the theory in Social Psychology, we isolate three representative traits,\nnamely Neuroticism, Extraversion, and Agreeableness, as the foundation for our\nbenchmark. We then gather data using GPT-4, generating responses that not only\nalign with a specified topic but also embody the targeted personality trait. We\nconduct comprehensive experiments involving various baselines and discuss the\nrepresentation of personality behavior in LLMs. Our intriguing findings uncover\npotential challenges of the proposed task, illustrating several remaining\nissues. We anticipate that our work can provide the NLP community with\ninsights. Code and datasets will be released at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Shengyu Mao","Ningyu Zhang","Xiaohan Wang","Mengru Wang","Yunzhi Yao","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02168v2.pdf","comment":"Work in progress, add more experiments"},{"id":"http://arxiv.org/abs/2310.02129v2","updated":"2023-11-21T17:59:04Z","published":"2023-10-03T15:10:46Z","title":"Unveiling the Pitfalls of Knowledge Editing for Large Language Models","summary":" As the cost associated with fine-tuning Large Language Models (LLMs)\ncontinues to rise, recent research efforts have pivoted towards developing\nmethodologies to edit implicit knowledge embedded within LLMs. 
Yet, there's\nstill a dark cloud lingering overhead -- will knowledge editing trigger a\nbutterfly effect? It is still unclear whether knowledge editing might\nintroduce side effects that pose potential risks. This paper pioneers\nthe investigation into the potential pitfalls associated with knowledge editing\nfor LLMs. To achieve this, we introduce new benchmark datasets and propose\ninnovative evaluation metrics. Our results underline two pivotal concerns: (1)\nKnowledge Conflict: Editing groups of facts that logically clash can magnify\nthe inherent inconsistencies in LLMs -- a facet neglected by previous methods. (2)\nKnowledge Distortion: Altering parameters with the aim of editing factual\nknowledge can irrevocably warp the innate knowledge structure of LLMs.\nExperimental results vividly demonstrate that knowledge editing might\ninadvertently cast a shadow of unintended consequences on LLMs, which warrant\nattention and effort in future work. Code is available at\nhttps://github.com/zjunlp/PitfallsKnowledgeEditing.\n","authors":["Zhoubo Li","Ningyu Zhang","Yunzhi Yao","Mengru Wang","Xi Chen","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02129v2.pdf","comment":"Work in progress, add more experiments"},{"id":"http://arxiv.org/abs/2311.12735v1","updated":"2023-11-21T17:21:15Z","published":"2023-11-21T17:21:15Z","title":"LowResource at BLP-2023 Task 2: Leveraging BanglaBert for Low Resource\n Sentiment Analysis of Bangla Language","summary":" This paper describes the system of the LowResource Team for Task 2 of\nBLP-2023, which involves conducting sentiment analysis on a dataset composed of\npublic posts and comments from diverse social media platforms. Our primary aim\nis to utilize BanglaBert, a BERT model pre-trained on a large Bangla corpus,\nusing various strategies including fine-tuning, dropping random tokens, and\nusing several external datasets. Our final model is an ensemble of the three\nbest BanglaBert variations. Our system achieved 3rd place overall on the Test Set\namong 30 participating teams with a score of 0.718. Additionally, we discuss\nthe promising systems that did not perform well, namely task-adaptive pretraining\nand paraphrasing using BanglaT5. Training code and the external datasets\nused for our system are publicly available at\nhttps://github.com/Aunabil4602/bnlp-workshop-task2-2023\n","authors":["Aunabil Chakma","Masum Hasan"],"pdf_url":"https://arxiv.org/pdf/2311.12735v1.pdf","comment":"Accepted at BLP Workshop @EMNLP2023"},{"id":"http://arxiv.org/abs/2311.12727v1","updated":"2023-11-21T17:03:21Z","published":"2023-11-21T17:03:21Z","title":"Soft Random Sampling: A Theoretical and Empirical Analysis","summary":" Soft random sampling (SRS) is a simple yet effective approach for efficient\ntraining of large-scale deep neural networks when dealing with massive data.\nSRS selects a subset uniformly at random with replacement from the full data\nset in each epoch. In this paper, we conduct a theoretical and empirical\nanalysis of SRS. First, we analyze its sampling dynamics including data\ncoverage and occupancy. Next, we investigate its convergence with non-convex\nobjective functions and give the convergence rate. Finally, we provide its\ngeneralization performance. We empirically evaluate SRS for image recognition\non CIFAR10 and automatic speech recognition on Librispeech and an in-house\npayload dataset to demonstrate its effectiveness. 
Compared to existing\ncoreset-based data selection methods, SRS offers a better accuracy-efficiency\ntrade-off. Especially on real-world industrial scale data sets, it is shown to\nbe a powerful training strategy with significant speedup and competitive\nperformance with almost no additional computing cost.\n","authors":["Xiaodong Cui","Ashish Mittal","Songtao Lu","Wei Zhang","George Saon","Brian Kingsbury"],"pdf_url":"https://arxiv.org/pdf/2311.12727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.10852v6","updated":"2023-11-21T16:36:43Z","published":"2022-05-22T15:30:18Z","title":"Relphormer: Relational Graph Transformer for Knowledge Graph\n Representations","summary":" Transformers have achieved remarkable performance in widespread fields,\nincluding natural language processing, computer vision and graph mining.\nHowever, vanilla Transformer architectures have not yielded promising\nimprovements in the Knowledge Graph (KG) representations, where the\ntranslational distance paradigm dominates this area. Note that vanilla\nTransformer architectures struggle to capture the intrinsically heterogeneous\nstructural and semantic information of knowledge graphs. To this end, we\npropose a new variant of Transformer for knowledge graph representations dubbed\nRelphormer. Specifically, we introduce Triple2Seq which can dynamically sample\ncontextualized sub-graph sequences as the input to alleviate the heterogeneity\nissue. We propose a novel structure-enhanced self-attention mechanism to encode\nthe relational information and keep the semantic information within entities\nand relations. Moreover, we utilize masked knowledge modeling for general\nknowledge graph representation learning, which can be applied to various\nKG-based tasks including knowledge graph completion, question answering, and\nrecommendation. Experimental results on six datasets show that Relphormer can\nobtain better performance compared with baselines. Code is available in\nhttps://github.com/zjunlp/Relphormer.\n","authors":["Zhen Bi","Siyuan Cheng","Jing Chen","Xiaozhuan Liang","Feiyu Xiong","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2205.10852v6.pdf","comment":"Neurocomputing 2023"},{"id":"http://arxiv.org/abs/2306.17103v3","updated":"2023-11-21T16:32:41Z","published":"2023-06-29T17:01:51Z","title":"LyricWhiz: Robust Multilingual Zero-shot Lyrics Transcription by\n Whispering to ChatGPT","summary":" We introduce LyricWhiz, a robust, multilingual, and zero-shot automatic\nlyrics transcription method achieving state-of-the-art performance on various\nlyrics transcription datasets, even in challenging genres such as rock and\nmetal. Our novel, training-free approach utilizes Whisper, a weakly supervised\nrobust speech recognition model, and GPT-4, today's most performant chat-based\nlarge language model. In the proposed method, Whisper functions as the \"ear\" by\ntranscribing the audio, while GPT-4 serves as the \"brain,\" acting as an\nannotator with a strong performance for contextualized output selection and\ncorrection. Our experiments show that LyricWhiz significantly reduces Word\nError Rate compared to existing methods in English and can effectively\ntranscribe lyrics across multiple languages. Furthermore, we use LyricWhiz to\ncreate the first publicly available, large-scale, multilingual lyrics\ntranscription dataset with a CC-BY-NC-SA copyright license, based on\nMTG-Jamendo, and offer a human-annotated subset for noise level estimation and\nevaluation. 
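The Soft Random Sampling abstract above states the procedure explicitly: in each epoch, a subset is drawn uniformly at random with replacement from the full dataset and used for training. A minimal NumPy sketch of that loop follows; the selection ratio and the train_step callback are placeholders, not values or APIs from the paper.

import numpy as np

def srs_epoch_indices(num_examples, selection_ratio=0.3, rng=None):
    """Draw a subset uniformly at random *with replacement*, as SRS prescribes."""
    rng = rng or np.random.default_rng()
    subset_size = int(selection_ratio * num_examples)
    return rng.integers(0, num_examples, size=subset_size)   # indices may repeat

def train_with_srs(dataset, num_epochs, selection_ratio, train_step):
    """Illustrative training skeleton: each epoch sees a fresh random subset."""
    rng = np.random.default_rng(0)
    for epoch in range(num_epochs):
        for i in srs_epoch_indices(len(dataset), selection_ratio, rng):
            train_step(dataset[i])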
We anticipate that our proposed method and dataset will advance the\ndevelopment of multilingual lyrics transcription, a challenging and emerging\ntask.\n","authors":["Le Zhuo","Ruibin Yuan","Jiahao Pan","Yinghao Ma","Yizhi LI","Ge Zhang","Si Liu","Roger Dannenberg","Jie Fu","Chenghua Lin","Emmanouil Benetos","Wenhu Chen","Wei Xue","Yike Guo"],"pdf_url":"https://arxiv.org/pdf/2306.17103v3.pdf","comment":"9 pages, 2 figures, 5 tables, accepted by ISMIR 2023"},{"id":"http://arxiv.org/abs/2311.12707v1","updated":"2023-11-21T16:20:49Z","published":"2023-11-21T16:20:49Z","title":"Keeping Users Engaged During Repeated Administration of the Same\n Questionnaire: Using Large Language Models to Reliably Diversify Questions","summary":" Standardized, validated questionnaires are vital tools in HCI research and\nhealthcare, offering dependable self-report data. However, their repeated use\nin longitudinal or pre-post studies can induce respondent fatigue, impacting\ndata quality via response biases and decreased response rates. We propose\nutilizing large language models (LLMs) to generate diverse questionnaire\nversions while retaining good psychometric properties. In a longitudinal study,\nparticipants engaged with our agent system and responded daily for two weeks to\neither a standardized depression questionnaire or one of two LLM-generated\nquestionnaire variants, alongside a validated depression questionnaire.\nPsychometric testing revealed consistent covariation between the external\ncriterion and the focal measure administered across the three conditions,\ndemonstrating the reliability and validity of the LLM-generated variants.\nParticipants found the repeated administration of the standardized\nquestionnaire significantly more repetitive compared to the variants. Our\nfindings highlight the potential of LLM-generated variants to invigorate\nquestionnaires, fostering engagement and interest without compromising\nvalidity.\n","authors":["Hye Sun Yun","Mehdi Arjmand","Phillip Raymond Sherlock","Michael Paasche-Orlow","James W. Griffith","Timothy Bickmore"],"pdf_url":"https://arxiv.org/pdf/2311.12707v1.pdf","comment":"22 pages, preprint"},{"id":"http://arxiv.org/abs/2311.12699v1","updated":"2023-11-21T16:03:51Z","published":"2023-11-21T16:03:51Z","title":"Can Large Language Models Understand Content and Propagation for\n Misinformation Detection: An Empirical Study","summary":" Large Language Models (LLMs) have garnered significant attention for their\npowerful ability in natural language understanding and reasoning. In this\npaper, we present a comprehensive empirical study to explore the performance of\nLLMs on misinformation detection tasks. This study stands as the pioneering\ninvestigation into the understanding capabilities of multiple LLMs regarding\nboth content and propagation across social media platforms. Our empirical\nstudies on five misinformation detection datasets show that LLMs with diverse\nprompts achieve comparable performance in text-based misinformation detection\nbut exhibit notably constrained capabilities in comprehending propagation\nstructure compared to existing models in propagation-based misinformation\ndetection. Besides, we further design four instruction-tuned strategies to\nenhance LLMs for both content and propagation-based misinformation detection.\nThese strategies boost LLMs to actively learn effective features from multiple\ninstances or hard instances, and eliminate irrelevant propagation structures,\nthereby achieving better detection performance. 
Extensive experiments further\ndemonstrate LLMs would play a better capacity in content and propagation\nstructure under these proposed strategies and achieve promising detection\nperformance. These findings highlight the potential ability of LLMs to detect\nmisinformation.\n","authors":["Mengyang Chen","Lingwei Wei","Han Cao","Wei Zhou","Songlin Hu"],"pdf_url":"https://arxiv.org/pdf/2311.12699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12689v1","updated":"2023-11-21T15:51:06Z","published":"2023-11-21T15:51:06Z","title":"Fair Text Classification with Wasserstein Independence","summary":" Group fairness is a central research topic in text classification, where\nreaching fair treatment between sensitive groups (e.g. women vs. men) remains\nan open challenge. This paper presents a novel method for mitigating biases in\nneural text classification, agnostic to the model architecture. Considering the\ndifficulty to distinguish fair from unfair information in a text encoder, we\ntake inspiration from adversarial training to induce Wasserstein independence\nbetween representations learned to predict our target label and the ones\nlearned to predict some sensitive attribute. Our approach provides two\nsignificant advantages. Firstly, it does not require annotations of sensitive\nattributes in both testing and training data. This is more suitable for\nreal-life scenarios compared to existing methods that require annotations of\nsensitive attributes at train time. Second, our approach exhibits a comparable\nor better fairness-accuracy trade-off compared to existing methods.\n","authors":["Thibaud Leteno","Antoine Gourru","Charlotte Laclau","Rémi Emonet","Christophe Gravier"],"pdf_url":"https://arxiv.org/pdf/2311.12689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.12311v3","updated":"2023-11-21T15:40:54Z","published":"2020-12-22T19:32:52Z","title":"Influencer Videos: Unboxing the Mystique","summary":" Influencer marketing has become a very popular tool to reach customers.\nDespite the rapid growth in influencer videos, there has been little research\non the effectiveness of their constituent features in explaining video\nengagement. We study YouTube influencers and analyze their unstructured video\ndata across text, audio and images using an \"interpretable deep learning\"\nframework that accomplishes both goals of prediction and interpretation. Our\nprediction-based approach analyzes unstructured data and finds that \"what is\nsaid\" in words (text) is more influential than \"how it is said\" in imagery\n(images) or acoustics (audio). Our novel interpretation-based approach is\nimplemented after completion of model prediction by analyzing the same source\nof unstructured data to measure importance attributed to the video features. We\neliminate several spurious relationships in two steps, identifying a subset of\nrelationships which are confirmed using theory. We uncover novel findings that\nestablish distinct associations for measures of shallow and deep engagement\nbased on the dual-system framework of human thinking. 
Our approach is validated\nusing simulated data, and we discuss the learnings from our findings for\ninfluencers and brands.\n","authors":["Prashant Rajaram","Puneet Manchanda"],"pdf_url":"https://arxiv.org/pdf/2012.12311v3.pdf","comment":"45 pages, Online Appendix"},{"id":"http://arxiv.org/abs/2311.12664v1","updated":"2023-11-21T15:14:54Z","published":"2023-11-21T15:14:54Z","title":"The DURel Annotation Tool: Human and Computational Measurement of\n Semantic Proximity, Sense Clusters and Semantic Change","summary":" We present the DURel tool that implements the annotation of semantic\nproximity between uses of words into an online, open source interface. The tool\nsupports standardized human annotation as well as computational annotation,\nbuilding on recent advances with Word-in-Context models. Annotator judgments\nare clustered with automatic graph clustering techniques and visualized for\nanalysis. This allows to measure word senses with simple and intuitive\nmicro-task judgments between use pairs, requiring minimal preparation efforts.\nThe tool offers additional functionalities to compare the agreement between\nannotators to guarantee the inter-subjectivity of the obtained judgments and to\ncalculate summary statistics giving insights into sense frequency\ndistributions, semantic variation or changes of senses over time.\n","authors":["Dominik Schlechtweg","Shafqat Mumtaz Virk","Pauline Sander","Emma Sköldberg","Lukas Theuer Linke","Tuo Zhang","Nina Tahmasebi","Jonas Kuhn","Sabine Schulte im Walde"],"pdf_url":"https://arxiv.org/pdf/2311.12664v1.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2311.12649v1","updated":"2023-11-21T14:49:00Z","published":"2023-11-21T14:49:00Z","title":"MathGloss: Building mathematical glossaries from text","summary":" MathGloss is a project to create a knowledge graph (KG) for undergraduate\nmathematics from text, automatically, using modern natural language processing\n(NLP) tools and resources already available on the web. MathGloss is a linked\ndatabase of undergraduate concepts in mathematics. So far, it combines five\nresources: (i) Wikidata, a collaboratively edited, multilingual knowledge graph\nhosted by the Wikimedia Foundation, (ii) terms covered in mathematics courses\nat the University of Chicago, (iii) the syllabus of the French undergraduate\nmathematics curriculum which includes hyperlinks to the automated theorem\nprover Lean 4, (iv) MuLiMa, a multilingual dictionary of mathematics curated by\nmathematicians, and (v) the nLab, a wiki for category theory also curated by\nmathematicians. MathGloss's goal is to bring together resources for learning\nmathematics and to allow every mathematician to tailor their learning to their\nown preferences. Moreover, by organizing different resources for learning\nundergraduate mathematics alongside those for learning formal mathematics, we\nhope to make it easier for mathematicians and formal tools (theorem provers,\ncomputer algebra systems, etc) experts to \"understand\" each other and break\ndown some of the barriers to formal math.\n","authors":["Lucy Horowitz","Valeria de Paiva"],"pdf_url":"https://arxiv.org/pdf/2311.12649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01446v3","updated":"2023-11-21T14:02:33Z","published":"2023-09-04T08:54:20Z","title":"Open Sesame! 
Universal Black Box Jailbreaking of Large Language Models","summary":" Large language models (LLMs), designed to provide helpful and safe responses,\noften rely on alignment techniques to align with user intent and social\nguidelines. Unfortunately, this alignment can be exploited by malicious actors\nseeking to manipulate an LLM's outputs for unintended purposes. In this paper\nwe introduce a novel approach that employs a genetic algorithm (GA) to\nmanipulate LLMs when model architecture and parameters are inaccessible. The GA\nattack works by optimizing a universal adversarial prompt that -- when combined\nwith a user's query -- disrupts the attacked model's alignment, resulting in\nunintended and potentially harmful outputs. Our novel approach systematically\nreveals a model's limitations and vulnerabilities by uncovering instances where\nits responses deviate from expected behavior. Through extensive experiments we\ndemonstrate the efficacy of our technique, thus contributing to the ongoing\ndiscussion on responsible AI development by providing a diagnostic tool for\nevaluating and enhancing alignment of LLMs with human intent. To our knowledge\nthis is the first automated universal black box jailbreak attack.\n","authors":["Raz Lapid","Ron Langberg","Moshe Sipper"],"pdf_url":"https://arxiv.org/pdf/2309.01446v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12574v1","updated":"2023-11-21T12:40:01Z","published":"2023-11-21T12:40:01Z","title":"IMGTB: A Framework for Machine-Generated Text Detection Benchmarking","summary":" In the era of large language models generating high quality texts, it is a\nnecessity to develop methods for detection of machine-generated text to avoid\nharmful use or simply due to annotation purposes. It is, however, also\nimportant to properly evaluate and compare such developed methods. Recently, a\nfew benchmarks have been proposed for this purpose; however, integration of\nnewest detection methods is rather challenging, since new methods appear each\nmonth and provide slightly different evaluation pipelines. In this paper, we\npresent the IMGTB framework, which simplifies the benchmarking of\nmachine-generated text detection methods by easy integration of custom (new)\nmethods and evaluation datasets. Its configurability and flexibility makes\nresearch and development of new detection methods easier, especially their\ncomparison to the existing state-of-the-art detectors. The default set of\nanalyses, metrics and visualizations offered by the tool follows the\nestablished practices of machine-generated text detection benchmarking found in\nstate-of-the-art literature.\n","authors":["Michal Spiegel","Dominik Macko"],"pdf_url":"https://arxiv.org/pdf/2311.12574v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12538v1","updated":"2023-11-21T11:33:03Z","published":"2023-11-21T11:33:03Z","title":"In-Context Learning Functions with Varying Number of Minima","summary":" Large Language Models (LLMs) have proven effective at In-Context Learning\n(ICL), an ability that allows them to create predictors from labeled examples.\nFew studies have explored the interplay between ICL and specific properties of\nfunctions it attempts to approximate. In our study, we use a formal framework\nto explore ICL and propose a new task of approximating functions with varying\nnumber of minima. We implement a method that allows for producing functions\nwith given inputs as minima. We find that increasing the number of minima\ndegrades ICL performance. 
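For the in-context learning study above, which requires generating functions with a prescribed set of minima, one simple construction (not necessarily the paper's) is f(x) = prod_i (x - m_i)^2: the function is non-negative and equals 0 at every prescribed point m_i, so each m_i is a local and global minimum. A small NumPy sketch:

import numpy as np

def make_function_with_minima(minima):
    """Return f(x) = prod_i (x - m_i)^2, which has a minimum at every m_i.
    One illustrative construction; the paper's actual recipe may differ."""
    minima = np.asarray(minima, dtype=float)
    def f(x):
        x = np.asarray(x, dtype=float)
        return np.prod((x[..., None] - minima) ** 2, axis=-1)
    return f

f = make_function_with_minima([-2.0, 0.5, 3.0])
print(f(np.linspace(-3, 4, 8)))   # evaluate on a grid
print(f([-2.0, 0.5]))             # exactly 0 at the specified minima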
At the same time, our evaluation shows that ICL\noutperforms 2-layer Neural Network (2NN) model. Furthermore, ICL learns faster\nthan 2NN in all settings. We validate the findings through a set of few-shot\nexperiments across various hyperparameter configurations.\n","authors":["David Oniani","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12537v1","updated":"2023-11-21T11:32:23Z","published":"2023-11-21T11:32:23Z","title":"Oasis: Data Curation and Assessment System for Pretraining of Large\n Language Models","summary":" Data is one of the most critical elements in building a large language model.\nHowever, existing systems either fail to customize a corpus curation pipeline\nor neglect to leverage comprehensive corpus assessment for iterative\noptimization of the curation. To this end, we present a pretraining corpus\ncuration and assessment platform called Oasis -- a one-stop system for data\nquality improvement and quantification with user-friendly interactive\ninterfaces. Specifically, the interactive modular rule filter module can devise\ncustomized rules according to explicit feedback. The debiased neural filter\nmodule builds the quality classification dataset in a negative-centric manner\nto remove the undesired bias. The adaptive document deduplication module could\nexecute large-scale deduplication with limited memory resources. These three\nparts constitute the customized data curation module. And in the holistic data\nassessment module, a corpus can be assessed in local and global views, with\nthree evaluation means including human, GPT-4, and heuristic metrics. We\nexhibit a complete process to use Oasis for the curation and assessment of\npretraining data. In addition, an 800GB bilingual corpus curated by Oasis is\npublicly released.\n","authors":["Tong Zhou","Yubo Chen","Pengfei Cao","Kang Liu","Jun Zhao","Shengping Liu"],"pdf_url":"https://arxiv.org/pdf/2311.12537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05140v2","updated":"2023-11-21T11:28:24Z","published":"2023-10-08T12:21:24Z","title":"Harnessing the Power of Large Language Models for Empathetic Response\n Generation: Empirical Investigations and Improvements","summary":" Empathetic dialogue is an indispensable part of building harmonious social\nrelationships and contributes to the development of a helpful AI. Previous\napproaches are mainly based on fine small-scale language models. With the\nadvent of ChatGPT, the application effect of large language models (LLMs) in\nthis field has attracted great attention. This work empirically investigates\nthe performance of LLMs in generating empathetic responses and proposes three\nimprovement methods of semantically similar in-context learning, two-stage\ninteractive generation, and combination with the knowledge base. Extensive\nexperiments show that LLMs can significantly benefit from our proposed methods\nand is able to achieve state-of-the-art performance in both automatic and human\nevaluations. 
Additionally, we explore the possibility of GPT-4 simulating human\nevaluators.\n","authors":["Yushan Qian","Wei-Nan Zhang","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2310.05140v2.pdf","comment":"the Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.12534v1","updated":"2023-11-21T11:26:26Z","published":"2023-11-21T11:26:26Z","title":"Evaluation Metrics of Language Generation Models for Synthetic Traffic\n Generation Tasks","summary":" Many Natural Language Generation (NLG) tasks aim to generate a single output\ntext given an input prompt. Other settings require the generation of multiple\ntexts, e.g., for Synthetic Traffic Generation (STG). This generation task is\ncrucial for training and evaluating QA systems as well as conversational\nagents, where the goal is to generate multiple questions or utterances\nresembling the linguistic variability of real users. In this paper, we show\nthat common NLG metrics, like BLEU, are not suitable for evaluating STG. We\npropose and evaluate several metrics designed to compare the generated traffic\nto the distribution of real user texts. We validate our metrics with an\nautomatic procedure to verify whether they capture different types of quality\nissues of generated data; we also run human annotations to verify the\ncorrelation with human judgements. Experiments on three tasks, i.e., Shopping\nUtterance Generation, Product Question Generation and Query Auto Completion,\ndemonstrate that our metrics are effective for evaluating STG tasks, and\nimprove the agreement with human judgement up to 20% with respect to common NLG\nmetrics. We believe these findings can pave the way towards better solutions\nfor estimating the representativeness of synthetic text data.\n","authors":["Simone Filice","Jason Ingyu Choi","Giuseppe Castellucci","Eugene Agichtein","Oleg Rokhlenko"],"pdf_url":"https://arxiv.org/pdf/2311.12534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18098v3","updated":"2023-11-21T11:01:24Z","published":"2023-05-29T14:07:52Z","title":"BigTranslate: Augmenting Large Language Models with Multilingual\n Translation Capability over 100 Languages","summary":" Large language models (LLMs) demonstrate promising translation performance\namong various natural languages. However, many LLMs especially the open-sourced\nones, such as BLOOM and LLaMA, are English-dominant and support only dozens of\nnatural languages, making the potential of LLMs on language translation less\nexplored. In this work, we present BigTranslate which adapts LLaMA that covers\nonly 20 languages and enhances it with multilingual translation capability on\nmore than 100 languages. BigTranslate is built upon LLaMA-13B and it is\noptimized in three steps. First, we continue training LLaMA with massive\nChinese monolingual data. Second, we continue training the model with a\nlarge-scale parallel dataset that covers 102 natural languages. Third, we\ninstruct-tune the foundation model with multilingual translation instructions,\nleading to our BigTranslate model. The preliminary experiments on multilingual\ntranslation show that BigTranslate performs comparably with ChatGPT and Google\nTranslate in many languages and even outperforms ChatGPT in 8 language pairs.\nWe release the BigTranslate model and hope it can advance the research\nprogress.\n","authors":["Wen Yang","Chong Li","Jiajun Zhang","Chengqing Zong"],"pdf_url":"https://arxiv.org/pdf/2305.18098v3.pdf","comment":"16 pages, 4 figures. 
Our model is available at\n https://github.com/ZNLP/BigTranslate"},{"id":"http://arxiv.org/abs/2311.12489v1","updated":"2023-11-21T09:59:29Z","published":"2023-11-21T09:59:29Z","title":"Multilingual Word Embeddings for Low-Resource Languages using Anchors\n and a Chain of Related Languages","summary":" Very low-resource languages, having only a few million tokens worth of data,\nare not well-supported by multilingual NLP approaches due to poor quality\ncross-lingual word representations. Recent work showed that good cross-lingual\nperformance can be achieved if a source language is related to the low-resource\ntarget language. However, not all language pairs are related. In this paper, we\npropose to build multilingual word embeddings (MWEs) via a novel language\nchain-based approach, that incorporates intermediate related languages to\nbridge the gap between the distant source and target. We build MWEs one\nlanguage at a time by starting from the resource rich source and sequentially\nadding each language in the chain till we reach the target. We extend a\nsemi-joint bilingual approach to multiple languages in order to eliminate the\nmain weakness of previous works, i.e., independently trained monolingual\nembeddings, by anchoring the target language around the multilingual space. We\nevaluate our method on bilingual lexicon induction for 4 language families,\ninvolving 4 very low-resource (<5M tokens) and 4 moderately low-resource (<50M)\ntarget languages, showing improved performance in both categories.\nAdditionally, our analysis reveals the importance of good quality embeddings\nfor intermediate languages as well as the importance of leveraging anchor\npoints from all languages in the multilingual space.\n","authors":["Viktor Hangya","Silvia Severini","Radoslav Ralev","Alexander Fraser","Hinrich Schütze"],"pdf_url":"https://arxiv.org/pdf/2311.12489v1.pdf","comment":"Accepted at the MRL 2023 workshop"},{"id":"http://arxiv.org/abs/2310.17940v3","updated":"2023-11-21T09:47:04Z","published":"2023-10-27T07:34:51Z","title":"Unified Segment-to-Segment Framework for Simultaneous Sequence\n Generation","summary":" Simultaneous sequence generation is a pivotal task for real-time scenarios,\nsuch as streaming speech recognition, simultaneous machine translation and\nsimultaneous speech translation, where the target sequence is generated while\nreceiving the source sequence. The crux of achieving high-quality generation\nwith low latency lies in identifying the optimal moments for generating,\naccomplished by learning a mapping between the source and target sequences.\nHowever, existing methods often rely on task-specific heuristics for different\nsequence types, limiting the model's capacity to adaptively learn the\nsource-target mapping and hindering the exploration of multi-task learning for\nvarious simultaneous tasks. In this paper, we propose a unified\nsegment-to-segment framework (Seg2Seg) for simultaneous sequence generation,\nwhich learns the mapping in an adaptive and unified manner. During the process\nof simultaneous generation, the model alternates between waiting for a source\nsegment and generating a target segment, making the segment serve as the\nnatural bridge between the source and target. To accomplish this, Seg2Seg\nintroduces a latent segment as the pivot between source to target and explores\nall potential source-target mappings via the proposed expectation training,\nthereby learning the optimal moments for generating. 
Experiments on multiple\nsimultaneous generation tasks demonstrate that Seg2Seg achieves\nstate-of-the-art performance and exhibits better generality across various\ntasks.\n","authors":["Shaolei Zhang","Yang Feng"],"pdf_url":"https://arxiv.org/pdf/2310.17940v3.pdf","comment":"Grammatical errors prevent the article from being indexed. This is\n not a problem that can be solved by replacing a new version"},{"id":"http://arxiv.org/abs/2311.12480v1","updated":"2023-11-21T09:44:33Z","published":"2023-11-21T09:44:33Z","title":"Speaker-Adapted End-to-End Visual Speech Recognition for Continuous\n Spanish","summary":" Different studies have shown the importance of visual cues throughout the\nspeech perception process. In fact, the development of audiovisual approaches\nhas led to advances in the field of speech technologies. However, although\nnoticeable results have recently been achieved, visual speech recognition\nremains an open research problem. It is a task in which, by dispensing with the\nauditory sense, challenges such as visual ambiguities and the complexity of\nmodeling silence must be faced. Nonetheless, some of these challenges can be\nalleviated when the problem is approached from a speaker-dependent perspective.\nThus, this paper studies, using the Spanish LIP-RTVE database, how the\nestimation of specialized end-to-end systems for a specific person could affect\nthe quality of speech recognition. First, different adaptation strategies based\non the fine-tuning technique were proposed. Then, a pre-trained CTC/Attention\narchitecture was used as a baseline throughout our experiments. Our findings\nshowed that a two-step fine-tuning process, where the VSR system is first\nadapted to the task domain, provided significant improvements when the speaker\nadaptation was addressed. Furthermore, results comparable to the current state\nof the art were reached even when only a limited amount of data was available.\n","authors":["David Gimeno-Gómez","Carlos-D. Martínez-Hinarejos"],"pdf_url":"https://arxiv.org/pdf/2311.12480v1.pdf","comment":"Accepted in Proceedings of IberSpeech 2022 (\n https://www.isca-speech.org/archive/iberspeech_2022/gimenogomez22_iberspeech.html\n )"},{"id":"http://arxiv.org/abs/2311.12475v1","updated":"2023-11-21T09:37:42Z","published":"2023-11-21T09:37:42Z","title":"PhayaThaiBERT: Enhancing a Pretrained Thai Language Model with\n Unassimilated Loanwords","summary":" While WangchanBERTa has become the de facto standard in transformer-based\nThai language modeling, it still has shortcomings in regard to the\nunderstanding of foreign words, most notably English words, which are often\nborrowed without orthographic assimilation into Thai in many contexts. We\nidentify the lack of foreign vocabulary in WangchanBERTa's tokenizer as the\nmain source of these shortcomings. We then expand WangchanBERTa's vocabulary\nvia vocabulary transfer from XLM-R's pretrained tokenizer and pretrain a new\nmodel using the expanded tokenizer, starting from WangchanBERTa's checkpoint,\non a new dataset that is larger than the one used to train WangchanBERTa. Our\nresults show that our new pretrained model, PhayaThaiBERT, outperforms\nWangchanBERTa in many downstream tasks and datasets.\n","authors":["Panyut Sriwirote","Jalinee Thapiang","Vasan Timtong","Attapol T. 
Rutherford"],"pdf_url":"https://arxiv.org/pdf/2311.12475v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12474v1","updated":"2023-11-21T09:36:11Z","published":"2023-11-21T09:36:11Z","title":"CSMeD: Bridging the Dataset Gap in Automated Citation Screening for\n Systematic Literature Reviews","summary":" Systematic literature reviews (SLRs) play an essential role in summarising,\nsynthesising and validating scientific evidence. In recent years, there has\nbeen a growing interest in using machine learning techniques to automate the\nidentification of relevant studies for SLRs. However, the lack of standardised\nevaluation datasets makes comparing the performance of such automated\nliterature screening systems difficult. In this paper, we analyse the citation\nscreening evaluation datasets, revealing that many of the available datasets\nare either too small, suffer from data leakage or have limited applicability to\nsystems treating automated literature screening as a classification task, as\nopposed to, for example, a retrieval or question-answering task. To address\nthese challenges, we introduce CSMeD, a meta-dataset consolidating nine\npublicly released collections, providing unified access to 325 SLRs from the\nfields of medicine and computer science. CSMeD serves as a comprehensive\nresource for training and evaluating the performance of automated citation\nscreening models. Additionally, we introduce CSMeD-FT, a new dataset designed\nexplicitly for evaluating the full text publication screening task. To\ndemonstrate the utility of CSMeD, we conduct experiments and establish\nbaselines on new datasets.\n","authors":["Wojciech Kusa","Oscar E. Mendoza","Matthias Samwald","Petr Knoth","Allan Hanbury"],"pdf_url":"https://arxiv.org/pdf/2311.12474v1.pdf","comment":"Accepted at NeurIPS 2023 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2311.12468v1","updated":"2023-11-21T09:28:00Z","published":"2023-11-21T09:28:00Z","title":"Analysis of Visual Features for Continuous Lipreading in Spanish","summary":" During a conversation, our brain is responsible for combining information\nobtained from multiple senses in order to improve our ability to understand the\nmessage we are perceiving. Different studies have shown the importance of\npresenting visual information in these situations. Nevertheless, lipreading is\na complex task whose objective is to interpret speech when audio is not\navailable. By dispensing with a sense as crucial as hearing, it will be\nnecessary to be aware of the challenge that this lack presents. In this paper,\nwe propose an analysis of different speech visual features with the intention\nof identifying which of them is the best approach to capture the nature of lip\nmovements for natural Spanish and, in this way, dealing with the automatic\nvisual speech recognition task. In order to estimate our system, we present an\naudiovisual corpus compiled from a subset of the RTVE database, which has been\nused in the Albayz\\'in evaluations. We employ a traditional system based on\nHidden Markov Models with Gaussian Mixture Models. Results show that, although\nthe task is difficult, in restricted conditions we obtain recognition results\nwhich determine that using eigenlips in combination with deep features is the\nbest visual approach.\n","authors":["David Gimeno-Gómez","Carlos-D. 
Martínez-Hinarejos"],"pdf_url":"https://arxiv.org/pdf/2311.12468v1.pdf","comment":"Accepted in Proceedings of IberSpeech 2020 (\n https://www.isca-speech.org/archive/iberspeech_2021/gimenogomez21_iberspeech.html\n )"},{"id":"http://arxiv.org/abs/2310.18168v3","updated":"2023-11-21T09:19:03Z","published":"2023-10-27T14:27:43Z","title":"Personas as a Way to Model Truthfulness in Language Models","summary":" Large Language Models (LLMs) are trained on vast amounts of text from the\ninternet, which contains both factual and misleading information about the\nworld. Can language models discern truth from falsehood in this contradicting\ndata? Expanding on the view that LLMs can model different communicative agents,\nwe present the persona hypothesis: LLMs can cluster agents into personas using\ncommon features of their generations. For instance, a truthful persona is a\ngroup of agents that are likely to produce truthful text and that share similar\nfeatures like formal writing styles and scientific references. By modeling this\npersona, LLMs can generalize truthfulness beyond the specific contexts in which\neach agent generated the training text. For example, the model can infer that\nthe agent ``Wikipedia'' will behave truthfully on topics that were only\ngenerated by ``Science'' because they both belong to the truthful persona. We\nshow evidence for the persona hypothesis via two observations: (1) we can probe\nwhether a model's answer will be truthful before it is generated; (2)\nfinetuning a model on a set of facts improves its truthfulness on unseen\ntopics. Next, using arithmetics as a synthetic environment, we show that\nlanguage models can separate true and false statements, and generalize\ntruthfulness across agents; but only if agents in the training data share a\ntruthful generative process that enables the creation of a truthful persona.\nOverall, our findings suggest that models can exploit hierarchical structures\nin the data to learn abstract concepts like truthfulness.\n","authors":["Nitish Joshi","Javier Rando","Abulhair Saparov","Najoung Kim","He He"],"pdf_url":"https://arxiv.org/pdf/2310.18168v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12457v1","updated":"2023-11-21T09:12:21Z","published":"2023-11-21T09:12:21Z","title":"LIP-RTVE: An Audiovisual Database for Continuous Spanish in the Wild","summary":" Speech is considered as a multi-modal process where hearing and vision are\ntwo fundamentals pillars. In fact, several studies have demonstrated that the\nrobustness of Automatic Speech Recognition systems can be improved when audio\nand visual cues are combined to represent the nature of speech. In addition,\nVisual Speech Recognition, an open research problem whose purpose is to\ninterpret speech by reading the lips of the speaker, has been a focus of\ninterest in the last decades. Nevertheless, in order to estimate these systems\nin the currently Deep Learning era, large-scale databases are required. On the\nother hand, while most of these databases are dedicated to English, other\nlanguages lack sufficient resources. Thus, this paper presents a\nsemi-automatically annotated audiovisual database to deal with unconstrained\nnatural Spanish, providing 13 hours of data extracted from Spanish television.\nFurthermore, baseline results for both speaker-dependent and\nspeaker-independent scenarios are reported using Hidden Markov Models, a\ntraditional paradigm that has been widely used in the field of Speech\nTechnologies.\n","authors":["David Gimeno-Gómez","Carlos-D. 
Martínez-Hinarejos"],"pdf_url":"https://arxiv.org/pdf/2311.12457v1.pdf","comment":"Accepted in Proceedings of LREC 2022 (\n https://aclanthology.org/2022.lrec-1.294 )"},{"id":"http://arxiv.org/abs/2311.12420v1","updated":"2023-11-21T08:20:39Z","published":"2023-11-21T08:20:39Z","title":"How Far Have We Gone in Vulnerability Detection Using Large Language\n Models","summary":" As software becomes increasingly complex and prone to vulnerabilities,\nautomated vulnerability detection is critically important, yet challenging.\nGiven the significant successes of Large Language Models (LLMs) in various\ntasks, there is growing anticipation of their efficacy in vulnerability\ndetection. However, a quantitative understanding of their potential in\nvulnerability detection is still missing. To bridge this gap, we introduce a\ncomprehensive vulnerability benchmark VulBench. This benchmark aggregates\nhigh-quality data from a wide range of CTF (Capture-the-Flag) challenges and\nreal-world applications, with annotations for each vulnerable function\ndetailing the vulnerability type and its root cause. Through our experiments\nencompassing 16 LLMs and 6 state-of-the-art (SOTA) deep learning-based models\nand static analyzers, we find that several LLMs outperform traditional deep\nlearning approaches in vulnerability detection, revealing an untapped potential\nin LLMs. This work contributes to the understanding and utilization of LLMs for\nenhanced software security.\n","authors":["Zeyu Gao","Hao Wang","Yuchen Zhou","Wenyu Zhu","Chao Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12418v1","updated":"2023-11-21T08:15:01Z","published":"2023-11-21T08:15:01Z","title":"Visual Analytics for Generative Transformer Models","summary":" While transformer-based models have achieved state-of-the-art results in a\nvariety of classification and generation tasks, their black-box nature makes\nthem challenging for interpretability. In this work, we present a novel visual\nanalytical framework to support the analysis of transformer-based generative\nnetworks. In contrast to previous work, which has mainly focused on\nencoder-based models, our framework is one of the first dedicated to supporting\nthe analysis of transformer-based encoder-decoder models and decoder-only\nmodels for generative and classification tasks. Hence, we offer an intuitive\noverview that allows the user to explore different facets of the model through\ninteractive visualization. To demonstrate the feasibility and usefulness of our\nframework, we present three detailed case studies based on real-world NLP\nresearch problems.\n","authors":["Raymond Li","Ruixin Yang","Wen Xiao","Ahmed AbuRaed","Gabriel Murray","Giuseppe Carenini"],"pdf_url":"https://arxiv.org/pdf/2311.12418v1.pdf","comment":"6 pages (reference excluded), 7 figures"},{"id":"http://arxiv.org/abs/2311.12410v1","updated":"2023-11-21T07:56:30Z","published":"2023-11-21T07:56:30Z","title":"nach0: Multimodal Natural and Chemical Languages Foundation Model","summary":" Large Language Models (LLMs) have substantially driven scientific progress in\nvarious domains, and many papers have demonstrated their ability to tackle\ncomplex problems with creative solutions. Our paper introduces a new foundation\nmodel, nach0, capable of solving various chemical and biological tasks:\nbiomedical question answering, named entity recognition, molecular generation,\nmolecular synthesis, attributes prediction, and others. 
nach0 is a multi-domain\nand multi-task encoder-decoder LLM pre-trained on unlabeled text from\nscientific literature, patents, and molecule strings to incorporate a range of\nchemical and linguistic knowledge. We employed instruction tuning, where\nspecific task-related instructions are utilized to fine-tune nach0 for the\nfinal set of tasks. To train nach0 effectively, we leverage the NeMo framework,\nenabling efficient parallel optimization of both base and large model versions.\nExtensive experiments demonstrate that our model outperforms state-of-the-art\nbaselines on single-domain and cross-domain tasks. Furthermore, it can generate\nhigh-quality outputs in molecular and textual formats, showcasing its\neffectiveness in multi-domain setups.\n","authors":["Micha Livne","Zulfat Miftahutdinov","Elena Tutubalina","Maksim Kuznetsov","Daniil Polykovskiy","Annika Brundyn","Aastha Jhunjhunwala","Anthony Costa","Alex Aliper","Alex Zhavoronkov"],"pdf_url":"https://arxiv.org/pdf/2311.12410v1.pdf","comment":"Submitted to Nature Communications"},{"id":"http://arxiv.org/abs/2310.07161v2","updated":"2023-11-21T07:54:34Z","published":"2023-10-11T03:19:22Z","title":"Psychoacoustic Challenges Of Speech Enhancement On VoIP Platforms","summary":" Within the ambit of VoIP (Voice over Internet Protocol) telecommunications,\nthe complexities introduced by acoustic transformations merit rigorous\nanalysis. This research, rooted in the exploration of proprietary sender-side\ndenoising effects, meticulously evaluates platforms such as Google Meets and\nZoom. The study draws upon the Deep Noise Suppression (DNS) 2020 dataset,\nensuring a structured examination tailored to various denoising settings and\nreceiver interfaces. A methodological novelty is introduced via the Oaxaca\ndecomposition, traditionally an econometric tool, repurposed herein to analyze\nacoustic-phonetic perturbations within VoIP systems. To further ground the\nimplications of these transformations, psychoacoustic metrics, specifically\nPESQ and STOI, were harnessed to furnish a comprehensive understanding of\nspeech alterations. Cumulatively, the insights garnered underscore the\nintricate landscape of VoIP-influenced acoustic dynamics. In addition to the\nprimary findings, a multitude of metrics are reported, extending the research\npurview. Moreover, out-of-domain benchmarking for both time and time-frequency\ndomain speech enhancement models is included, thereby enhancing the depth and\napplicability of this inquiry. Repository:\ngithub.com/deepology/VoIP-DNS-Challenge\n","authors":["Joseph Konan","Ojas Bhargave","Shikhar Agnihotri","Shuo Han","Yunyang Zeng","Ankit Shah","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2310.07161v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12405v1","updated":"2023-11-21T07:50:53Z","published":"2023-11-21T07:50:53Z","title":"IndoRobusta: Towards Robustness Against Diverse Code-Mixed Indonesian\n Local Languages","summary":" Significant progress has been made on Indonesian NLP. Nevertheless,\nexploration of the code-mixing phenomenon in Indonesian is limited, despite\nmany languages being frequently mixed with Indonesian in daily conversation. In\nthis work, we explore code-mixing in Indonesian with four embedded languages,\ni.e., English, Sundanese, Javanese, and Malay; and introduce IndoRobusta, a\nframework to evaluate and improve the code-mixing robustness. 
Our analysis\nshows that the pre-training corpus bias affects the model's ability to better\nhandle Indonesian-English code-mixing when compared to other local languages,\ndespite having higher language diversity.\n","authors":["Muhammad Farid Adilazuarda","Samuel Cahyawijaya","Genta Indra Winata","Pascale Fung","Ayu Purwarianti"],"pdf_url":"https://arxiv.org/pdf/2311.12405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12404v1","updated":"2023-11-21T07:43:50Z","published":"2023-11-21T07:43:50Z","title":"InterPrompt: Interpretable Prompting for Interrelated Interpersonal Risk\n Factors in Reddit Posts","summary":" Mental health professionals and clinicians have observed the upsurge of\nmental disorders due to Interpersonal Risk Factors (IRFs). To simulate the\nhuman-in-the-loop triaging scenario for early detection of mental health\ndisorders, we recognized textual indications to ascertain these IRFs: Thwarted\nBelongingness (TBe) and Perceived Burdensomeness (PBu) within personal\nnarratives. In light of this, we use N-shot learning with the GPT-3 model on the\nIRF dataset, and underscore the importance of fine-tuning the GPT-3 model to\nincorporate the context-specific sensitivity and the interconnectedness of\ntextual cues that represent both IRFs.\n In this paper, we introduce an Interpretable Prompting (InterPrompt) method\nto boost the attention mechanism by fine-tuning the GPT-3 model. This allows a\nmore sophisticated level of language modification by adjusting the pre-trained\nweights. Our model learns to detect usual patterns and underlying connections\nacross both the IRFs, which leads to better system-level explainability and\ntrustworthiness. The results of our research demonstrate that all four variants\nof the GPT-3 model, when fine-tuned with InterPrompt, perform considerably better\nas compared to the baseline methods, both in terms of classification and\nexplanation generation.\n","authors":["MSVPJ Sathvik","Surjodeep Sarkar","Chandni Saxena","Sunghwan Sohn","Muskan Garg"],"pdf_url":"https://arxiv.org/pdf/2311.12404v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2311.12399v1","updated":"2023-11-21T07:22:48Z","published":"2023-11-21T07:22:48Z","title":"A Survey of Graph Meets Large Language Model: Progress and Future\n Directions","summary":" Graphs play a significant role in representing and analyzing complex\nrelationships in real-world applications such as citation networks, social\nnetworks, and biological data. Recently, Large Language Models (LLMs), which\nhave achieved tremendous success in various domains, have also been leveraged\nin graph-related tasks to surpass traditional Graph Neural Networks (GNNs)\nbased methods and yield state-of-the-art performance. In this survey, we first\npresent a comprehensive review and analysis of existing methods that integrate\nLLMs with graphs. First of all, we propose a new taxonomy, which organizes\nexisting methods into three categories based on the role (i.e., enhancer,\npredictor, and alignment component) played by LLMs in graph-related tasks. Then\nwe systematically survey the representative methods along the three categories\nof the taxonomy. Finally, we discuss the remaining limitations of existing\nstudies and highlight promising avenues for future research. 
The relevant\npapers are summarized and will be consistently updated at:\nhttps://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks.\n","authors":["Yuhan Li","Zhixun Li","Peisong Wang","Jia Li","Xiangguo Sun","Hong Cheng","Jeffrey Xu Yu"],"pdf_url":"https://arxiv.org/pdf/2311.12399v1.pdf","comment":"Work in progress; 13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.10777v2","updated":"2023-11-21T07:15:57Z","published":"2023-11-16T06:01:47Z","title":"A Systematic Review of Aspect-based Sentiment Analysis (ABSA): Domains,\n Methods, and Trends","summary":" Aspect-based Sentiment Analysis (ABSA) is a type of fine-grained sentiment\nanalysis (SA) that identifies aspects and the associated opinions from a given\ntext. In the digital era, ABSA gained increasing popularity and applications in\nmining opinionated text data to obtain insights and support decisions. ABSA\nresearch employs linguistic, statistical, and machine-learning approaches and\nutilises resources such as labelled datasets, aspect and sentiment lexicons and\nontology. By its nature, ABSA is domain-dependent and can be sensitive to the\nimpact of misalignment between the resource and application domains. However,\nto our knowledge, this topic has not been explored by the existing ABSA\nliterature reviews. In this paper, we present a Systematic Literature Review\n(SLR) of ABSA studies with a focus on the research application domain, dataset\ndomain, and the research methods to examine their relationships and identify\ntrends over time. Our results suggest a number of potential systemic issues in\nthe ABSA research literature, including the predominance of the\n``product/service review'' dataset domain among the majority of studies that\ndid not have a specific research application domain, coupled with the\nprevalence of dataset-reliant methods such as supervised machine learning. This\nreview makes a number of unique contributions to the ABSA research field: 1) To\nour knowledge, it is the first SLR that links the research domain, dataset\ndomain, and research method through a systematic perspective; 2) it is one of\nthe largest scoped SLR on ABSA, with 519 eligible studies filtered from 4191\nsearch results without time constraint; and 3) our review methodology adopted\nan innovative automatic filtering process based on PDF-mining, which enhanced\nscreening quality and reliability. Suggestions and our review limitations are\nalso discussed.\n","authors":["Yan Cathy Hua","Paul Denny","Katerina Taskova","Jörg Wicker"],"pdf_url":"https://arxiv.org/pdf/2311.10777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12395v1","updated":"2023-11-21T07:11:39Z","published":"2023-11-21T07:11:39Z","title":"Problems of Non-equivalent Words in Technical Translation","summary":" Translating words which do not have equivalent in target language is not easy\nand finding proper equivalent of those words are very important to render\ncorrectly and understandably, the article defines some thoughts and ideas of\nscientists on the common problems of non-equivalent words from English to\nRussian language and includes English and Russian examples and ideas of certain\nscientist. The English language is worldwide spoken and there are 1.35 billion\nEnglish speakers and over 258 million Russian speakers according to the 2021s\nstatistics. Inevitably, these billions of speakers around the world have\nconnection and they may have deal in different criteria. In order to understand\none another they need to have a pure and fully-understood language. 
This pure\nunderstanding of languages relates directly to translation knowledge, where\nlinguists and translators need to work and research to eradicate\nmisunderstanding. Misunderstandings mostly appear in non-equivalent words\nbecause there are different local and internal words like food, garment,\ncultural and traditional words and others in every nation. Truly, most of these\nwords do not have equivalents in the target language, and these words need to be\nworked on to find their equivalents in the target language to fully understand\nboth languages. However, some of these non-equivalent words are already\nprofessionally rendered to the target language but still there are many other words\nto be rendered. Hence, this research paper includes different ways and rules of\nrendering non-equivalent words from the source language to the target language.\n","authors":["Mohammad Ibrahim Qani"],"pdf_url":"https://arxiv.org/pdf/2311.12395v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2311.10770v2","updated":"2023-11-21T06:59:59Z","published":"2023-11-15T18:42:50Z","title":"Exponentially Faster Language Modelling","summary":" Language models only really need to use an exponential fraction of their\nneurons for individual inferences. As proof, we present UltraFastBERT, a BERT\nvariant that uses 0.3% of its neurons during inference while performing on par\nwith similar BERT models. UltraFastBERT selectively engages just 12 out of 4095\nneurons for each layer inference. This is achieved by replacing feedforward\nnetworks with fast feedforward networks (FFFs). While no truly efficient\nimplementation currently exists to unlock the full acceleration potential of\nconditional neural execution, we provide high-level CPU code achieving 78x\nspeedup over the optimized baseline feedforward implementation, and a PyTorch\nimplementation delivering 40x speedup over the equivalent batched feedforward\ninference. We publish our training code, benchmarking setup, and model weights.\n","authors":["Peter Belcak","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2311.10770v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12375v1","updated":"2023-11-21T06:27:25Z","published":"2023-11-21T06:27:25Z","title":"The Obscure Limitation of Modular Multilingual Language Models","summary":" We expose the limitation of modular multilingual language models (MLMs) in\nmultilingual inference scenarios with unknown languages. Existing evaluations\nof modular MLMs exclude the involvement of language identification (LID)\nmodules, which obscures the performance of real-case multilingual scenarios of\nmodular MLMs. In this work, we showcase the effect of adding LID on the\nmultilingual evaluation of modular MLMs and provide discussions for closing the\nperformance gap caused by the pipelined approach of LID and modular MLMs.\n","authors":["Muhammad Farid Adilazuarda","Samuel Cahyawijaya","Ayu Purwarianti"],"pdf_url":"https://arxiv.org/pdf/2311.12375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12373v1","updated":"2023-11-21T06:23:38Z","published":"2023-11-21T06:23:38Z","title":"Beyond Turing: A Comparative Analysis of Approaches for Detecting\n Machine-Generated Text","summary":" Significant progress has been made on text generation by pre-trained language\nmodels (PLMs), yet distinguishing between human and machine-generated text\nposes an escalating challenge. 
This paper offers an in-depth evaluation of\nthree distinct methods used to address this task: traditional shallow learning,\nLanguage Model (LM) fine-tuning, and Multilingual Model fine-tuning. These\napproaches are rigorously tested on a wide range of machine-generated texts,\nproviding a benchmark of their competence in distinguishing between\nhuman-authored and machine-authored linguistic constructs. The results reveal\nconsiderable differences in performance across methods, thus emphasizing the\ncontinued need for advancement in this crucial area of NLP. This study offers\nvaluable insights and paves the way for future research aimed at creating\nrobust and highly discriminative models.\n","authors":["Muhammad Farid Adilazuarda","Nikolaos Nektarios Arkoulis","Oleksii Chumakov"],"pdf_url":"https://arxiv.org/pdf/2311.12373v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12355v1","updated":"2023-11-21T05:15:56Z","published":"2023-11-21T05:15:56Z","title":"Utilizing Language Models for Tour Itinerary Recommendation","summary":" Tour itinerary recommendation involves planning a sequence of relevant\nPoint-of-Interest (POIs), which combines challenges from the fields of both\nOperations Research (OR) and Recommendation Systems (RS). As an OR problem,\nthere is the need to maximize a certain utility (e.g., popularity of POIs in\nthe tour) while adhering to some constraints (e.g., maximum time for the tour).\nAs a RS problem, it is heavily related to problem or filtering or ranking a\nsubset of POIs that are relevant to a user and recommending it as part of an\nitinerary. In this paper, we explore the use of language models for the task of\ntour itinerary recommendation and planning. This task has the unique\nrequirement of recommending personalized POIs relevant to users and planning\nthese POIs as an itinerary that satisfies various constraints. We discuss some\napproaches in this area, such as using word embedding techniques like Word2Vec\nand GloVe for learning POI embeddings and transformer-based techniques like\nBERT for generating\n itineraries.\n","authors":["Ngai Lam Ho","Kwan Hui Lim"],"pdf_url":"https://arxiv.org/pdf/2311.12355v1.pdf","comment":"PMAI23 @IJCAI 2023 2nd International Workshop on Process Management\n in the AI era"},{"id":"http://arxiv.org/abs/2309.17453v2","updated":"2023-11-21T05:04:49Z","published":"2023-09-29T17:59:56Z","title":"Efficient Streaming Language Models with Attention Sinks","summary":" Deploying Large Language Models (LLMs) in streaming applications such as\nmulti-round dialogue, where long interactions are expected, is urgently needed\nbut poses two major challenges. Firstly, during the decoding stage, caching\nprevious tokens' Key and Value states (KV) consumes extensive memory. Secondly,\npopular LLMs cannot generalize to longer texts than the training sequence\nlength. Window attention, where only the most recent KVs are cached, is a\nnatural approach -- but we show that it fails when the text length surpasses\nthe cache size. We observe an interesting phenomenon, namely attention sink,\nthat keeping the KV of initial tokens will largely recover the performance of\nwindow attention. In this paper, we first demonstrate that the emergence of\nattention sink is due to the strong attention scores towards initial tokens as\na ``sink'' even if they are not semantically important. 
Based on the above\nanalysis, we introduce StreamingLLM, an efficient framework that enables LLMs\ntrained with a finite length attention window to generalize to infinite\nsequence lengths without any fine-tuning. We show that StreamingLLM can enable\nLlama-2, MPT, Falcon, and Pythia to perform stable and efficient language\nmodeling with up to 4 million tokens and more. In addition, we discover that\nadding a placeholder token as a dedicated attention sink during pre-training\ncan further improve streaming deployment. In streaming settings, StreamingLLM\noutperforms the sliding window recomputation baseline by up to 22.2x speedup.\nCode and datasets are provided at https://github.com/mit-han-lab/streaming-llm.\n","authors":["Guangxuan Xiao","Yuandong Tian","Beidi Chen","Song Han","Mike Lewis"],"pdf_url":"https://arxiv.org/pdf/2309.17453v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12351v1","updated":"2023-11-21T04:59:17Z","published":"2023-11-21T04:59:17Z","title":"Advancing Transformer Architecture in Long-Context Large Language\n Models: A Comprehensive Survey","summary":" With the bomb ignited by ChatGPT, Transformer-based Large Language Models\n(LLMs) have paved a revolutionary path toward Artificial General Intelligence\n(AGI) and have been applied in diverse areas as knowledge bases, human\ninterfaces, and dynamic agents. However, a prevailing limitation exists: many\ncurrent LLMs, constrained by resources, are primarily pre-trained on shorter\ntexts, rendering them less effective for longer-context prompts, commonly\nencountered in real-world settings. In this paper, we present a comprehensive\nsurvey focusing on the advancement of model architecture in Transformer-based\nLLMs to optimize long-context capabilities across all stages from pre-training\nto inference. We firstly delineate and analyze the problems of handling\nlong-context input and output with the current Transformer-based models. Then,\nwe mainly offer a holistic taxonomy to navigate the landscape of Transformer\nupgrades on architecture to solve these problems. Afterward, we provide the\ninvestigation on wildly used evaluation necessities tailored for long-context\nLLMs, including datasets, metrics, and baseline models, as well as some amazing\noptimization toolkits like libraries, systems, and compilers to augment LLMs'\nefficiency and efficacy across different stages. Finally, we further discuss\nthe predominant challenges and potential avenues for future research in this\ndomain. Additionally, we have established a repository where we curate relevant\nliterature with real-time updates at\nhttps://github.com/Strivin0311/long-llms-learning.\n","authors":["Yunpeng Huang","Jingwei Xu","Zixu Jiang","Junyu Lai","Zenan Li","Yuan Yao","Taolue Chen","Lijuan Yang","Zhou Xin","Xiaoxing Ma"],"pdf_url":"https://arxiv.org/pdf/2311.12351v1.pdf","comment":"35 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.12337v1","updated":"2023-11-21T04:06:08Z","published":"2023-11-21T04:06:08Z","title":"Do Smaller Language Models Answer Contextualised Questions Through\n Memorisation Or Generalisation?","summary":" A distinction is often drawn between a model's ability to predict a label for\nan evaluation sample that is directly memorised from highly similar training\nsamples versus an ability to predict the label via some method of\ngeneralisation. In the context of using Language Models for question-answering,\ndiscussion continues to occur as to the extent to which questions are answered\nthrough memorisation. 
We consider this issue for questions that would ideally\nbe answered through reasoning over an associated context. We propose a method\nof identifying evaluation samples for which it is very unlikely our model would\nhave memorised the answers. Our method is based on semantic similarity of input\ntokens and label tokens between training and evaluation samples. We show that\nour method offers advantages upon some prior approaches in that it is able to\nsurface evaluation-train pairs that have overlap in either contiguous or\ndiscontiguous sequences of tokens. We use this method to identify unmemorisable\nsubsets of our evaluation datasets. We train two Language Models in a multitask\nfashion whereby the second model differs from the first only in that it has two\nadditional datasets added to the training regime that are designed to impart\nsimple numerical reasoning strategies of a sort known to improve performance on\nsome of our evaluation datasets but not on others. We then show that there is\nperformance improvement between the two models on the unmemorisable subsets of\nthe evaluation datasets that were expected to benefit from the additional\ntraining datasets. Specifically, performance on unmemorisable subsets of two of\nour evaluation datasets, DROP and ROPES significantly improves by 9.0%, and\n25.7% respectively while other evaluation datasets have no significant change\nin performance.\n","authors":["Tim Hartill","Joshua Bensemann","Michael Witbrock","Patricia J. Riddle"],"pdf_url":"https://arxiv.org/pdf/2311.12337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12323v1","updated":"2023-11-21T03:34:20Z","published":"2023-11-21T03:34:20Z","title":"Modeling Political Orientation of Social Media Posts: An Extended\n Analysis","summary":" Developing machine learning models to characterize political polarization on\nonline social media presents significant challenges. These challenges mainly\nstem from various factors such as the lack of annotated data, presence of noise\nin social media datasets, and the sheer volume of data. The common research\npractice typically examines the biased structure of online user communities for\na given topic or qualitatively measuring the impacts of polarized topics on\nsocial media. However, there is limited work focusing on analyzing polarization\nat the ground-level, specifically in the social media posts themselves. Such\nexisting analysis heavily relies on annotated data, which often requires\nlaborious human labeling, offers labels only to specific problems, and lacks\nthe ability to determine the near-future bias state of a social media\nconversations. Understanding the degree of political orientation conveyed in\nsocial media posts is crucial for quantifying the bias of online user\ncommunities and investigating the spread of polarized content. In this work, we\nfirst introduce two heuristic methods that leverage on news media bias and post\ncontent to label social media posts. Next, we compare the efficacy and quality\nof heuristically labeled dataset with a randomly sampled human-annotated\ndataset. Additionally, we demonstrate that current machine learning models can\nexhibit improved performance in predicting political orientation of social\nmedia posts, employing both traditional supervised learning and few-shot\nlearning setups. 
We conduct experiments using the proposed heuristic methods\nand machine learning approaches to predict the political orientation of posts\ncollected from two social media forums with diverse political ideologies: Gab\nand Twitter.\n","authors":["Sadia Kamal","Brenner Little","Jade Gullic","Trevor Harms","Kristin Olofsson","Arunkumar Bagavathi"],"pdf_url":"https://arxiv.org/pdf/2311.12323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12315v1","updated":"2023-11-21T03:17:14Z","published":"2023-11-21T03:17:14Z","title":"AcademicGPT: Empowering Academic Research","summary":" Large Language Models (LLMs) have demonstrated exceptional capabilities\nacross various natural language processing tasks. Yet, many of these advanced\nLLMs are tailored for broad, general-purpose applications. In this technical\nreport, we introduce AcademicGPT, designed specifically to empower academic\nresearch. AcademicGPT is a continual training model derived from LLaMA2-70B.\nOur training corpus mainly consists of academic papers, theses, content from\nsome academic domains, high-quality Chinese data, and others. While it may not be\nextensive in data scale, AcademicGPT marks our initial venture into a\ndomain-specific GPT tailored for the research area. We evaluate AcademicGPT on\nseveral established public benchmarks such as MMLU and CEval, as well as on\nsome specialized academic benchmarks like PubMedQA, SCIEval, and our\nnewly-created ComputerScienceQA, to demonstrate its general\nknowledge ability, Chinese ability, and academic ability. Building upon\nAcademicGPT's foundation model, we also developed several applications catering\nto the academic area, including General Academic Question Answering,\nAI-assisted Paper Reading, Paper Review, and AI-assisted Title and Abstract\nGeneration.\n","authors":["Shufa Wei","Xiaolong Xu","Xianbiao Qi","Xi Yin","Jun Xia","Jingyi Ren","Peijun Tang","Yuxiang Zhong","Yihao Chen","Xiaoqin Ren","Yuxin Liang","Liankai Huang","Kai Xie","Weikang Gui","Wei Tan","Shuanglong Sun","Yongquan Hu","Qinxian Liu","Nanjin Li","Chihao Dai","Lihua Wang","Xiaohui Liu","Lei Zhang","Yutao Xie"],"pdf_url":"https://arxiv.org/pdf/2311.12315v1.pdf","comment":"Technical Report. arXiv admin note: text overlap with\n arXiv:2310.12081, arXiv:2310.10053 by other authors"},{"id":"http://arxiv.org/abs/2304.03898v2","updated":"2023-11-21T02:39:06Z","published":"2023-04-08T03:24:05Z","title":"The Short Text Matching Model Enhanced with Knowledge via Contrastive\n Learning","summary":" In recent years, short Text Matching tasks have been widely applied in the\nfields of advertising search and recommendation. The difficulty lies in the lack\nof semantic information and word ambiguity caused by the short length of the\ntext. Previous works have introduced complement sentences or knowledge bases to\nprovide additional feature information. However, these methods have not fully\nmodeled the interaction between the original sentence and the complement sentence, and have\nnot considered the noise issue that may arise from the introduction of external\nknowledge bases. Therefore, this paper proposes a short Text Matching model\nthat combines contrastive learning and external knowledge. The model uses a\ngenerative model to generate corresponding complement sentences and uses the\ncontrastive learning method to guide the model to obtain more semantically\nmeaningful encoding of the original sentence. 
In addition, to avoid noise, we\nuse keywords as the main semantics of the original sentence to retrieve\ncorresponding knowledge words in the knowledge base, and construct a knowledge\ngraph. The graph encoding model is used to integrate the knowledge base\ninformation into the model. Our designed model achieves state-of-the-art\nperformance on two publicly available Chinese Text Matching datasets,\ndemonstrating the effectiveness of our model.\n","authors":["Ruiqiang Liu","Mengmeng Cui","Hanjie Mai","Qiang Zhang","Shaohua Xu","Xiangzheng Liu","Yanlong Du"],"pdf_url":"https://arxiv.org/pdf/2304.03898v2.pdf","comment":"11 pages, 2 figures"},{"id":"http://arxiv.org/abs/2311.12298v1","updated":"2023-11-21T02:35:09Z","published":"2023-11-21T02:35:09Z","title":"Noise in Relation Classification Dataset TACRED: Characterization and\n Reduction","summary":" The overarching objective of this paper is two-fold. First, to explore\nmodel-based approaches to characterize the primary cause of the noise in\nthe RE dataset TACRED. Second, to identify the potentially noisy instances. Towards\nthe first objective, we analyze predictions and performance of state-of-the-art\n(SOTA) models to identify the root cause of noise in the dataset. Our analysis\nof TACRED shows that the majority of the noise in the dataset originates from\nthe instances labeled as no-relation, which are negative examples. For the\nsecond objective, we explore two nearest-neighbor-based strategies to\nautomatically identify potentially noisy examples for elimination and\nreannotation. Our first strategy, referred to as Intrinsic Strategy (IS), is\nbased on the assumption that positive examples are clean. Thus, we have used\nfalse-negative predictions to identify noisy negative examples. In contrast, our\nsecond approach, referred to as Extrinsic Strategy (ES), is based on using a clean\nsubset of the dataset to identify potentially noisy negative examples. Finally,\nwe retrained the SOTA models on the eliminated and reannotated dataset. Our\nempirical results based on two SOTA models trained on TACRED-E following the IS\nshow an average 4% F1-score improvement, whereas reannotation (TACRED-R) does\nnot improve the original results. However, following ES, SOTA models show an\naverage F1-score improvement of 3.8% and 4.4% when trained on the\neliminated (TACRED-EN) and reannotated (TACRED-RN) datasets respectively. We\nfurther extended the ES for cleaning positive examples as well, which resulted\nin an average performance improvement of 5.8% and 5.6% for the eliminated\n(TACRED-ENP) and reannotated (TACRED-RNP) datasets respectively.\n","authors":["Akshay Parekh","Ashish Anand","Amit Awekar"],"pdf_url":"https://arxiv.org/pdf/2311.12298v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2311.10899v2","updated":"2023-11-21T02:16:27Z","published":"2023-11-17T22:44:05Z","title":"Extraction and Summarization of Explicit Video Content using Multi-Modal\n Deep Learning","summary":" With the increase in video-sharing platforms across the internet, it is\ndifficult for humans to moderate the data for explicit content. Hence, an\nautomated pipeline to scan through video data for explicit content has become\nthe need of the hour. We propose a novel pipeline that uses multi-modal deep\nlearning to first extract the explicit segments of input videos and then\nsummarize their content using text to determine its age appropriateness and age\nrating. 
We also evaluate our pipeline's effectiveness in the end using standard\nmetrics.\n","authors":["Shaunak Joshi","Raghav Gaggar"],"pdf_url":"https://arxiv.org/pdf/2311.10899v2.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.12289v1","updated":"2023-11-21T02:02:46Z","published":"2023-11-21T02:02:46Z","title":"ATLANTIC: Structure-Aware Retrieval-Augmented Language Model for\n Interdisciplinary Science","summary":" Large language models record impressive performance on many natural language\nprocessing tasks. However, their knowledge capacity is limited to the\npretraining corpus. Retrieval augmentation offers an effective solution by\nretrieving context from external knowledge sources to complement the language\nmodel. However, existing retrieval augmentation techniques ignore the\nstructural relationships between these documents. Furthermore, retrieval models\nare not explored much in scientific tasks, especially in regard to the\nfaithfulness of retrieved documents. In this paper, we propose a novel\nstructure-aware retrieval augmented language model that accommodates document\nstructure during retrieval augmentation. We create a heterogeneous document\ngraph capturing multiple types of relationships (e.g., citation, co-authorship,\netc.) that connect documents from more than 15 scientific disciplines (e.g.,\nPhysics, Medicine, Chemistry, etc.). We train a graph neural network on the\ncurated document graph to act as a structural encoder for the corresponding\npassages retrieved during the model pretraining. Particularly, along with text\nembeddings of the retrieved passages, we obtain structural embeddings of the\ndocuments (passages) and fuse them together before feeding them to the language\nmodel. We evaluate our model extensively on various scientific benchmarks that\ninclude science question-answering and scientific document classification\ntasks. Experimental results demonstrate that structure-aware retrieval improves\nretrieving more coherent, faithful and contextually relevant passages, while\nshowing a comparable performance in the overall accuracy.\n","authors":["Sai Munikoti","Anurag Acharya","Sridevi Wagle","Sameera Horawalavithana"],"pdf_url":"https://arxiv.org/pdf/2311.12289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10429v4","updated":"2023-11-21T02:01:53Z","published":"2023-05-17T17:58:13Z","title":"DoReMi: Optimizing Data Mixtures Speeds Up Language Model Pretraining","summary":" The mixture proportions of pretraining data domains (e.g., Wikipedia, books,\nweb text) greatly affect language model (LM) performance. In this paper, we\npropose Domain Reweighting with Minimax Optimization (DoReMi), which first\ntrains a small proxy model using group distributionally robust optimization\n(Group DRO) over domains to produce domain weights (mixture proportions)\nwithout knowledge of downstream tasks. We then resample a dataset with these\ndomain weights and train a larger, full-sized model. In our experiments, we use\nDoReMi on a 280M-parameter proxy model to set the domain weights for training\nan 8B-parameter model (30x larger) more efficiently. On The Pile, DoReMi\nimproves perplexity across all domains, even when it downweights a domain.\nDoReMi improves average few-shot downstream accuracy by 6.5% points over a\nbaseline model trained using The Pile's default domain weights and reaches the\nbaseline accuracy with 2.6x fewer training steps. 
On the GLaM dataset, DoReMi,\nwhich has no knowledge of downstream tasks, even matches the performance of\nusing domain weights tuned on downstream tasks.\n","authors":["Sang Michael Xie","Hieu Pham","Xuanyi Dong","Nan Du","Hanxiao Liu","Yifeng Lu","Percy Liang","Quoc V. Le","Tengyu Ma","Adams Wei Yu"],"pdf_url":"https://arxiv.org/pdf/2305.10429v4.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.12275v1","updated":"2023-11-21T01:34:02Z","published":"2023-11-21T01:34:02Z","title":"Enabling On-Device Large Language Model Personalization with\n Self-Supervised Data Selection and Synthesis","summary":" After a large language model (LLM) is deployed on edge devices, it is\ndesirable for these devices to learn from user-generated conversation data to\ngenerate user-specific and personalized responses in real-time. However,\nuser-generated data usually contains sensitive and private information, and\nuploading such data to the cloud for annotation is not preferred if not\nprohibited. While it is possible to obtain annotation locally by directly\nasking users to provide preferred responses, such annotations have to be sparse\nto not affect user experience. In addition, the storage of edge devices is\nusually too limited to enable large-scale fine-tuning with full user-generated\ndata. It remains an open question how to enable on-device LLM personalization,\nconsidering sparse annotation and limited on-device storage. In this paper, we\npropose a novel framework to select and store the most representative data\nonline in a self-supervised way. Such data has a small memory footprint and\nallows infrequent requests of user annotations for further fine-tuning. To\nenhance fine-tuning quality, multiple semantically similar pairs of question\ntexts and expected responses are generated using the LLM. Our experiments show\nthat the proposed framework achieves the best user-specific content-generating\ncapability (accuracy) and fine-tuning speed (performance) compared with vanilla\nbaselines. To the best of our knowledge, this is the very first on-device LLM\npersonalization framework.\n","authors":["Ruiyang Qin","Jun Xia","Zhenge Jia","Meng Jiang","Ahmed Abbasi","Peipei Zhou","Jingtong Hu","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2311.12275v1.pdf","comment":"6 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.13061v1","updated":"2023-11-21T23:50:33Z","published":"2023-11-21T23:50:33Z","title":"Attribution and Alignment: Effects of Local Context Repetition on\n Utterance Production and Comprehension in Dialogue","summary":" Language models are often used as the backbone of modern dialogue systems.\nThese models are pre-trained on large amounts of written fluent language.\nRepetition is typically penalised when evaluating language model generations.\nHowever, it is a key component of dialogue. Humans use local and partner\nspecific repetitions; these are preferred by human users and lead to more\nsuccessful communication in dialogue. In this study, we evaluate (a) whether\nlanguage models produce human-like levels of repetition in dialogue, and (b)\nwhat are the processing mechanisms related to lexical re-use they use during\ncomprehension. 
We believe that such joint analysis of model production and\ncomprehension behaviour can inform the development of cognitively inspired\ndialogue generation systems.\n","authors":["Aron Molnar","Jaap Jumelet","Mario Giulianelli","Arabella Sinclair"],"pdf_url":"https://arxiv.org/pdf/2311.13061v1.pdf","comment":"CoNLL 2023"},{"id":"http://arxiv.org/abs/2311.13053v1","updated":"2023-11-21T23:26:05Z","published":"2023-11-21T23:26:05Z","title":"Beyond Text: Unveiling Multimodal Proficiency of Large Language Models\n with MultiAPI Benchmark","summary":" The proliferation of Large Language Models like ChatGPT has significantly\nadvanced language understanding and generation, impacting a broad spectrum of\napplications. However, these models predominantly excel in text-based tasks,\noverlooking the complexity of real-world multimodal information. This study\nintroduces MultiAPI, a pioneering comprehensive large-scale API benchmark\ndataset aimed at expanding LLMs' proficiency in multimodal contexts. Developed\ncollaboratively through ChatGPT, MultiAPI consists of 235 diverse API calls and\n2,038 contextual prompts, offering a unique platform evaluation of\ntool-augmented LLMs handling multimodal tasks. Through comprehensive\nexperiments, our findings reveal that while LLMs demonstrate proficiency in API\ncall decision-making, they face challenges in domain identification, function\nselection, and argument generation. What's more, we surprisingly notice that\nauxiliary context can actually impair the performance. An in-depth error\nanalysis paves the way for a new paradigm to address these challenges,\nsuggesting a potential direction for future LLM research.\n","authors":["Xiao Liu","Jianfeng Lin","Jiawei Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.13053v1.pdf","comment":"Work in Progress"},{"id":"http://arxiv.org/abs/2211.08371v3","updated":"2023-11-21T23:04:53Z","published":"2022-11-15T18:21:46Z","title":"Pragmatics in Language Grounding: Phenomena, Tasks, and Modeling\n Approaches","summary":" People rely heavily on context to enrich meaning beyond what is literally\nsaid, enabling concise but effective communication. To interact successfully\nand naturally with people, user-facing artificial intelligence systems will\nrequire similar skills in pragmatics: relying on various types of context --\nfrom shared linguistic goals and conventions, to the visual and embodied world\n-- to use language effectively. We survey existing grounded settings and\npragmatic modeling approaches and analyze how the task goals, environmental\ncontexts, and communicative affordances in each work enrich linguistic meaning.\nWe present recommendations for future grounded task design to naturally elicit\npragmatic phenomena, and suggest directions that focus on a broader range of\ncommunicative contexts and affordances.\n","authors":["Daniel Fried","Nicholas Tomlin","Jennifer Hu","Roma Patel","Aida Nematzadeh"],"pdf_url":"https://arxiv.org/pdf/2211.08371v3.pdf","comment":"Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.13029v1","updated":"2023-11-21T22:30:37Z","published":"2023-11-21T22:30:37Z","title":"Systematic word meta-sense extension","summary":" The meaning of polysemous words often varies in a highly productive yet\npredictable way. Generalizing the regularity between conventional senses to\nderive novel word meaning is crucial for automated processing of non-literal\nlanguage uses such as figurative expressions. 
We introduce a novel task called\nsystematic word meta-sense extension (SWORME) to test and improve language\nmodels' ability to extend word meaning to denote new semantic domains (also\ncalled meta-senses) that bear regular semantic relations with existing senses.\nWe found that language models prefer incremental lexical semantic change toward\nconceptually similar meta-senses such as logical metonymy, and are much worse\nat predicting highly non-literal meaning extensions such as metaphors. We\npropose a novel analogy-based method of word meaning extension, and show that\nit effectively improves language model systematicity in making both gradual and\nradical types of meta-sense extension. We further demonstrate that learning\nsystematic meta-sense extensions benefits language models on multiple\nbenchmarks of figurative language understanding.\n","authors":["Lei Yu"],"pdf_url":"https://arxiv.org/pdf/2311.13029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.11603v3","updated":"2023-11-21T22:23:43Z","published":"2022-05-23T20:00:22Z","title":"Representation Projection Invariance Mitigates Representation Collapse","summary":" Fine-tuning contextualized representations learned by pre-trained language\nmodels remains a prevalent practice in NLP. However, fine-tuning can lead to\nrepresentation degradation (also known as representation collapse), which may\nresult in instability, sub-optimal performance, and weak generalization.\n In this paper, we propose Representation Projection Invariance (REPINA), a\nnovel regularization method to maintain the information content of\nrepresentation and reduce representation collapse during fine-tuning by\ndiscouraging undesirable changes in the representations. We study the empirical\nbehavior of the proposed regularization in comparison to 5 comparable baselines\nacross 13 language understanding tasks (GLUE benchmark and six additional\ndatasets). When evaluating in-domain performance, REPINA consistently\noutperforms other baselines on most tasks (10 out of 13). We also demonstrate\nits effectiveness in few-shot settings and robustness to label perturbation. As\na by-product, we extend previous studies of representation collapse and propose\nseveral metrics to quantify it. Our empirical findings show that our approach\nis significantly more effective at mitigating representation collapse.\n","authors":["Anastasia Razdaibiedina","Ashish Khetan","Zohar Karnin","Daniel Khashabi","Vishaal Kapoor","Vivek Madan"],"pdf_url":"https://arxiv.org/pdf/2205.11603v3.pdf","comment":"41 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.12986v1","updated":"2023-11-21T20:45:55Z","published":"2023-11-21T20:45:55Z","title":"Unsupervised Graph Attention Autoencoder for Attributed Networks using\n K-means Loss","summary":" Multimodal Sentiment Analysis (MSA) has recently become a centric research\ndirection for many real-world applications. This proliferation is due to the\nfact that opinions are central to almost all human activities and are key\ninfluencers of our behaviors. In addition, the recent deployment of Deep\nLearning-based (DL) models has proven their high efficiency for a wide range of\nWestern languages. In contrast, Arabic DL-based multimodal sentiment analysis\n(MSA) is still in its infantile stage due, mainly, to the lack of standard\ndatasets. 
% The contribution In this paper, our investigation is twofold.\nFirst, we design a pipeline that helps building our Arabic Multimodal dataset\nleveraging both state-of-the-art transformers and feature extraction tools\nwithin word alignment techniques. Thereafter, we validate our dataset using\nstate-of-the-art transformer-based model dealing with multimodality. Despite\nthe small size of the outcome dataset, experiments show that Arabic\nmultimodality is very promising.\n","authors":["Abdelfateh Bekkaira","Slimane Bellaouar","Slimane Oulad-Naoui"],"pdf_url":"https://arxiv.org/pdf/2311.12986v1.pdf","comment":"7 pages, 5 Figures"},{"id":"http://arxiv.org/abs/2305.14457v3","updated":"2023-11-21T20:38:22Z","published":"2023-05-23T18:28:42Z","title":"Pre-training Language Models for Comparative Reasoning","summary":" Comparative reasoning is a process of comparing objects, concepts, or\nentities to draw conclusions, which constitutes a fundamental cognitive\nability. In this paper, we propose a novel framework to pre-train language\nmodels for enhancing their abilities of comparative reasoning over texts. While\nthere have been approaches for NLP tasks that require comparative reasoning,\nthey suffer from costly manual data labeling and limited generalizability to\ndifferent tasks. Our approach introduces a novel method of collecting scalable\ndata for text-based entity comparison, which leverages both structured and\nunstructured data. Moreover, we present a framework of pre-training language\nmodels via three novel objectives on comparative reasoning. Evaluation on\ndownstream tasks including comparative question answering, question generation,\nand summarization shows that our pre-training framework significantly improves\nthe comparative reasoning abilities of language models, especially under\nlow-resource conditions. This work also releases the first integrated benchmark\nfor comparative reasoning.\n","authors":["Mengxia Yu","Zhihan Zhang","Wenhao Yu","Meng Jiang"],"pdf_url":"https://arxiv.org/pdf/2305.14457v3.pdf","comment":"EMNLP 2023 - Camera Ready"},{"id":"http://arxiv.org/abs/2311.12983v1","updated":"2023-11-21T20:34:47Z","published":"2023-11-21T20:34:47Z","title":"GAIA: a benchmark for General AI Assistants","summary":" We introduce GAIA, a benchmark for General AI Assistants that, if solved,\nwould represent a milestone in AI research. GAIA proposes real-world questions\nthat require a set of fundamental abilities such as reasoning, multi-modality\nhandling, web browsing, and generally tool-use proficiency. GAIA questions are\nconceptually simple for humans yet challenging for most advanced AIs: we show\nthat human respondents obtain 92\\% vs. 15\\% for GPT-4 equipped with plugins.\nThis notable performance disparity contrasts with the recent trend of LLMs\noutperforming humans on tasks requiring professional skills in e.g. law or\nchemistry. GAIA's philosophy departs from the current trend in AI benchmarks\nsuggesting to target tasks that are ever more difficult for humans. We posit\nthat the advent of Artificial General Intelligence (AGI) hinges on a system's\ncapability to exhibit similar robustness as the average human does on such\nquestions. 
Using GAIA's methodology, we devise 466 questions and their answer.\nWe release our questions while retaining answers to 300 of them to power a\nleader-board available at https://huggingface.co/gaia-benchmark.\n","authors":["Grégoire Mialon","Clémentine Fourrier","Craig Swift","Thomas Wolf","Yann LeCun","Thomas Scialom"],"pdf_url":"https://arxiv.org/pdf/2311.12983v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09832v3","updated":"2023-11-21T20:30:00Z","published":"2023-10-15T13:28:42Z","title":"Merging Experts into One: Improving Computational Efficiency of Mixture\n of Experts","summary":" Scaling the size of language models usually leads to remarkable advancements\nin NLP tasks. But it often comes with a price of growing computational cost.\nAlthough a sparse Mixture of Experts (MoE) can reduce the cost by activating a\nsmall subset of parameters (e.g., one expert) for each input, its computation\nescalates significantly if increasing the number of activated experts, limiting\nits practical utility. Can we retain the advantages of adding more experts\nwithout substantially increasing the computational costs? In this paper, we\nfirst demonstrate the superiority of selecting multiple experts and then\npropose a computation-efficient approach called \\textbf{\\texttt{Merging Experts\ninto One}} (MEO), which reduces the computation cost to that of a single\nexpert. Extensive experiments show that MEO significantly improves\ncomputational efficiency, e.g., FLOPS drops from 72.0G of vanilla MoE to 28.6G\n(MEO). Moreover, we propose a token-level attention block that further enhances\nthe efficiency and performance of token-level MEO, e.g., 83.3\\% (MEO) vs.\n82.6\\% (vanilla MoE) average score on the GLUE benchmark. Our code will be\nreleased upon acceptance. Code will be released at:\n\\url{https://github.com/Shwai-He/MEO}.\n","authors":["Shwai He","Run-Ze Fan","Liang Ding","Li Shen","Tianyi Zhou","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2310.09832v3.pdf","comment":"EMNLP 2023 Main Conference (Oral)"},{"id":"http://arxiv.org/abs/2305.11731v2","updated":"2023-11-21T19:19:10Z","published":"2023-05-19T15:05:39Z","title":"Persian Typographical Error Type Detection Using Deep Neural Networks on\n Algorithmically-Generated Misspellings","summary":" Spelling correction is a remarkable challenge in the field of natural\nlanguage processing. The objective of spelling correction tasks is to recognize\nand rectify spelling errors automatically. The development of applications that\ncan effectually diagnose and correct Persian spelling and grammatical errors\nhas become more important in order to improve the quality of Persian text. The\nTypographical Error Type Detection in Persian is a relatively understudied\narea. Therefore, this paper presents a compelling approach for detecting\ntypographical errors in Persian texts. Our work includes the presentation of a\npublicly available dataset called FarsTypo, which comprises 3.4 million words\narranged in chronological order and tagged with their corresponding\npart-of-speech. These words cover a wide range of topics and linguistic styles.\nWe develop an algorithm designed to apply Persian-specific errors to a scalable\nportion of these words, resulting in a parallel dataset of correct and\nincorrect words. By leveraging FarsTypo, we establish a strong foundation and\nconduct a thorough comparison of various methodologies employing different\narchitectures. 
Additionally, we introduce a groundbreaking Deep Sequential\nNeural Network that utilizes both word and character embeddings, along with\nbidirectional LSTM layers, for token classification aimed at detecting\ntypographical errors across 51 distinct classes. Our approach is contrasted\nwith highly advanced industrial systems that, unlike this study, have been\ndeveloped using a diverse range of resources. The outcomes of our final method\nproved to be highly competitive, achieving an accuracy of 97.62%, precision of\n98.83%, recall of 98.61%, and surpassing others in terms of speed.\n","authors":["Mohammad Dehghani","Heshaam Faili"],"pdf_url":"https://arxiv.org/pdf/2305.11731v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10813v2","updated":"2023-11-21T01:24:36Z","published":"2023-11-17T18:59:56Z","title":"A Language Agent for Autonomous Driving","summary":" Human-level driving is an ultimate goal of autonomous driving. Conventional\napproaches formulate autonomous driving as a perception-prediction-planning\nframework, yet their systems do not capitalize on the inherent reasoning\nability and experiential knowledge of humans. In this paper, we propose a\nfundamental paradigm shift from current pipelines, exploiting Large Language\nModels (LLMs) as a cognitive agent to integrate human-like intelligence into\nautonomous driving systems. Our approach, termed Agent-Driver, transforms the\ntraditional autonomous driving pipeline by introducing a versatile tool library\naccessible via function calls, a cognitive memory of common sense and\nexperiential knowledge for decision-making, and a reasoning engine capable of\nchain-of-thought reasoning, task planning, motion planning, and\nself-reflection. Powered by LLMs, our Agent-Driver is endowed with intuitive\ncommon sense and robust reasoning capabilities, thus enabling a more nuanced,\nhuman-like approach to autonomous driving. We evaluate our approach on the\nlarge-scale nuScenes benchmark, and extensive experiments substantiate that our\nAgent-Driver significantly outperforms the state-of-the-art driving methods by\na large margin. Our approach also demonstrates superior interpretability and\nfew-shot learning ability to these methods. Code will be released.\n","authors":["Jiageng Mao","Junjie Ye","Yuxi Qian","Marco Pavone","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2311.10813v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.12796v1","updated":"2023-11-21T18:59:58Z","published":"2023-11-21T18:59:58Z","title":"Physics-guided Shape-from-Template: Monocular Video Perception through\n Neural Surrogate Models","summary":" 3D reconstruction of dynamic scenes is a long-standing problem in computer\ngraphics and increasingly difficult the less information is available.\nShape-from-Template (SfT) methods aim to reconstruct a template-based geometry\nfrom RGB images or video sequences, often leveraging just a single monocular\ncamera without depth information, such as regular smartphone recordings.\nUnfortunately, existing reconstruction methods are either unphysical and noisy\nor slow in optimization. To solve this problem, we propose a novel SfT\nreconstruction algorithm for cloth using a pre-trained neural surrogate model\nthat is fast to evaluate, stable, and produces smooth reconstructions due to a\nregularizing physics simulation. 
Differentiable rendering of the simulated mesh\nenables pixel-wise comparisons between the reconstruction and a target video\nsequence that can be used for a gradient-based optimization procedure to\nextract not only shape information but also physical parameters such as\nstretching, shearing, or bending stiffness of the cloth. This allows to retain\na precise, stable, and smooth reconstructed geometry while reducing the runtime\nby a factor of 400-500 compared to $\\phi$-SfT, a state-of-the-art physics-based\nSfT approach.\n","authors":["David Stotko","Nils Wandel","Reinhard Klein"],"pdf_url":"https://arxiv.org/pdf/2311.12796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12793v1","updated":"2023-11-21T18:58:11Z","published":"2023-11-21T18:58:11Z","title":"ShareGPT4V: Improving Large Multi-Modal Models with Better Captions","summary":" In the realm of large multi-modal models (LMMs), efficient modality alignment\nis crucial yet often constrained by the scarcity of high-quality image-text\ndata. To address this bottleneck, we introduce the ShareGPT4V dataset, a\npioneering large-scale resource featuring 1.2 million highly descriptive\ncaptions, which surpasses existing datasets in diversity and information\ncontent, covering world knowledge, object properties, spatial relationships,\nand aesthetic evaluations. Specifically, ShareGPT4V originates from a curated\n100K high-quality captions collected from advanced GPT4-Vision and has been\nexpanded to 1.2M with a superb caption model trained on this subset. ShareGPT4V\nfirst demonstrates its effectiveness for the Supervised Fine-Tuning (SFT)\nphase, by substituting an equivalent quantity of detailed captions in existing\nSFT datasets with a subset of our high-quality captions, significantly\nenhancing the LMMs like LLaVA-7B, LLaVA-1.5-13B, and Qwen-VL-Chat-7B on the MME\nand MMBench benchmarks, with respective gains of 222.8/22.0/22.3 and\n2.7/1.3/1.5. We further incorporate ShareGPT4V data into both the pre-training\nand SFT phases, obtaining ShareGPT4V-7B, a superior LMM based on a simple\narchitecture that has remarkable performance across a majority of the\nmulti-modal benchmarks. This project is available at\nhttps://ShareGPT4V.github.io to serve as a pivotal resource for advancing the\nLMMs community.\n","authors":["Lin Chen","Jisong Li","Xiaoyi Dong","Pan Zhang","Conghui He","Jiaqi Wang","Feng Zhao","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2311.12793v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12792v1","updated":"2023-11-21T18:58:01Z","published":"2023-11-21T18:58:01Z","title":"Intrinsic Image Decomposition via Ordinal Shading","summary":" Intrinsic decomposition is a fundamental mid-level vision problem that plays\na crucial role in various inverse rendering and computational photography\npipelines. Generating highly accurate intrinsic decompositions is an inherently\nunder-constrained task that requires precisely estimating continuous-valued\nshading and albedo. In this work, we achieve high-resolution intrinsic\ndecomposition by breaking the problem into two parts. First, we present a dense\nordinal shading formulation using a shift- and scale-invariant loss in order to\nestimate ordinal shading cues without restricting the predictions to obey the\nintrinsic model. We then combine low- and high-resolution ordinal estimations\nusing a second network to generate a shading estimate with both global\ncoherency and local details. 
We encourage the model to learn an accurate\ndecomposition by computing losses on the estimated shading as well as the\nalbedo implied by the intrinsic model. We develop a straightforward method for\ngenerating dense pseudo ground truth using our model's predictions and\nmulti-illumination data, enabling generalization to in-the-wild imagery. We\npresent an exhaustive qualitative and quantitative analysis of our predicted\nintrinsic components against state-of-the-art methods. Finally, we demonstrate\nthe real-world applicability of our estimations by performing otherwise\ndifficult editing tasks such as recoloring and relighting.\n","authors":["Chris Careaga","Yağız Aksoy"],"pdf_url":"https://arxiv.org/pdf/2311.12792v1.pdf","comment":"24 pages, 23 figures, Accepted to ACM Transactions on Graphics\n (2023). Project page: https://yaksoy.github.io/intrinsic/"},{"id":"http://arxiv.org/abs/2305.11818v2","updated":"2023-11-21T18:55:24Z","published":"2023-05-19T16:53:15Z","title":"MaGIC: Multi-modality Guided Image Completion","summary":" Vanilla image completion approaches exhibit sensitivity to large missing\nregions, attributed to the limited availability of reference information for\nplausible generation. To mitigate this, existing methods incorporate the extra\ncue as a guidance for image completion. Despite improvements, these approaches\nare often restricted to employing a single modality (e.g., segmentation or\nsketch maps), which lacks scalability in leveraging multi-modality for more\nplausible completion. In this paper, we propose a novel, simple yet effective\nmethod for Multi-modal Guided Image Completion, dubbed MaGIC, which not only\nsupports a wide range of single modality as the guidance (e.g., text, canny\nedge, sketch, segmentation, depth, and pose), but also adapts to arbitrarily\ncustomized combination of these modalities (i.e., arbitrary multi-modality) for\nimage completion. For building MaGIC, we first introduce a modality-specific\nconditional U-Net (MCU-Net) that injects single-modal signal into a U-Net\ndenoiser for single-modal guided image completion. Then, we devise a consistent\nmodality blending (CMB) method to leverage modality signals encoded in multiple\nlearned MCU-Nets through gradient guidance in latent space. Our CMB is\ntraining-free, thereby avoids the cumbersome joint re-training of different\nmodalities, which is the secret of MaGIC to achieve exceptional flexibility in\naccommodating new modalities for completion. Experiments show the superiority\nof MaGIC over state-of-the-art methods and its generalization to various\ncompletion tasks. Our project with code and models is available at\nyeates.github.io/MaGIC-Page/.\n","authors":["Yongsheng Yu","Hao Wang","Tiejian Luo","Heng Fan","Libo Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.11818v2.pdf","comment":"23 pages, 15 figures"},{"id":"http://arxiv.org/abs/2211.13807v2","updated":"2023-11-21T18:47:51Z","published":"2022-11-24T21:41:52Z","title":"GEFF: Improving Any Clothes-Changing Person ReID Model using Gallery\n Enrichment with Face Features","summary":" In the Clothes-Changing Re-Identification (CC-ReID) problem, given a query\nsample of a person, the goal is to determine the correct identity based on a\nlabeled gallery in which the person appears in different clothes. 
Several\nmodels tackle this challenge by extracting clothes-independent features.\nHowever, the performance of these models is still lower for the\nclothes-changing setting compared to the same-clothes setting in which the\nperson appears with the same clothes in the labeled gallery. As\nclothing-related features are often dominant features in the data, we propose a\nnew process we call Gallery Enrichment, to utilize these features. In this\nprocess, we enrich the original gallery by adding to it query samples based on\ntheir face features, using an unsupervised algorithm. Additionally, we show\nthat combining ReID and face feature extraction modules alongside an enriched\ngallery results in a more accurate ReID model, even for query samples with new\noutfits that do not include faces. Moreover, we claim that existing CC-ReID\nbenchmarks do not fully represent real-world scenarios, and propose a new video\nCC-ReID dataset called 42Street, based on a theater play that includes crowded\nscenes and numerous clothes changes. When applied to multiple ReID models, our\nmethod (GEFF) achieves an average improvement of 33.5% and 6.7% in the Top-1\nclothes-changing metric on the PRCC and LTCC benchmarks. Combined with the\nlatest ReID models, our method achieves new SOTA results on the PRCC, LTCC,\nCCVID, LaST and VC-Clothes benchmarks and the proposed 42Street dataset.\n","authors":["Daniel Arkushin","Bar Cohen","Shmuel Peleg","Ohad Fried"],"pdf_url":"https://arxiv.org/pdf/2211.13807v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12827v3","updated":"2023-11-21T18:43:43Z","published":"2023-05-22T08:39:25Z","title":"Task Arithmetic in the Tangent Space: Improved Editing of Pre-Trained\n Models","summary":" Task arithmetic has recently emerged as a cost-effective and scalable\napproach to edit pre-trained models directly in weight space: By adding the\nfine-tuned weights of different tasks, the model's performance can be improved\non these tasks, while negating them leads to task forgetting. Yet, our\nunderstanding of the effectiveness of task arithmetic and its underlying\nprinciples remains limited. We present a comprehensive study of task arithmetic\nin vision-language models and show that weight disentanglement is the crucial\nfactor that makes it effective. This property arises during pre-training and\nmanifests when distinct directions in weight space govern separate, localized\nregions in function space associated with the tasks. Notably, we show that\nfine-tuning models in their tangent space by linearizing them amplifies weight\ndisentanglement. This leads to substantial performance improvements across\nmultiple task arithmetic benchmarks and diverse models. Building on these\nfindings, we provide theoretical and empirical analyses of the neural tangent\nkernel (NTK) of these models and establish a compelling link between task\narithmetic and the spatial localization of the NTK eigenfunctions. 
Overall, our\nwork uncovers novel insights into the fundamental mechanisms of task arithmetic\nand offers a more reliable and effective approach to edit pre-trained models\nthrough the NTK linearization.\n","authors":["Guillermo Ortiz-Jimenez","Alessandro Favero","Pascal Frossard"],"pdf_url":"https://arxiv.org/pdf/2305.12827v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12775v1","updated":"2023-11-21T18:38:03Z","published":"2023-11-21T18:38:03Z","title":"SuGaR: Surface-Aligned Gaussian Splatting for Efficient 3D Mesh\n Reconstruction and High-Quality Mesh Rendering","summary":" We propose a method to allow precise and extremely fast mesh extraction from\n3D Gaussian Splatting. Gaussian Splatting has recently become very popular as\nit yields realistic rendering while being significantly faster to train than\nNeRFs. It is however challenging to extract a mesh from the millions of tiny 3D\ngaussians as these gaussians tend to be unorganized after optimization and no\nmethod has been proposed so far. Our first key contribution is a regularization\nterm that encourages the gaussians to align well with the surface of the scene.\nWe then introduce a method that exploits this alignment to extract a mesh from\nthe Gaussians using Poisson reconstruction, which is fast, scalable, and\npreserves details, in contrast to the Marching Cubes algorithm usually applied\nto extract meshes from Neural SDFs. Finally, we introduce an optional\nrefinement strategy that binds gaussians to the surface of the mesh, and\njointly optimizes these Gaussians and the mesh through Gaussian splatting\nrendering. This enables easy editing, sculpting, rigging, animating,\ncompositing and relighting of the Gaussians using traditional softwares by\nmanipulating the mesh instead of the gaussians themselves. Retrieving such an\neditable mesh for realistic rendering is done within minutes with our method,\ncompared to hours with the state-of-the-art methods on neural SDFs, while\nproviding a better rendering quality.\n","authors":["Antoine Guédon","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2311.12775v1.pdf","comment":"Project Webpage: https://imagine.enpc.fr/~guedona/sugar/"},{"id":"http://arxiv.org/abs/2311.12773v1","updated":"2023-11-21T18:35:21Z","published":"2023-11-21T18:35:21Z","title":"Iris Presentation Attack: Assessing the Impact of Combining Vanadium\n Dioxide Films with Artificial Eyes","summary":" Iris recognition systems, operating in the near infrared spectrum (NIR), have\ndemonstrated vulnerability to presentation attacks, where an adversary uses\nartifacts such as cosmetic contact lenses, artificial eyes or printed iris\nimages in order to circumvent the system. At the same time, a number of\neffective presentation attack detection (PAD) methods have been developed.\nThese methods have demonstrated success in detecting artificial eyes (e.g.,\nfake Van Dyke eyes) as presentation attacks. In this work, we seek to alter the\noptical characteristics of artificial eyes by affixing Vanadium Dioxide (VO2)\nfilms on their surface in various spatial configurations. VO2 films can be used\nto selectively transmit NIR light and can, therefore, be used to regulate the\namount of NIR light from the object that is captured by the iris sensor. We\nstudy the impact of such images produced by the sensor on two state-of-the-art\niris PA detection methods. 
We observe that the addition of VO2 films on the\nsurface of artificial eyes can cause the PA detection methods to misclassify\nthem as bonafide eyes in some cases. This represents a vulnerability that must\nbe systematically analyzed and effectively addressed.\n","authors":["Darshika Jauhari","Renu Sharma","Cunjian Chen","Nelson Sepulveda","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2311.12773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.11057v3","updated":"2023-11-21T18:34:09Z","published":"2021-11-22T08:55:25Z","title":"Learning to Aggregate Multi-Scale Context for Instance Segmentation in\n Remote Sensing Images","summary":" The task of instance segmentation in remote sensing images, aiming at\nperforming per-pixel labeling of objects at instance level, is of great\nimportance for various civil applications. Despite previous successes, most\nexisting instance segmentation methods designed for natural images encounter\nsharp performance degradations when they are directly applied to top-view\nremote sensing images. Through careful analysis, we observe that the challenges\nmainly come from the lack of discriminative object features due to severe scale\nvariations, low contrasts, and clustered distributions. In order to address\nthese problems, a novel context aggregation network (CATNet) is proposed to\nimprove the feature extraction process. The proposed model exploits three\nlightweight plug-and-play modules, namely dense feature pyramid network\n(DenseFPN), spatial context pyramid (SCP), and hierarchical region of interest\nextractor (HRoIE), to aggregate global visual context at feature, spatial, and\ninstance domains, respectively. DenseFPN is a multi-scale feature propagation\nmodule that establishes more flexible information flows by adopting inter-level\nresidual connections, cross-level dense connections, and feature re-weighting\nstrategy. Leveraging the attention mechanism, SCP further augments the features\nby aggregating global spatial context into local regions. For each instance,\nHRoIE adaptively generates RoI features for different downstream tasks.\nExtensive evaluations of the proposed scheme on iSAID, DIOR, NWPU VHR-10, and\nHRSID datasets demonstrate that the proposed approach outperforms\nstate-of-the-arts under similar computational costs. Source code and\npre-trained models are available at https://github.com/yeliudev/CATNet.\n","authors":["Ye Liu","Huifang Li","Chao Hu","Shuang Luo","Yan Luo","Chang Wen Chen"],"pdf_url":"https://arxiv.org/pdf/2111.11057v3.pdf","comment":"Accepted to IEEE Transactions on Neural Networks and Learning Systems\n (TNNLS), 2023"},{"id":"http://arxiv.org/abs/2311.12770v1","updated":"2023-11-21T18:30:40Z","published":"2023-11-21T18:30:40Z","title":"Swift Parameter-free Attention Network for Efficient Super-Resolution","summary":" Single Image Super-Resolution (SISR) is a crucial task in low-level computer\nvision, aiming to reconstruct high-resolution images from low-resolution\ncounterparts. Conventional attention mechanisms have significantly improved\nSISR performance but often result in complex network structures and large\nnumber of parameters, leading to slow inference speed and large model size. To\naddress this issue, we propose the Swift Parameter-free Attention Network\n(SPAN), a highly efficient SISR model that balances parameter count, inference\nspeed, and image quality. 
SPAN employs a novel parameter-free attention\nmechanism, which leverages symmetric activation functions and residual\nconnections to enhance high-contribution information and suppress redundant\ninformation. Our theoretical analysis demonstrates the effectiveness of this\ndesign in achieving the attention mechanism's purpose. We evaluate SPAN on\nmultiple benchmarks, showing that it outperforms existing efficient\nsuper-resolution models in terms of both image quality and inference speed,\nachieving a significant quality-speed trade-off. This makes SPAN highly\nsuitable for real-world applications, particularly in resource-constrained\nscenarios. Notably, our model attains the best PSNR of 27.09 dB, and the test\nruntime of our team is reduced by 7.08ms in the NTIRE 2023 efficient\nsuper-resolution challenge. Our code and models are made publicly available at\n\\url{https://github.com/hongyuanyu/SPAN}.\n","authors":["Cheng Wan","Hongyuan Yu","Zhiqi Li","Yihang Chen","Yajun Zou","Yuqing Liu","Xuanwu Yin","Kunlong Zuo"],"pdf_url":"https://arxiv.org/pdf/2311.12770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12764v1","updated":"2023-11-21T18:18:50Z","published":"2023-11-21T18:18:50Z","title":"Investigating Weight-Perturbed Deep Neural Networks With Application in\n Iris Presentation Attack Detection","summary":" Deep neural networks (DNNs) exhibit superior performance in various machine\nlearning tasks, e.g., image classification, speech recognition, biometric\nrecognition, object detection, etc. However, it is essential to analyze their\nsensitivity to parameter perturbations before deploying them in real-world\napplications. In this work, we assess the sensitivity of DNNs against\nperturbations to their weight and bias parameters. The sensitivity analysis\ninvolves three DNN architectures (VGG, ResNet, and DenseNet), three types of\nparameter perturbations (Gaussian noise, weight zeroing, and weight scaling),\nand two settings (entire network and layer-wise). We perform experiments in the\ncontext of iris presentation attack detection and evaluate on two publicly\navailable datasets: LivDet-Iris-2017 and LivDet-Iris-2020. Based on the\nsensitivity analysis, we propose improved models simply by perturbing\nparameters of the network without undergoing training. We further combine these\nperturbed models at the score-level and at the parameter-level to improve the\nperformance over the original model. The ensemble at the parameter-level shows\nan average improvement of 43.58% on the LivDet-Iris-2017 dataset and 9.25% on\nthe LivDet-Iris-2020 dataset. The source code is available at\n\\href{https://github.com/redwankarimsony/WeightPerturbation-MSU}{https://github.com/redwankarimsony/WeightPerturbation-MSU}.\n","authors":["Renu Sharma","Redwan Sony","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2311.12764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12760v1","updated":"2023-11-21T18:11:26Z","published":"2023-11-21T18:11:26Z","title":"High-resolution Image-based Malware Classification using Multiple\n Instance Learning","summary":" This paper proposes a novel method of classifying malware into families using\nhigh-resolution greyscale images and multiple instance learning to overcome\nadversarial binary enlargement. Current methods of visualisation-based malware\nclassification largely rely on lossy transformations of inputs such as resizing\nto handle the large, variable-sized images. 
Through empirical analysis and\nexperimentation, it is shown that these approaches cause crucial information\nloss that can be exploited. The proposed solution divides the images into\npatches and uses embedding-based multiple instance learning with a\nconvolutional neural network and an attention aggregation function for\nclassification. The implementation is evaluated on the Microsoft Malware\nClassification dataset and achieves accuracies of up to $96.6\\%$ on\nadversarially enlarged samples compared to the baseline of $22.8\\%$. The Python\ncode is available online at https://github.com/timppeters/MIL-Malware-Images .\n","authors":["Tim Peters","Hikmat Farhat"],"pdf_url":"https://arxiv.org/pdf/2311.12760v1.pdf","comment":"14 pages, 13 figures, 2 tables"},{"id":"http://arxiv.org/abs/2211.09945v7","updated":"2023-11-21T18:03:06Z","published":"2022-11-17T23:42:10Z","title":"VeriCompress: A Tool to Streamline the Synthesis of Verified Robust\n Compressed Neural Networks from Scratch","summary":" AI's widespread integration has led to neural networks (NNs) deployment on\nedge and similar limited-resource platforms for safety-critical scenarios. Yet,\nNN's fragility raises concerns about reliable inference. Moreover, constrained\nplatforms demand compact networks. This study introduces VeriCompress, a tool\nthat automates the search and training of compressed models with robustness\nguarantees. These models are well-suited for safety-critical applications and\nadhere to predefined architecture and size limitations, making them deployable\non resource-restricted platforms. The method trains models 2-3 times faster\nthan the state-of-the-art approaches, surpassing relevant baseline approaches\nby average accuracy and robustness gains of 15.1 and 9.8 percentage points,\nrespectively. When deployed on a resource-restricted generic platform, these\nmodels require 5-8 times less memory and 2-4 times less inference time than\nmodels used in verified robustness literature. Our comprehensive evaluation\nacross various model architectures and datasets, including MNIST, CIFAR, SVHN,\nand a relevant pedestrian detection dataset, showcases VeriCompress's capacity\nto identify compressed verified robust models with reduced computation overhead\ncompared to current standards. This underscores its potential as a valuable\ntool for end users, such as developers of safety-critical applications on edge\nor Internet of Things platforms, empowering them to create suitable models for\nsafety-critical, resource-constrained platforms in their respective domains.\n","authors":["Sawinder Kaur","Yi Xiao","Asif Salekin"],"pdf_url":"https://arxiv.org/pdf/2211.09945v7.pdf","comment":"9 pages, 5 tables, 2 figures"},{"id":"http://arxiv.org/abs/2311.12754v1","updated":"2023-11-21T17:59:14Z","published":"2023-11-21T17:59:14Z","title":"SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction","summary":" 3D occupancy prediction is an important task for the robustness of\nvision-centric autonomous driving, which aims to predict whether each point is\noccupied in the surrounding 3D space. Existing methods usually require 3D\noccupancy labels to produce meaningful results. However, it is very laborious\nto annotate the occupancy status of each voxel. In this paper, we propose\nSelfOcc to explore a self-supervised way to learn 3D occupancy using only video\nsequences. We first transform the images into the 3D space (e.g., bird's eye\nview) to obtain 3D representation of the scene. 
We directly impose constraints\non the 3D representations by treating them as signed distance fields. We can\nthen render 2D images of previous and future frames as self-supervision signals\nto learn the 3D representations. We propose an MVS-embedded strategy to\ndirectly optimize the SDF-induced weights with multiple depth proposals. Our\nSelfOcc outperforms the previous best method SceneRF by 58.7% using a single\nframe as input on SemanticKITTI and is the first self-supervised work that\nproduces reasonable 3D occupancy for surround cameras on Occ3D. SelfOcc\nproduces high-quality depth and achieves state-of-the-art results on novel\ndepth synthesis, monocular depth estimation, and surround-view depth estimation\non the SemanticKITTI, KITTI-2015, and nuScenes, respectively. Code:\nhttps://github.com/huang-yh/SelfOcc.\n","authors":["Yuanhui Huang","Wenzhao Zheng","Borui Zhang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2311.12754v1.pdf","comment":"Code is available at: https://github.com/huang-yh/SelfOcc"},{"id":"http://arxiv.org/abs/2310.02129v2","updated":"2023-11-21T17:59:04Z","published":"2023-10-03T15:10:46Z","title":"Unveiling the Pitfalls of Knowledge Editing for Large Language Models","summary":" As the cost associated with fine-tuning Large Language Models (LLMs)\ncontinues to rise, recent research efforts have pivoted towards developing\nmethodologies to edit implicit knowledge embedded within LLMs. Yet, there's\nstill a dark cloud lingering overhead -- will knowledge editing trigger\nbutterfly effect? since it is still unclear whether knowledge editing might\nintroduce side effects that pose potential risks or not. This paper pioneers\nthe investigation into the potential pitfalls associated with knowledge editing\nfor LLMs. To achieve this, we introduce new benchmark datasets and propose\ninnovative evaluation metrics. Our results underline two pivotal concerns: (1)\nKnowledge Conflict: Editing groups of facts that logically clash can magnify\nthe inherent inconsistencies in LLMs-a facet neglected by previous methods. (2)\nKnowledge Distortion: Altering parameters with the aim of editing factual\nknowledge can irrevocably warp the innate knowledge structure of LLMs.\nExperimental results vividly demonstrate that knowledge editing might\ninadvertently cast a shadow of unintended consequences on LLMs, which warrant\nattention and efforts for future works. Code is available at\nhttps://github.com/zjunlp/PitfallsKnowledgeEditing.\n","authors":["Zhoubo Li","Ningyu Zhang","Yunzhi Yao","Mengru Wang","Xi Chen","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02129v2.pdf","comment":"Work in progress, add more experiments"},{"id":"http://arxiv.org/abs/2311.12751v1","updated":"2023-11-21T17:52:30Z","published":"2023-11-21T17:52:30Z","title":"Towards Natural Language-Guided Drones: GeoText-1652 Benchmark with\n Spatially Relation Matching","summary":" Drone navigation through natural language commands remains a significant\nchallenge due to the lack of publicly available multi-modal datasets and the\nintricate demands of fine-grained visual-text alignment. In response to this\npressing need, we present a new human-computer interaction annotation benchmark\ncalled GeoText-1652, meticulously curated through a robust Large Language Model\n(LLM)-based data generation framework and the expertise of pre-trained vision\nmodels. 
This new dataset seamlessly extends the existing image dataset, \\ie,\nUniversity-1652, with spatial-aware text annotations, encompassing intricate\nimage-text-bounding box associations. Besides, we introduce a new optimization\nobjective to leverage fine-grained spatial associations, called blending\nspatial matching, for region-level spatial relation matching. Extensive\nexperiments reveal that our approach maintains an exceptional recall rate under\nvarying description complexities. This underscores the promising potential of\nour approach in elevating drone control and navigation through the seamless\nintegration of natural language commands in real-world scenarios.\n","authors":["Meng Chu","Zhedong Zheng","Wei Ji","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2311.12751v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.06185v2","updated":"2023-11-21T17:42:42Z","published":"2023-11-10T17:06:28Z","title":"An Automated Pipeline for Tumour-Infiltrating Lymphocyte Scoring in\n Breast Cancer","summary":" Tumour-infiltrating lymphocytes (TILs) are considered as a valuable\nprognostic markers in both triple-negative and human epidermal growth factor\nreceptor 2 (HER2) positive breast cancer. In this study, we introduce an\ninnovative deep learning pipeline based on the Efficient-UNet architecture to\npredict the TILs score for breast cancer whole-slide images (WSIs). We first\nsegment tumour and stromal regions in order to compute a tumour bulk mask. We\nthen detect TILs within the tumour-associated stroma, generating a TILs score\nby closely mirroring the pathologist's workflow. Our method exhibits\nstate-of-the-art performance in segmenting tumour/stroma areas and TILs\ndetection, as demonstrated by internal cross-validation on the TiGER Challenge\ntraining dataset and evaluation on the final leaderboards. Additionally, our\nTILs score proves competitive in predicting survival outcomes within the same\nchallenge, underscoring the clinical relevance and potential of our automated\nTILs scoring pipeline as a breast cancer prognostic tool.\n","authors":["Adam J Shephard","Mostafa Jahanifar","Ruoyu Wang","Muhammad Dawood","Simon Graham","Kastytis Sidlauskas","Syed Ali Khurram","Nasir M Rajpoot","Shan E Ahmed Raza"],"pdf_url":"https://arxiv.org/pdf/2311.06185v2.pdf","comment":"5 pages, 1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2304.14408v3","updated":"2023-11-21T17:21:28Z","published":"2023-03-16T17:30:51Z","title":"Using Scalable Computer Vision to Automate High-throughput Semiconductor\n Characterization","summary":" High-throughput materials synthesis methods have risen in popularity due to\ntheir potential to accelerate the design and discovery of novel functional\nmaterials, such as solution-processed semiconductors. After synthesis, key\nmaterial properties must be measured and characterized to validate discovery\nand provide feedback to optimization cycles. However, with the boom in\ndevelopment of high-throughput synthesis tools that champion production rates\nup to $10^4$ samples per hour with flexible form factors, most sample\ncharacterization methods are either slow (conventional rates of $10^1$ samples\nper hour, approximately 1000x slower) or rigid (e.g., designed for\nstandard-size microplates), resulting in a bottleneck that impedes the\nmaterials-design process. 
To overcome this challenge, we propose a set of\nautomated material property characterization (autocharacterization) tools that\nleverage the adaptive, parallelizable, and scalable nature of computer vision\nto accelerate the throughput of characterization by 85x compared to the\nnon-automated workflow. We demonstrate a generalizable composition mapping tool\nfor high-throughput synthesized binary material systems as well as two scalable\nautocharacterization algorithms that (1) autonomously compute the band gap of\n200 unique compositions in 6 minutes and (2) autonomously compute the degree of\ndegradation in 200 unique compositions in 20 minutes, generating ultra-high\ncompositional resolution trends of band gap and stability. We demonstrate that\nthe developed band gap and degradation detection autocharacterization methods\nachieve 98.5% accuracy and 96.9% accuracy, respectively, on the\nFA$_{1-x}$MA$_{x}$PbI$_3$, $0\\leq x \\leq 1$ perovskite semiconductor system.\n","authors":["Alexander E. Siemenn","Eunice Aissi","Fang Sheng","Armi Tiihonen","Hamide Kavak","Basita Das","Tonio Buonassisi"],"pdf_url":"https://arxiv.org/pdf/2304.14408v3.pdf","comment":"Manuscript 18 pages; Supplemental 20 pages"},{"id":"http://arxiv.org/abs/2303.17646v2","updated":"2023-11-21T17:07:46Z","published":"2023-03-30T18:23:20Z","title":"XPert: Peripheral Circuit & Neural Architecture Co-search for Area and\n Energy-efficient Xbar-based Computing","summary":" The hardware-efficiency and accuracy of Deep Neural Networks (DNNs)\nimplemented on In-memory Computing (IMC) architectures primarily depend on the\nDNN architecture and the peripheral circuit parameters. It is therefore\nessential to holistically co-search the network and peripheral parameters to\nachieve optimal performance. To this end, we propose XPert, which co-searches\nnetwork architecture in tandem with peripheral parameters such as the type and\nprecision of analog-to-digital converters, crossbar column sharing and the\nlayer-specific input precision using an optimization-based design space\nexploration. Compared to VGG16 baselines, XPert achieves 10.24x (4.7x) lower\nEDAP, 1.72x (1.62x) higher TOPS/W,1.93x (3x) higher TOPS/mm2 at 92.46% (56.7%)\naccuracy for CIFAR10 (TinyImagenet) datasets. The code for this paper is\navailable at https://github.com/Intelligent-Computing-Lab-Yale/XPert.\n","authors":["Abhishek Moitra","Abhiroop Bhattacharjee","Youngeun Kim","Priyadarshini Panda"],"pdf_url":"https://arxiv.org/pdf/2303.17646v2.pdf","comment":"Accepted to Design and Automation Conference (DAC)"},{"id":"http://arxiv.org/abs/2311.12722v1","updated":"2023-11-21T16:51:33Z","published":"2023-11-21T16:51:33Z","title":"Attacking Motion Planners Using Adversarial Perception Errors","summary":" Autonomous driving (AD) systems are often built and tested in a modular\nfashion, where the performance of different modules is measured using\ntask-specific metrics. These metrics should be chosen so as to capture the\ndownstream impact of each module and the performance of the system as a whole.\nFor example, high perception quality should enable prediction and planning to\nbe performed safely. Even though this is true in general, we show here that it\nis possible to construct planner inputs that score very highly on various\nperception quality metrics but still lead to planning failures. 
In an analogy\nto adversarial attacks on image classifiers, we call such inputs\n\\textbf{adversarial perception errors} and show they can be systematically\nconstructed using a simple boundary-attack algorithm. We demonstrate the\neffectiveness of this algorithm by finding attacks for two different black-box\nplanners in several urban and highway driving scenarios using the CARLA\nsimulator. Finally, we analyse the properties of these attacks and show that\nthey are isolated in the input space of the planner, and discuss their\nimplications for AD system deployment and testing.\n","authors":["Jonathan Sadeghi","Nicholas A. Lord","John Redford","Romain Mueller"],"pdf_url":"https://arxiv.org/pdf/2311.12722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05088v3","updated":"2023-11-21T16:35:26Z","published":"2023-04-11T09:31:07Z","title":"WEAR: An Outdoor Sports Dataset for Wearable and Egocentric Activity\n Recognition","summary":" Though research has shown the complementarity of camera- and inertial-based\ndata, datasets which offer both egocentric video and inertial-based sensor data\nremain scarce. In this paper, we introduce WEAR, an outdoor sports dataset for\nboth vision- and inertial-based human activity recognition (HAR). The dataset\ncomprises data from 18 participants performing a total of 18 different workout\nactivities with untrimmed inertial (acceleration) and camera (egocentric video)\ndata recorded at 10 different outside locations. Unlike previous egocentric\ndatasets, WEAR provides a challenging prediction scenario marked by purposely\nintroduced activity variations as well as an overall small information overlap\nacross modalities. Benchmark results obtained using each modality separately\nshow that each modality interestingly offers complementary strengths and\nweaknesses in their prediction performance. Further, in light of the recent\nsuccess of temporal action localization models following the architecture\ndesign of the ActionFormer, we demonstrate their versatility by applying them\nin a plain fashion using vision, inertial and combined (vision + inertial)\nfeatures as input. Results demonstrate both the applicability of vision-based\ntemporal action localization models for inertial data and fusing both\nmodalities by means of simple concatenation, with the combined approach (vision\n+ inertial features) being able to produce the highest mean average precision\nand close-to-best F1-score. The dataset and code to reproduce experiments is\npublicly available via: https://mariusbock.github.io/wear/\n","authors":["Marius Bock","Hilde Kuehne","Kristof Van Laerhoven","Michael Moeller"],"pdf_url":"https://arxiv.org/pdf/2304.05088v3.pdf","comment":"15 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2309.12969v2","updated":"2023-11-21T16:27:11Z","published":"2023-09-22T16:07:16Z","title":"Detect Every Thing with Few Examples","summary":" Open-set object detection aims at detecting arbitrary categories beyond those\nseen during training. Most recent advancements have adopted the open-vocabulary\nparadigm, utilizing vision-language backbones to represent categories with\nlanguage. In this paper, we introduce DE-ViT, an open-set object detector that\nemploys vision-only DINOv2 backbones and learns new categories through example\nimages instead of language. 
To improve general detection ability, we transform\nmulti-classification tasks into binary classification tasks while bypassing\nper-class inference, and propose a novel region propagation technique for\nlocalization. We evaluate DE-ViT on open-vocabulary, few-shot, and one-shot\nobject detection benchmark with COCO and LVIS. For COCO, DE-ViT outperforms the\nopen-vocabulary SoTA by 6.9 AP50 and achieves 50 AP50 in novel classes. DE-ViT\nsurpasses the few-shot SoTA by 15 mAP on 10-shot and 7.2 mAP on 30-shot and\none-shot SoTA by 2.8 AP50. For LVIS, DE-ViT outperforms the open-vocabulary\nSoTA by 2.2 mask AP and reaches 34.3 mask APr. Code is available at\nhttps://github.com/mlzxy/devit.\n","authors":["Xinyu Zhang","Yuting Wang","Abdeslam Boularias"],"pdf_url":"https://arxiv.org/pdf/2309.12969v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12704v1","updated":"2023-11-21T16:19:14Z","published":"2023-11-21T16:19:14Z","title":"Cascade Learning Localises Discriminant Features in Visual Scene\n Classification","summary":" Lack of interpretability of deep convolutional neural networks (DCNN) is a\nwell-known problem particularly in the medical domain as clinicians want\ntrustworthy automated decisions. One way to improve trust is to demonstrate the\nlocalisation of feature representations with respect to expert labeled regions\nof interest. In this work, we investigate the localisation of features learned\nvia two varied learning paradigms and demonstrate the superiority of one\nlearning approach with respect to localisation. Our analysis on medical and\nnatural datasets show that the traditional end-to-end (E2E) learning strategy\nhas a limited ability to localise discriminative features across multiple\nnetwork layers. We show that a layer-wise learning strategy, namely cascade\nlearning (CL), results in more localised features. Considering localisation\naccuracy, we not only show that CL outperforms E2E but that it is a promising\nmethod of predicting regions. On the YOLO object detection framework, our best\nresult shows that CL outperforms the E2E scheme by $2\\%$ in mAP.\n","authors":["Junwen Wang","Katayoun Farrahi"],"pdf_url":"https://arxiv.org/pdf/2311.12704v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08942v2","updated":"2023-11-21T16:14:54Z","published":"2022-11-16T14:44:27Z","title":"Differentially Private Optimizers Can Learn Adversarially Robust Models","summary":" Machine learning models have shone in a variety of domains and attracted\nincreasing attention from both the security and the privacy communities. One\nimportant yet worrying question is: Will training models under the differential\nprivacy (DP) constraint have an unfavorable impact on their adversarial\nrobustness? While previous works have postulated that privacy comes at the cost\nof worse robustness, we give the first theoretical analysis to show that DP\nmodels can indeed be robust and accurate, even sometimes more robust than their\nnaturally-trained non-private counterparts. We observe three key factors that\ninfluence the privacy-robustness-accuracy tradeoff: (1) hyper-parameters for DP\noptimizers are critical; (2) pre-training on public data significantly\nmitigates the accuracy and robustness drop; (3) choice of DP optimizers makes a\ndifference. 
With these factors set properly, we achieve 90\\% natural accuracy,\n72\\% robust accuracy ($+9\\%$ than the non-private model) under $l_2(0.5)$\nattack, and 69\\% robust accuracy ($+16\\%$ than the non-private model) with\npre-trained SimCLRv2 model under $l_\\infty(4/255)$ attack on CIFAR10 with\n$\\epsilon=2$. In fact, we show both theoretically and empirically that DP\nmodels are Pareto optimal on the accuracy-robustness tradeoff. Empirically, the\nrobustness of DP models is consistently observed across various datasets and\nmodels. We believe our encouraging results are a significant step towards\ntraining models that are private as well as robust.\n","authors":["Yuan Zhang","Zhiqi Bu"],"pdf_url":"https://arxiv.org/pdf/2211.08942v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.12311v3","updated":"2023-11-21T15:40:54Z","published":"2020-12-22T19:32:52Z","title":"Influencer Videos: Unboxing the Mystique","summary":" Influencer marketing has become a very popular tool to reach customers.\nDespite the rapid growth in influencer videos, there has been little research\non the effectiveness of their constituent features in explaining video\nengagement. We study YouTube influencers and analyze their unstructured video\ndata across text, audio and images using an \"interpretable deep learning\"\nframework that accomplishes both goals of prediction and interpretation. Our\nprediction-based approach analyzes unstructured data and finds that \"what is\nsaid\" in words (text) is more influential than \"how it is said\" in imagery\n(images) or acoustics (audio). Our novel interpretation-based approach is\nimplemented after completion of model prediction by analyzing the same source\nof unstructured data to measure importance attributed to the video features. We\neliminate several spurious relationships in two steps, identifying a subset of\nrelationships which are confirmed using theory. We uncover novel findings that\nestablish distinct associations for measures of shallow and deep engagement\nbased on the dual-system framework of human thinking. Our approach is validated\nusing simulated data, and we discuss the learnings from our findings for\ninfluencers and brands.\n","authors":["Prashant Rajaram","Puneet Manchanda"],"pdf_url":"https://arxiv.org/pdf/2012.12311v3.pdf","comment":"45 pages, Online Appendix"},{"id":"http://arxiv.org/abs/2311.12682v1","updated":"2023-11-21T15:39:21Z","published":"2023-11-21T15:39:21Z","title":"Transferring to Real-World Layouts: A Depth-aware Framework for Scene\n Adaptation","summary":" Scene segmentation via unsupervised domain adaptation (UDA) enables the\ntransfer of knowledge acquired from source synthetic data to real-world target\ndata, which largely reduces the need for manual pixel-level annotations in the\ntarget domain. To facilitate domain-invariant feature learning, existing\nmethods typically mix data from both the source domain and target domain by\nsimply copying and pasting the pixels. Such vanilla methods are usually\nsub-optimal since they do not take into account how well the mixed layouts\ncorrespond to real-world scenarios. Real-world scenarios are with an inherent\nlayout. We observe that semantic categories, such as sidewalks, buildings, and\nsky, display relatively consistent depth distributions, and could be clearly\ndistinguished in a depth map. 
Based on such observation, we propose a\ndepth-aware framework to explicitly leverage depth estimation to mix the\ncategories and facilitate the two complementary tasks, i.e., segmentation and\ndepth learning in an end-to-end manner. In particular, the framework contains a\nDepth-guided Contextual Filter (DCF) for\ndata augmentation and a cross-task\nencoder for contextual learning. DCF simulates the real-world layouts, while\nthe cross-task encoder further adaptively fuses the complementing features\nbetween two tasks. Besides, it is worth noting that several public datasets do\nnot provide depth annotation. Therefore, we leverage the off-the-shelf depth\nestimation network to generate the pseudo depth. Extensive experiments show\nthat our proposed methods, even with pseudo depth, achieve competitive\nperformance on two widely-used bench-marks, i.e. 77.7 mIoU on GTA to Cityscapes\nand 69.3 mIoU on Synthia to Cityscapes.\n","authors":["Mu Chen","Zhedong Zheng","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2311.12682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12679v1","updated":"2023-11-21T15:37:19Z","published":"2023-11-21T15:37:19Z","title":"BundleMoCap: Efficient, Robust and Smooth Motion Capture from Sparse\n Multiview Videos","summary":" Capturing smooth motions from videos using markerless techniques typically\ninvolves complex processes such as temporal constraints, multiple stages with\ndata-driven regression and optimization, and bundle solving over temporal\nwindows. These processes can be inefficient and require tuning multiple\nobjectives across stages. In contrast, BundleMoCap introduces a novel and\nefficient approach to this problem. It solves the motion capture task in a\nsingle stage, eliminating the need for temporal smoothness objectives while\nstill delivering smooth motions. BundleMoCap outperforms the state-of-the-art\nwithout increasing complexity. The key concept behind BundleMoCap is manifold\ninterpolation between latent keyframes. By relying on a local manifold\nsmoothness assumption, we can efficiently solve a bundle of frames using a\nsingle code. Additionally, the method can be implemented as a sliding window\noptimization and requires only the first frame to be properly initialized,\nreducing the overall computational burden. BundleMoCap's strength lies in its\nability to achieve high-quality motion capture results with simplicity and\nefficiency. More details can be found at https://moverseai.github.io/bundle/.\n","authors":["Georgios Albanis","Nikolaos Zioulis","Kostas Kolomvatsos"],"pdf_url":"https://arxiv.org/pdf/2311.12679v1.pdf","comment":"Published in European Conference on Visual Media Production (CVMP\n '23)"},{"id":"http://arxiv.org/abs/2311.00187v2","updated":"2023-11-21T15:25:15Z","published":"2023-10-31T23:19:30Z","title":"Decodable and Sample Invariant Continuous Object Encoder","summary":" We propose Hyper-Dimensional Function Encoding (HDFE). Given samples of a\ncontinuous object (e.g. a function), HDFE produces an explicit vector\nrepresentation of the given object, invariant to the sample distribution and\ndensity. Sample distribution and density invariance enables HDFE to\nconsistently encode continuous objects regardless of their sampling, and\ntherefore allows neural networks to receive continuous objects as inputs for\nmachine learning tasks, such as classification and regression. 
Besides, HDFE\ndoes not require any training and is proved to map the object into an organized\nembedding space, which facilitates the training of the downstream tasks. In\naddition, the encoding is decodable, which enables neural networks to regress\ncontinuous objects by regressing their encodings. Therefore, HDFE serves as an\ninterface for processing continuous objects.\n We apply HDFE to function-to-function mapping, where vanilla HDFE achieves\ncompetitive performance as the state-of-the-art algorithm. We apply HDFE to\npoint cloud surface normal estimation, where a simple replacement from PointNet\nto HDFE leads to immediate 12% and 15% error reductions in two benchmarks. In\naddition, by integrating HDFE into the PointNet-based SOTA network, we improve\nthe SOTA baseline by 2.5% and 1.7% in the same benchmarks.\n","authors":["Dehao Yuan","Furong Huang","Cornelia Fermüller","Yiannis Aloimonos"],"pdf_url":"https://arxiv.org/pdf/2311.00187v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2004.07780v5","updated":"2023-11-21T15:22:43Z","published":"2020-04-16T17:18:49Z","title":"Shortcut Learning in Deep Neural Networks","summary":" Deep learning has triggered the current rise of artificial intelligence and\nis the workhorse of today's machine intelligence. Numerous success stories have\nrapidly spread all over science, industry and society, but its limitations have\nonly recently come into focus. In this perspective we seek to distill how many\nof deep learning's problems can be seen as different symptoms of the same\nunderlying problem: shortcut learning. Shortcuts are decision rules that\nperform well on standard benchmarks but fail to transfer to more challenging\ntesting conditions, such as real-world scenarios. Related issues are known in\nComparative Psychology, Education and Linguistics, suggesting that shortcut\nlearning may be a common characteristic of learning systems, biological and\nartificial alike. Based on these observations, we develop a set of\nrecommendations for model interpretation and benchmarking, highlighting recent\nadvances in machine learning to improve robustness and transferability from the\nlab to real-world applications.\n","authors":["Robert Geirhos","Jörn-Henrik Jacobsen","Claudio Michaelis","Richard Zemel","Wieland Brendel","Matthias Bethge","Felix A. Wichmann"],"pdf_url":"https://arxiv.org/pdf/2004.07780v5.pdf","comment":"perspective article published at Nature Machine Intelligence\n (https://doi.org/10.1038/s42256-020-00257-z)"},{"id":"http://arxiv.org/abs/2311.11908v2","updated":"2023-11-21T15:17:00Z","published":"2023-11-20T16:40:29Z","title":"Continual Learning: Applications and the Road Forward","summary":" Continual learning is a sub-field of machine learning, which aims to allow\nmachine learning models to continuously learn on new data, by accumulating\nknowledge without forgetting what was learned in the past. In this work, we\ntake a step back, and ask: \"Why should one care about continual learning in the\nfirst place?\". We set the stage by surveying recent continual learning papers\npublished at three major machine learning conferences, and show that\nmemory-constrained settings dominate the field. Then, we discuss five open\nproblems in machine learning, and even though they seem unrelated to continual\nlearning at first sight, we show that continual learning will inevitably be\npart of their solution. These problems are model-editing, personalization,\non-device learning, faster (re-)training and reinforcement learning. 
Finally,\nby comparing the desiderata from these unsolved problems and the current\nassumptions in continual learning, we highlight and discuss four future\ndirections for continual learning research. We hope that this work offers an\ninteresting perspective on the future of continual learning, while displaying\nits potential value and the paths we have to pursue in order to make it\nsuccessful. This work is the result of the many discussions the authors had at\nthe Dagstuhl seminar on Deep Continual Learning, in March 2023.\n","authors":["Eli Verwimp","Rahaf Aljundi","Shai Ben-David","Matthias Bethge","Andrea Cossu","Alexander Gepperth","Tyler L. Hayes","Eyke Hüllermeier","Christopher Kanan","Dhireesha Kudithipudi","Christoph H. Lampert","Martin Mundt","Razvan Pascanu","Adrian Popescu","Andreas S. Tolias","Joost van de Weijer","Bing Liu","Vincenzo Lomonaco","Tinne Tuytelaars","Gido M. van de Ven"],"pdf_url":"https://arxiv.org/pdf/2311.11908v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12663v1","updated":"2023-11-21T15:13:18Z","published":"2023-11-21T15:13:18Z","title":"Similar Document Template Matching Algorithm","summary":" This study outlines a comprehensive methodology for verifying medical\ndocuments, integrating advanced techniques in template extraction, comparison,\nand fraud detection. It begins with template extraction using sophisticated\nregion-of-interest (ROI) methods, incorporating contour analysis and edge\nidentification. Pre-processing steps ensure template clarity through\nmorphological operations and adaptive thresholding. The template comparison\nalgorithm utilizes advanced feature matching with key points and descriptors,\nenhancing robustness through histogram-based analysis for accounting\nvariations. Fraud detection involves the SSIM computation and OCR for textual\ninformation extraction. The SSIM quantifies structural similarity, aiding in\npotential match identification. OCR focuses on critical areas like patient\ndetails, provider information, and billing amounts. Extracted information is\ncompared with a reference dataset, and confidence thresholding ensures reliable\nfraud detection. Adaptive parameters enhance system flexibility for dynamic\nadjustments to varying document layouts. This methodology provides a robust\napproach to medical document verification, addressing complexities in template\nextraction, comparison, fraud detection, and adaptability to diverse document\nstructures.\n","authors":["Harshitha Yenigalla","Bommareddy Revanth Srinivasa Reddy","Batta Venkata Rahul","Nannapuraju Hemanth Raju"],"pdf_url":"https://arxiv.org/pdf/2311.12663v1.pdf","comment":"8 pages,8 figures"},{"id":"http://arxiv.org/abs/2311.12660v1","updated":"2023-11-21T15:08:17Z","published":"2023-11-21T15:08:17Z","title":"Visually Guided Object Grasping","summary":" In this paper we present a visual servoing approach to the problem of object\ngrasping and more generally, to the problem of aligning an end-effector with an\nobject. First we extend the method proposed by Espiau et al. [1] to the case of\na camera which is not mounted onto the robot being controlled and we stress the\nimportance of the real-time estimation of the image Jacobian. Second, we show\nhow to represent a grasp or more generally, an alignment between two solids in\n3-D projective space using an uncalibrated stereo rig. Such a 3-D projective\nrepresentation is view-invariant in the sense that it can be easily mapped into\nan image set-point without any knowledge about the camera parameters. 
Third, we\nperform an analysis of the performances of the visual servoing algorithm and of\nthe grasping precision that can be expected from this type of approach.\n","authors":["Radu Horaud","Fadi Dornaika","Bernard Espiau"],"pdf_url":"https://arxiv.org/pdf/2311.12660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12655v1","updated":"2023-11-21T14:57:24Z","published":"2023-11-21T14:57:24Z","title":"Hand-Eye Calibration","summary":" Whenever a sensor is mounted on a robot hand it is important to know the\nrelationship between the sensor and the hand. The problem of determining this\nrelationship is referred to as hand-eye calibration, which is important in at\nleast two types of tasks: (i) map sensor centered measurements into the robot\nworkspace and (ii) allow the robot to precisely move the sensor. In the past\nsome solutions were proposed in the particular case of a camera. With almost no\nexception, all existing solutions attempt to solve the homogeneous matrix\nequation AX=XB. First we show that there are two possible formulations of the\nhand-eye calibration problem. One formulation is the classical one that we just\nmentioned. A second formulation takes the form of the following homogeneous\nmatrix equation: MY=M'YB. The advantage of the latter is that the extrinsic and\nintrinsic camera parameters need not be made explicit. Indeed, this formulation\ndirectly uses the 3 by 4 perspective matrices (M and M') associated with two\npositions of the camera. Moreover, this formulation together with the classical\none cover a wider range of camera-based sensors to be calibrated with respect\nto the robot hand. Second, we develop a common mathematical framework to solve\nfor the hand-eye calibration problem using either of the two formulations. We\npresent two methods, (i) a rotation then translation and (ii) a non-linear\nsolver for rotation and translation. Third, we perform a stability analysis\nboth for our two methods and for the classical linear method developed. In the\nlight of this comparison, the non-linear optimization method, that solves for\nrotation and translation simultaneously, seems to be the most robust one with\nrespect to noise and to measurement errors.\n","authors":["Radu Horaud","Fadi Dornaika"],"pdf_url":"https://arxiv.org/pdf/2311.12655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06504v2","updated":"2023-11-21T14:57:09Z","published":"2023-11-11T08:01:40Z","title":"SCL-VI: Self-supervised Context Learning for Visual Inspection of\n Industrial Defects","summary":" The unsupervised visual inspection of defects in industrial products poses a\nsignificant challenge due to substantial variations in product surfaces.\nCurrent unsupervised models struggle to strike a balance between detecting\ntexture and object defects, lacking the capacity to discern latent\nrepresentations and intricate features. In this paper, we present a novel\nself-supervised learning algorithm designed to derive an optimal encoder by\ntackling the renowned jigsaw puzzle. Our approach involves dividing the target\nimage into nine patches, tasking the encoder with predicting the relative\nposition relationships between any two patches to extract rich semantics.\nSubsequently, we introduce an affinity-augmentation method to accentuate\ndifferences between normal and abnormal latent representations. Leveraging the\nclassic support vector data description algorithm yields final detection\nresults. 
Experimental outcomes demonstrate that our proposed method achieves\noutstanding detection and segmentation performance on the widely used MVTec AD\ndataset, with rates of 95.8% and 96.8%, respectively, establishing a\nstate-of-the-art benchmark for both texture and object defects. Comprehensive\nexperimentation underscores the effectiveness of our approach in diverse\nindustrial applications.\n","authors":["Peng Wang","Haiming Yao","Wenyong Yu"],"pdf_url":"https://arxiv.org/pdf/2311.06504v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12651v1","updated":"2023-11-21T14:53:02Z","published":"2023-11-21T14:53:02Z","title":"Mobile-Seed: Joint Semantic Segmentation and Boundary Detection for\n Mobile Robots","summary":" Precise and rapid delineation of sharp boundaries and robust semantics is\nessential for numerous downstream robotic tasks, such as robot grasping and\nmanipulation, real-time semantic mapping, and online sensor calibration\nperformed on edge computing units. Although boundary detection and semantic\nsegmentation are complementary tasks, most studies focus on lightweight models\nfor semantic segmentation but overlook the critical role of boundary detection.\nIn this work, we introduce Mobile-Seed, a lightweight, dual-task framework\ntailored for simultaneous semantic segmentation and boundary detection. Our\nframework features a two-stream encoder, an active fusion decoder (AFD) and a\ndual-task regularization approach. The encoder is divided into two pathways:\none captures category-aware semantic information, while the other discerns\nboundaries from multi-scale features. The AFD module dynamically adapts the\nfusion of semantic and boundary information by learning channel-wise\nrelationships, allowing for precise weight assignment of each channel.\nFurthermore, we introduce a regularization loss to mitigate the conflicts in\ndual-task learning and deep diversity supervision. Compared to existing\nmethods, the proposed Mobile-Seed offers a lightweight framework to\nsimultaneously improve semantic segmentation performance and accurately locate\nobject boundaries. Experiments on the Cityscapes dataset have shown that\nMobile-Seed achieves notable improvement over the state-of-the-art (SOTA)\nbaseline by 2.2 percentage points (pp) in mIoU and 4.2 pp in mF-score, while\nmaintaining an online inference speed of 23.9 frames-per-second (FPS) with\n1024x2048 resolution input on an RTX 2080 Ti GPU. Additional experiments on\nCamVid and PASCAL Context datasets confirm our method's generalizability. Code\nand additional results are publicly available at\n\\url{https://martin-liao.github.io/Mobile-Seed/}.\n","authors":["Youqi Liao","Shuhao Kang","Jianping Li","Yang Liu","Yun Liu","Zhen Dong","Bisheng Yang","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12651v1.pdf","comment":"8 pages, IEEE conference/letter underreview. Code and additional\n results are available at: \\url{https://martin-liao.github.io/Mobile-Seed/}"},{"id":"http://arxiv.org/abs/2311.12641v1","updated":"2023-11-21T14:41:21Z","published":"2023-11-21T14:41:21Z","title":"Polyhedral Object Recognition by Indexing","summary":" In computer vision, the indexing problem is the problem of recognizing a few\nobjects in a large database of objects while avoiding the help of the classical\nimage-feature-to-object-feature matching paradigm. In this paper we address the\nproblem of recognizing 3-D polyhedral objects from 2-D images by indexing. 
Both\nthe objects to be recognized and the images are represented by weighted graphs.\nThe indexing problem is therefore the problem of determining whether a graph\nextracted from the image is present or absent in a database of model graphs. We\nintroduce a novel method for performing this graph indexing process which is\nbased both on polynomial characterization of binary and weighted graphs and on\nhashing. We describe in detail this polynomial characterization and then we\nshow how it can be used in the context of polyhedral object recognition. Next\nwe describe a practical recognition-by-indexing system that includes the\norganization of the database, the representation of polyhedral objects in terms\nof 2-D characteristic views, the representation of this views in terms of\nweighted graphs, and the associated image processing. Finally, some\nexperimental results allow the evaluation of the system performance.\n","authors":["Radu Horaud","Humberto Sossa"],"pdf_url":"https://arxiv.org/pdf/2311.12641v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12639v1","updated":"2023-11-21T14:39:18Z","published":"2023-11-21T14:39:18Z","title":"KNVQA: A Benchmark for evaluation knowledge-based VQA","summary":" Within the multimodal field, large vision-language models (LVLMs) have made\nsignificant progress due to their strong perception and reasoning capabilities\nin the visual and language systems. However, LVLMs are still plagued by the two\ncritical issues of object hallucination and factual accuracy, which limit the\npracticality of LVLMs in different scenarios. Furthermore, previous evaluation\nmethods focus more on the comprehension and reasoning of language content but\nlack a comprehensive evaluation of multimodal interactions, thereby resulting\nin potential limitations. To this end, we propose a novel KNVQA-Eval, which is\ndevoted to knowledge-based VQA task evaluation to reflect the factuality of\nmultimodal LVLMs. To ensure the robustness and scalability of the evaluation,\nwe develop a new KNVQA dataset by incorporating human judgment and perception,\naiming to evaluate the accuracy of standard answers relative to AI-generated\nanswers in knowledge-based VQA. This work not only comprehensively evaluates\nthe contextual information of LVLMs using reliable human annotations, but also\nfurther analyzes the fine-grained capabilities of current methods to reveal\npotential avenues for subsequent optimization of LVLMs-based estimators. Our\nproposed VQA-Eval and corresponding dataset KNVQA will facilitate the\ndevelopment of automatic evaluation tools with the advantages of low cost,\nprivacy protection, and reproducibility. Our code will be released upon\npublication.\n","authors":["Sirui Cheng","Siyu Zhang","Jiayi Wu","Muchen Lan"],"pdf_url":"https://arxiv.org/pdf/2311.12639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10122v2","updated":"2023-11-21T14:37:30Z","published":"2023-11-16T10:59:44Z","title":"Video-LLaVA: Learning United Visual Representation by Alignment Before\n Projection","summary":" The Large Vision-Language Model (LVLM) has enhanced the performance of\nvarious downstream tasks in visual-language understanding. Most existing\napproaches encode images and videos into separate feature spaces, which are\nthen fed as inputs to large language models. 
However, due to the lack of\nunified tokenization for images and videos, namely misalignment before\nprojection, it becomes challenging for a Large Language Model (LLM) to learn\nmulti-modal interactions from several poor projection layers. In this work, we\nunify visual representation into the language feature space to advance the\nfoundational LLM towards a unified LVLM. As a result, we establish a simple but\nrobust LVLM baseline, Video-LLaVA, which learns from a mixed dataset of images\nand videos, mutually enhancing each other. Video-LLaVA achieves superior\nperformances on a broad range of 9 image benchmarks across 5 image\nquestion-answering datasets and 4 image benchmark toolkits. Additionally, our\nVideo-LLaVA also outperforms Video-ChatGPT by 5.8%, 9.9%, 18.6%, and 10.1% on\nMSRVTT, MSVD, TGIF, and ActivityNet, respectively. Notably, extensive\nexperiments demonstrate that Video-LLaVA mutually benefits images and videos\nwithin a unified visual representation, outperforming models designed\nspecifically for images or videos. We aim for this work to provide modest\ninsights into the multi-modal inputs for the LLM.\n","authors":["Bin Lin","Yang Ye","Bin Zhu","Jiaxi Cui","Munan Ning","Peng Jin","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2311.10122v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12631v1","updated":"2023-11-21T14:24:37Z","published":"2023-11-21T14:24:37Z","title":"GPT4Motion: Scripting Physical Motions in Text-to-Video Generation via\n Blender-Oriented GPT Planning","summary":" Recent advances in text-to-video generation have harnessed the power of\ndiffusion models to create visually compelling content conditioned on text\nprompts. However, they usually encounter high computational costs and often\nstruggle to produce videos with coherent physical motions. To tackle these\nissues, we propose GPT4Motion, a training-free framework that leverages the\nplanning capability of large language models such as GPT, the physical\nsimulation strength of Blender, and the excellent image generation ability of\ntext-to-image diffusion models to enhance the quality of video synthesis.\nSpecifically, GPT4Motion employs GPT-4 to generate a Blender script based on a\nuser textual prompt, which commands Blender's built-in physics engine to craft\nfundamental scene components that encapsulate coherent physical motions across\nframes. Then these components are inputted into Stable Diffusion to generate a\nvideo aligned with the textual prompt. Experimental results on three basic\nphysical motion scenarios, including rigid object drop and collision, cloth\ndraping and swinging, and liquid flow, demonstrate that GPT4Motion can generate\nhigh-quality videos efficiently in maintaining motion coherency and entity\nconsistency. GPT4Motion offers new insights in text-to-video research,\nenhancing its quality and broadening its horizon for future explorations.\n","authors":["Jiaxi Lv","Yi Huang","Mingfu Yan","Jiancheng Huang","Jianzhuang Liu","Yifan Liu","Yafei Wen","Xiaoxin Chen","Shifeng Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12631v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12623v1","updated":"2023-11-21T14:16:57Z","published":"2023-11-21T14:16:57Z","title":"Bridging Generalization Gaps in High Content Imaging Through Online\n Self-Supervised Domain Adaptation","summary":" High Content Imaging (HCI) plays a vital role in modern drug discovery and\ndevelopment pipelines, facilitating various stages from hit identification to\ncandidate drug characterization. 
Applying machine learning models to these\ndatasets can prove challenging as they typically consist of multiple batches,\naffected by experimental variation, especially if different imaging equipment\nhave been used. Moreover, as new data arrive, it is preferable that they are\nanalyzed in an online fashion. To overcome this, we propose CODA, an online\nself-supervised domain adaptation approach. CODA divides the classifier's role\ninto a generic feature extractor and a task-specific model. We adapt the\nfeature extractor's weights to the new domain using cross-batch\nself-supervision while keeping the task-specific model unchanged. Our results\ndemonstrate that this strategy significantly reduces the generalization gap,\nachieving up to a 300% improvement when applied to data from different labs\nutilizing different microscopes. CODA can be applied to new, unlabeled\nout-of-domain data sources of different sizes, from a single plate to multiple\nexperimental batches.\n","authors":["Johan Fredin Haslum","Christos Matsoukas","Karl-Johan Leuchowius","Kevin Smith"],"pdf_url":"https://arxiv.org/pdf/2311.12623v1.pdf","comment":"IEEE/CVF Winter Conference on Applications of Computer Vision (WACV\n 2024)"},{"id":"http://arxiv.org/abs/2311.12621v1","updated":"2023-11-21T14:12:17Z","published":"2023-11-21T14:12:17Z","title":"Crowd management, crime detection, work monitoring using aiml","summary":" This research endeavors to harness the potential of existing Closed-Circuit\nTelevision (CCTV) networks for a comprehensive approach to crowd management,\ncrime prevention, and workplace monitoring through the integration of\nArtificial Intelligence (AI) and Machine Learning (ML) technologies. The\nprimary objective is to develop and implement advanced algorithms capable of\nreal-time analysis of video feeds, enabling the identification and assessment\nof crowd dynamics, early detection of potential criminal activities, and\ncontinuous monitoring of workplace environments. By leveraging AI/ML, the\nproject aims to optimize surveillance capabilities, thereby enhancing public\nsafety measures and improving organizational productivity. This initiative\nunderscores the transformative impact that intelligent video analytics can have\non existing infrastructure, mitigating the need for extensive system overhauls\nwhile significantly advancing security and operational efficiency.\n","authors":["P. R. Adithya","Dheepak. S","B. Akash","Harshini. V","Sai Lakshana"],"pdf_url":"https://arxiv.org/pdf/2311.12621v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12617v1","updated":"2023-11-21T14:03:16Z","published":"2023-11-21T14:03:16Z","title":"Leveraging Unlabeled Data for 3D Medical Image Segmentation through\n Self-Supervised Contrastive Learning","summary":" Current 3D semi-supervised segmentation methods face significant challenges\nsuch as limited consideration of contextual information and the inability to\ngenerate reliable pseudo-labels for effective unsupervised data use. To address\nthese challenges, we introduce two distinct subnetworks designed to explore and\nexploit the discrepancies between them, ultimately correcting the erroneous\nprediction results. More specifically, we identify regions of inconsistent\npredictions and initiate a targeted verification training process. 
This\nprocedure strategically fine-tunes and harmonizes the predictions of the\nsubnetworks, leading to enhanced utilization of contextual information.\nFurthermore, to adaptively fine-tune the network's representational capacity\nand reduce prediction uncertainty, we employ a self-supervised contrastive\nlearning paradigm. For this, we use the network's confidence to distinguish\nbetween reliable and unreliable predictions. The model is then trained to\neffectively minimize unreliable predictions. Our experimental results for organ\nsegmentation, obtained from clinical MRI and CT scans, demonstrate the\neffectiveness of our approach when compared to state-of-the-art methods. The\ncodebase is accessible on\n\\href{https://github.com/xmindflow/SSL-contrastive}{GitHub}.\n","authors":["Sanaz Karimijafarbigloo","Reza Azad","Yury Velichko","Ulas Bagci","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2311.12617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01446v3","updated":"2023-11-21T14:02:33Z","published":"2023-09-04T08:54:20Z","title":"Open Sesame! Universal Black Box Jailbreaking of Large Language Models","summary":" Large language models (LLMs), designed to provide helpful and safe responses,\noften rely on alignment techniques to align with user intent and social\nguidelines. Unfortunately, this alignment can be exploited by malicious actors\nseeking to manipulate an LLM's outputs for unintended purposes. In this paper\nwe introduce a novel approach that employs a genetic algorithm (GA) to\nmanipulate LLMs when model architecture and parameters are inaccessible. The GA\nattack works by optimizing a universal adversarial prompt that -- when combined\nwith a user's query -- disrupts the attacked model's alignment, resulting in\nunintended and potentially harmful outputs. Our novel approach systematically\nreveals a model's limitations and vulnerabilities by uncovering instances where\nits responses deviate from expected behavior. Through extensive experiments we\ndemonstrate the efficacy of our technique, thus contributing to the ongoing\ndiscussion on responsible AI development by providing a diagnostic tool for\nevaluating and enhancing alignment of LLMs with human intent. To our knowledge\nthis is the first automated universal black box jailbreak attack.\n","authors":["Raz Lapid","Ron Langberg","Moshe Sipper"],"pdf_url":"https://arxiv.org/pdf/2309.01446v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11306v2","updated":"2023-11-21T13:59:31Z","published":"2023-11-19T11:57:01Z","title":"UMAAF: Unveiling Aesthetics via Multifarious Attributes of Images","summary":" With the increasing prevalence of smartphones and websites, Image Aesthetic\nAssessment (IAA) has become increasingly crucial. While the significance of\nattributes in IAA is widely recognized, many attribute-based methods lack\nconsideration for the selection and utilization of aesthetic attributes. Our\ninitial step involves the acquisition of aesthetic attributes from both intra-\nand inter-perspectives. Within the intra-perspective, we extract the direct\nvisual attributes of images, constituting the absolute attribute. In the\ninter-perspective, our focus lies in modeling the relative score relationships\nbetween images within the same sequence, forming the relative attribute. Then,\nto better utilize image attributes in aesthetic assessment, we propose the\nUnified Multi-attribute Aesthetic Assessment Framework (UMAAF) to model both\nabsolute and relative attributes of images. 
For absolute attributes, we\nleverage multiple absolute-attribute perception modules and an\nabsolute-attribute interacting network. The absolute-attribute perception\nmodules are first pre-trained on several absolute-attribute learning tasks and\nthen used to extract corresponding absolute attribute features. The\nabsolute-attribute interacting network adaptively learns the weight of diverse\nabsolute-attribute features, effectively integrating them with generic\naesthetic features from various absolute-attribute perspectives and generating\nthe aesthetic prediction. To model the relative attribute of images, we\nconsider the relative ranking and relative distance relationships between\nimages in a Relative-Relation Loss function, which boosts the robustness of the\nUMAAF. Furthermore, UMAAF achieves state-of-the-art performance on TAD66K and\nAVA datasets, and multiple experiments demonstrate the effectiveness of each\nmodule and the model's alignment with human preference.\n","authors":["Weijie Li","Yitian Wan","Xingjiao Wu","Junjie Xu","Cheng Jin","Liang He"],"pdf_url":"https://arxiv.org/pdf/2311.11306v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05858v2","updated":"2023-11-21T13:55:33Z","published":"2023-11-10T03:54:40Z","title":"Layer-wise Auto-Weighting for Non-Stationary Test-Time Adaptation","summary":" Given the inevitability of domain shifts during inference in real-world\napplications, test-time adaptation (TTA) is essential for model adaptation\nafter deployment. However, the real-world scenario of continuously changing\ntarget distributions presents challenges including catastrophic forgetting and\nerror accumulation. Existing TTA methods for non-stationary domain shifts,\nwhile effective, incur excessive computational load, making them impractical\nfor on-device settings. In this paper, we introduce a layer-wise auto-weighting\nalgorithm for continual and gradual TTA that autonomously identifies layers for\npreservation or concentrated adaptation. By leveraging the Fisher Information\nMatrix (FIM), we first design the learning weight to selectively focus on\nlayers associated with log-likelihood changes while preserving unrelated ones.\nThen, we further propose an exponential min-max scaler to make certain layers\nnearly frozen while mitigating outliers. This minimizes forgetting and error\naccumulation, leading to efficient adaptation to non-stationary target\ndistribution. Experiments on CIFAR-10C, CIFAR-100C, and ImageNet-C show our\nmethod outperforms conventional continual and gradual TTA approaches while\nsignificantly reducing computational load, highlighting the importance of\nFIM-based learning weight in adapting to continuously or gradually shifting\ntarget domains.\n","authors":["Junyoung Park","Jin Kim","Hyeongjun Kwon","Ilhoon Yoon","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2311.05858v2.pdf","comment":"Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2311.12610v1","updated":"2023-11-21T13:52:31Z","published":"2023-11-21T13:52:31Z","title":"ChessVision -- A Dataset for Logically Coherent Multi-label\n Classification","summary":" Starting with early successes in computer vision tasks, deep learning based\ntechniques have since overtaken state of the art approaches in a multitude of\ndomains. However, it has been demonstrated time and again that these techniques\nfail to capture semantic context and logical constraints, instead often relying\non spurious correlations to arrive at the answer. 
Since application of deep\nlearning techniques to critical scenarios are dependent on adherence to domain\nspecific constraints, several attempts have been made to address this issue.\nOne limitation holding back a thorough exploration of this area, is a lack of\nsuitable datasets which feature a rich set of rules. In order to address this,\nwe present the ChessVision Dataset, consisting of 200,000+ images of annotated\nchess games in progress, requiring recreation of the game state from its\ncorresponding image. This is accompanied by a curated set of rules which\nconstrains the set of predictions to \"reasonable\" game states, and are designed\nto probe key semantic abilities like localization and enumeration. Alongside\nstandard metrics, additional metrics to measure performance with regards to\nlogical consistency is presented. We analyze several popular and state of the\nart vision models on this task, and show that, although their performance on\nstandard metrics are laudable, they produce a plethora of incoherent results,\nindicating that this dataset presents a significant challenge for future works.\n","authors":["Soumadeep Saha","Utpal Garain"],"pdf_url":"https://arxiv.org/pdf/2311.12610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12608v1","updated":"2023-11-21T13:49:28Z","published":"2023-11-21T13:49:28Z","title":"Adaptive Dense Pseudo Label Selection for Semi-supervised Oriented\n Object Detection","summary":" Recently, dense pseudo-label, which directly selects pseudo labels from the\noriginal output of the teacher model without any complicated post-processing\nsteps, has received considerable attention in semi-supervised object detection\n(SSOD). However, for the multi-oriented and dense objects that are common in\naerial scenes, existing dense pseudo-label selection methods are inefficient\nand impede the performance in semi-supervised oriented object detection.\nTherefore, we propose Adaptive Dense Pseudo Label Selection (ADPLS) for\nsemi-supervised oriented object detection. In ADPLS, we design a simple but\neffective adaptive mechanism to guide the selection of dense pseudo labels.\nSpecifically, we propose the mean Feature-Richness Score (mFRS) to estimate the\ndensity of potential objects and use this score to adjust the number of dense\npseudo labels. On the DOTA-v1.5 benchmark, the proposed method outperforms\nprevious methods especially when labeled data are scarce. For example, it\nachieves 49.78 mAP given only 5% of annotated data, which surpasses previous\nstate-of-the-art method given 10% of annotated data by 1.15 mAP. Our codes will\nbe available soon.\n","authors":["Tong Zhao","Qiang Fang","Shuohao Shi","Xin Xu"],"pdf_url":"https://arxiv.org/pdf/2311.12608v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2207.04934v2","updated":"2023-11-21T13:46:46Z","published":"2022-07-11T15:15:33Z","title":"Multi-level Geometric Optimization for Regularised Constrained Linear\n Inverse Problems","summary":" We present a geometric multilevel optimization approach that smoothly\nincorporates box constraints. Given a box constrained optimization problem, we\nconsider a hierarchy of models with varying discretization levels. Finer models\nare accurate but expensive to compute, while coarser models are less accurate\nbut cheaper to compute. When working at the fine level, multilevel optimisation\ncomputes the search direction based on a coarser model which speeds up updates\nat the fine level. 
Moreover, exploiting geometry induced by the hierarchy the\nfeasibility of the updates is preserved. In particular, our approach extends\nclassical components of multigrid methods like restriction and prolongation to\nthe Riemannian structure of our constraints.\n","authors":["Sebastian Müller","Stefania Petra","Matthias Zisler"],"pdf_url":"https://arxiv.org/pdf/2207.04934v2.pdf","comment":"25 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.12603v1","updated":"2023-11-21T13:43:16Z","published":"2023-11-21T13:43:16Z","title":"Surgical Temporal Action-aware Network with Sequence Regularization for\n Phase Recognition","summary":" To assist surgeons in the operating theatre, surgical phase recognition is\ncritical for developing computer-assisted surgical systems, which requires\ncomprehensive understanding of surgical videos. Although existing studies made\ngreat progress, there are still two significant limitations worthy of\nimprovement. First, due to the compromise of resource consumption, frame-wise\nvisual features are extracted by 2D networks and disregard spatial and temporal\nknowledge of surgical actions, which hinders subsequent inter-frame modeling\nfor phase prediction. Second, these works simply utilize ordinary\nclassification loss with one-hot phase labels to optimize the phase\npredictions, and cannot fully explore surgical videos under inadequate\nsupervision. To overcome these two limitations, we propose a Surgical Temporal\nAction-aware Network with sequence Regularization, named STAR-Net, to recognize\nsurgical phases more accurately from input videos. Specifically, we propose an\nefficient multi-scale surgical temporal action (MS-STA) module, which\nintegrates visual features with spatial and temporal knowledge of surgical\nactions at the cost of 2D networks. Moreover, we devise the dual-classifier\nsequence regularization (DSR) to facilitate the training of STAR-Net by the\nsequence guidance of an auxiliary classifier with a smaller capacity. Our\nSTAR-Net with MS-STA and DSR can exploit visual features of surgical actions\nwith effective regularization, thereby leading to the superior performance of\nsurgical phase recognition. Extensive experiments on a large-scale gastrectomy\nsurgery dataset and the public Cholec80 benchmark prove that our STAR-Net\nsignificantly outperforms state-of-the-arts of surgical phase recognition.\n","authors":["Zhen Chen","Yuhao Zhai","Jun Zhang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12603v1.pdf","comment":"Accepted by 2023 IEEE International Conference on Bioinformatics and\n Biomedicine (BIBM 2023)"},{"id":"http://arxiv.org/abs/2311.12602v1","updated":"2023-11-21T13:43:06Z","published":"2023-11-21T13:43:06Z","title":"TouchSDF: A DeepSDF Approach for 3D Shape Reconstruction using\n Vision-Based Tactile Sensing","summary":" Humans rely on their visual and tactile senses to develop a comprehensive 3D\nunderstanding of their physical environment. Recently, there has been a growing\ninterest in exploring and manipulating objects using data-driven approaches\nthat utilise high-resolution vision-based tactile sensors. However, 3D shape\nreconstruction using tactile sensing has lagged behind visual shape\nreconstruction because of limitations in existing techniques, including the\ninability to generalise over unseen shapes, the absence of real-world testing,\nand limited expressive capacity imposed by discrete representations. 
To address\nthese challenges, we propose TouchSDF, a Deep Learning approach for tactile 3D\nshape reconstruction that leverages the rich information provided by a\nvision-based tactile sensor and the expressivity of the implicit neural\nrepresentation DeepSDF. Our technique consists of two components: (1) a\nConvolutional Neural Network that maps tactile images into local meshes\nrepresenting the surface at the touch location, and (2) an implicit neural\nfunction that predicts a signed distance function to extract the desired 3D\nshape. This combination allows TouchSDF to reconstruct smooth and continuous 3D\nshapes from tactile inputs in simulation and real-world settings, opening up\nresearch avenues for robust 3D-aware representations and improved multimodal\nperception in robotics. Code and supplementary material are available at:\nhttps://touchsdf.github.io/\n","authors":["Mauro Comi","Yijiong Lin","Alex Church","Alessio Tonioni","Laurence Aitchison","Nathan F. Lepora"],"pdf_url":"https://arxiv.org/pdf/2311.12602v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.12601v1","updated":"2023-11-21T13:42:40Z","published":"2023-11-21T13:42:40Z","title":"Deep learning-based detection of morphological features associated with\n hypoxia in H&E breast cancer whole slide images","summary":" Hypoxia occurs when tumour cells outgrow their blood supply, leading to\nregions of low oxygen levels within the tumour. Calculating hypoxia levels can\nbe an important step in understanding the biology of tumours, their clinical\nprogression and response to treatment. This study demonstrates a novel\napplication of deep learning to evaluate hypoxia in the context of breast\ncancer histomorphology. More precisely, we show that Weakly Supervised Deep\nLearning (WSDL) models can accurately detect hypoxia associated features in\nroutine Hematoxylin and Eosin (H&E) whole slide images (WSI). We trained and\nevaluated a deep Multiple Instance Learning model on tiles from WSI H&E tissue\nfrom breast cancer primary sites (n=240) obtaining on average an AUC of 0.87 on\na left-out test set. We also showed significant differences between features of\nhypoxic and normoxic tissue regions as distinguished by the WSDL models. Such\nDL hypoxia H&E WSI detection models could potentially be extended to other\ntumour types and easily integrated into the pathology workflow without\nrequiring additional costly assays.\n","authors":["Petru Manescu","Joseph Geradts","Delmiro Fernandez-Reyes"],"pdf_url":"https://arxiv.org/pdf/2311.12601v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2311.12589v1","updated":"2023-11-21T13:26:13Z","published":"2023-11-21T13:26:13Z","title":"Improving Source-Free Target Adaptation with Vision Transformers\n Leveraging Domain Representation Images","summary":" Unsupervised Domain Adaptation (UDA) methods facilitate knowledge transfer\nfrom a labeled source domain to an unlabeled target domain, navigating the\nobstacle of domain shift. While Convolutional Neural Networks (CNNs) are a\nstaple in UDA, the rise of Vision Transformers (ViTs) provides new avenues for\ndomain generalization. This paper presents an innovative method to bolster ViT\nperformance in source-free target adaptation, beginning with an evaluation of\nhow key, query, and value elements affect ViT outcomes. Experiments indicate\nthat altering the key component has negligible effects on Transformer\nperformance. 
Leveraging this discovery, we introduce Domain Representation\nImages (DRIs), feeding embeddings through the key element. DRIs act as\ndomain-specific markers, effortlessly merging with the training regimen. To\nassess our method, we perform target adaptation tests on the Cross Instance DRI\nsource-only (SO) control. We measure the efficacy of target adaptation with and\nwithout DRIs, against existing benchmarks like SHOT-B* and adaptations via\nCDTrans. Findings demonstrate that excluding DRIs offers limited gains over\nSHOT-B*, while their inclusion in the key segment boosts average precision\npromoting superior domain generalization. This research underscores the vital\nrole of DRIs in enhancing ViT efficiency in UDA scenarios, setting a precedent\nfor further domain adaptation explorations.\n","authors":["Gauransh Sawhney","Daksh Dave","Adeel Ahmed","Jiechao Gao","Khalid Saleem"],"pdf_url":"https://arxiv.org/pdf/2311.12589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12588v1","updated":"2023-11-21T13:21:22Z","published":"2023-11-21T13:21:22Z","title":"HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning\n for RGB-D 6DoF Object Pose Estimation","summary":" In this work, we present a novel dense-correspondence method for 6DoF object\npose estimation from a single RGB-D image. While many existing data-driven\nmethods achieve impressive performance, they tend to be time-consuming due to\ntheir reliance on rendering-based refinement approaches. To circumvent this\nlimitation, we present HiPose, which establishes 3D-3D correspondences in a\ncoarse-to-fine manner with a hierarchical binary surface encoding. Unlike\nprevious dense-correspondence methods, we estimate the correspondence surface\nby employing point-to-surface matching and iteratively constricting the surface\nuntil it becomes a correspondence point while gradually removing outliers.\nExtensive experiments on public benchmarks LM-O, YCB-V, and T-Less demonstrate\nthat our method surpasses all refinement-free methods and is even on par with\nexpensive refinement-based approaches. Crucially, our approach is\ncomputationally efficient and enables real-time critical applications with high\naccuracy requirements. Code and models will be released.\n","authors":["Yongliang Lin","Yongzhi Su","Praveen Nathan","Sandeep Inuganti","Yan Di","Martin Sundermeyer","Fabian Manhardt","Didier Stricke","Jason Rambach","Yu Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12582v1","updated":"2023-11-21T13:00:03Z","published":"2023-11-21T13:00:03Z","title":"Echocardiogram Foundation Model -- Application 1: Estimating Ejection\n Fraction","summary":" Cardiovascular diseases stand as the primary global cause of mortality. Among\nthe various imaging techniques available for visualising the heart and\nevaluating its function, echocardiograms emerge as the preferred choice due to\ntheir safety and low cost. Quantifying cardiac function based on\nechocardiograms is very laborious, time-consuming and subject to high\ninteroperator variability. In this work, we introduce EchoAI, an echocardiogram\nfoundation model, that is trained using self-supervised learning (SSL) on 1.5\nmillion echocardiograms. We evaluate our approach by fine-tuning EchoAI to\nestimate the ejection fraction achieving a mean absolute percentage error of\n9.40%. 
This level of accuracy aligns with the performance of expert\nsonographers.\n","authors":["Adil Dahlan","Cyril Zakka","Abhinav Kumar","Laura Tang","Rohan Shad","Robyn Fong","William Hiesinger"],"pdf_url":"https://arxiv.org/pdf/2311.12582v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12581v1","updated":"2023-11-21T12:56:42Z","published":"2023-11-21T12:56:42Z","title":"A Region of Interest Focused Triple UNet Architecture for Skin Lesion\n Segmentation","summary":" Skin lesion segmentation is of great significance for skin lesion analysis\nand subsequent treatment. It is still a challenging task due to the irregular\nand fuzzy lesion borders, and diversity of skin lesions. In this paper, we\npropose Triple-UNet to automatically segment skin lesions. It is an organic\ncombination of three UNet architectures with suitable modules. In order to\nconcatenate the first and second sub-networks more effectively, we design a\nregion of interest enhancement module (ROIE). The ROIE enhances the target\nobject region of the image by using the predicted score map of the first UNet.\nThe features learned by the first UNet and the enhanced image help the second\nUNet obtain a better score map. Finally, the results are fine-tuned by the\nthird UNet. We evaluate our algorithm on a publicly available dataset of skin\nlesion segmentation. Experiments show that Triple-UNet outperforms the\nstate-of-the-art on skin lesion segmentation.\n","authors":["Guoqing Liu","Yu Guo","Caiying Wu","Guoqing Chen","Barintag Saheya","Qiyu Jin"],"pdf_url":"https://arxiv.org/pdf/2311.12581v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2212.02081v2","updated":"2023-11-21T12:43:30Z","published":"2022-12-05T07:52:08Z","title":"YolOOD: Utilizing Object Detection Concepts for Multi-Label\n Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection has attracted a large amount of attention\nfrom the machine learning research community in recent years due to its\nimportance in deployed systems. Most of the previous studies focused on the\ndetection of OOD samples in the multi-class classification task. However, OOD\ndetection in the multi-label classification task, a more common real-world use\ncase, remains an underexplored domain. In this research, we propose YolOOD - a\nmethod that utilizes concepts from the object detection domain to perform OOD\ndetection in the multi-label classification task. Object detection models have\nan inherent ability to distinguish between objects of interest\n(in-distribution) and irrelevant objects (e.g., OOD objects) in images that\ncontain multiple objects belonging to different class categories. These\nabilities allow us to convert a regular object detection model into an image\nclassifier with inherent OOD detection capabilities with just minor changes. 
We\ncompare our approach to state-of-the-art OOD detection methods and demonstrate\nYolOOD's ability to outperform these methods on a comprehensive suite of\nin-distribution and OOD benchmark datasets.\n","authors":["Alon Zolfi","Guy Amit","Amit Baras","Satoru Koda","Ikuya Morikawa","Yuval Elovici","Asaf Shabtai"],"pdf_url":"https://arxiv.org/pdf/2212.02081v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.12562v1","updated":"2023-11-21T12:17:51Z","published":"2023-11-21T12:17:51Z","title":"Multi-Resolution Planar Region Extraction for Uneven Terrains","summary":" This paper studies the problem of extracting planar regions in uneven\nterrains from unordered point cloud measurements. Such a problem is critical in\nvarious robotic applications such as robotic perceptive locomotion. While\nexisting approaches have shown promising results in effectively extracting\nplanar regions from the environment, they often suffer from issues such as low\ncomputational efficiency or loss of resolution. To address these issues, we\npropose a multi-resolution planar region extraction strategy in this paper that\nbalances the accuracy in boundaries and computational efficiency. Our method\nbegins with a pointwise classification preprocessing module, which categorizes\nall sampled points according to their local geometric properties to facilitate\nmulti-resolution segmentation. Subsequently, we arrange the categorized points\nusing an octree, followed by an in-depth analysis of nodes to finish\nmulti-resolution plane segmentation. The efficiency and robustness of the\nproposed approach are verified via synthetic and real-world experiments,\ndemonstrating our method's ability to generalize effectively across various\nuneven terrains while maintaining real-time performance, achieving frame rates\nexceeding 35 FPS.\n","authors":["Yinghan Sun","Linfang Zheng","Hua Chen","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12561v1","updated":"2023-11-21T12:15:28Z","published":"2023-11-21T12:15:28Z","title":"Convolutional Neural Networks for Neuroimaging in Parkinson's Disease:\n Is Preprocessing Needed?","summary":" Spatial and intensity normalization are nowadays a prerequisite for\nneuroimaging analysis. Influenced by voxel-wise and other univariate\ncomparisons, where these corrections are key, they are commonly applied to any\ntype of analysis and imaging modalities. Nuclear imaging modalities such as\nPET-FDG or FP-CIT SPECT, a common modality used in Parkinson's Disease\ndiagnosis, are especially dependent on intensity normalization. However, these\nsteps are computationally expensive and furthermore, they may introduce\ndeformations in the images, altering the information contained in them.\nConvolutional Neural Networks (CNNs), for their part, introduce position\ninvariance to pattern recognition, and have been proven to classify objects\nregardless of their orientation, size, angle, etc. Therefore, a question\narises: how well can CNNs account for spatial and intensity differences when\nanalysing nuclear brain imaging? Are spatial and intensity normalization still\nneeded? To answer this question, we have trained four different CNN models\nbased on well-established architectures, using or not different spatial and\nintensity normalization preprocessing. 
The results show that a sufficiently\ncomplex model such as our three-dimensional version of the ALEXNET can\neffectively account for spatial differences, achieving a diagnosis accuracy of\n94.1% with an area under the ROC curve of 0.984. The visualization of the\ndifferences via saliency maps shows that these models are correctly finding\npatterns that match those found in the literature, without the need of applying\nany complex spatial normalization procedure. However, the intensity\nnormalization -- and its type -- is revealed as very influential in the results\nand accuracy of the trained model, and therefore must be well accounted.\n","authors":["Francisco J. Martinez-Murcia","Juan M. Górriz","Javier Ramírez","Andrés Ortiz"],"pdf_url":"https://arxiv.org/pdf/2311.12561v1.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.12560v1","updated":"2023-11-21T12:12:19Z","published":"2023-11-21T12:12:19Z","title":"Benchmarking bias: Expanding clinical AI model card to incorporate bias\n reporting of social and non-social factors","summary":" Clinical AI model reporting cards should be expanded to incorporate a broad\nbias reporting of both social and non-social factors. Non-social factors\nconsider the role of other factors, such as disease dependent, anatomic, or\ninstrument factors on AI model bias, which are essential to ensure safe\ndeployment.\n","authors":["Carolina A. M. Heming","Mohamed Abdalla","Monish Ahluwalia","Linglin Zhang","Hari Trivedi","MinJae Woo","Benjamin Fine","Judy Wawira Gichoya","Leo Anthony Celi","Laleh Seyyed-Kalantari"],"pdf_url":"https://arxiv.org/pdf/2311.12560v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12553v1","updated":"2023-11-21T12:05:56Z","published":"2023-11-21T12:05:56Z","title":"\"HoVer-UNet\": Accelerating HoVerNet with UNet-based multi-class nuclei\n segmentation via knowledge distillation","summary":" We present \"HoVer-UNet\", an approach to distill the knowledge of the\nmulti-branch HoVerNet framework for nuclei instance segmentation and\nclassification in histopathology. We propose a compact, streamlined single UNet\nnetwork with a Mix Vision Transformer backbone, and equip it with a custom loss\nfunction to optimally encode the distilled knowledge of HoVerNet, reducing\ncomputational requirements without compromising performances. We show that our\nmodel achieved results comparable to HoVerNet on the public PanNuke and Consep\ndatasets with a three-fold reduction in inference time. We make the code of our\nmodel publicly available at https://github.com/DIAGNijmegen/HoVer-UNet.\n","authors":["Cristian Tommasino","Cristiano Russo","Antonio Maria Rinaldi","Francesco Ciompi"],"pdf_url":"https://arxiv.org/pdf/2311.12553v1.pdf","comment":"4 pages, 2 figures, submitted to ISBI 2024"},{"id":"http://arxiv.org/abs/2304.01716v3","updated":"2023-11-21T12:05:50Z","published":"2023-04-04T11:25:44Z","title":"Decoupling Dynamic Monocular Videos for Dynamic View Synthesis","summary":" The challenge of dynamic view synthesis from dynamic monocular videos, i.e.,\nsynthesizing novel views for free viewpoints given a monocular video of a\ndynamic scene captured by a moving camera, mainly lies in accurately modeling\nthe dynamic objects of a scene using limited 2D frames, each with a varying\ntimestamp and viewpoint. 
Existing methods usually require pre-processed 2D\noptical flow and depth maps by off-the-shelf methods to supervise the network,\nmaking them suffer from the inaccuracy of the pre-processed supervision and the\nambiguity when lifting the 2D information to 3D. In this paper, we tackle this\nchallenge in an unsupervised fashion. Specifically, we decouple the motion of\nthe dynamic objects into object motion and camera motion, respectively\nregularized by proposed unsupervised surface consistency and patch-based\nmulti-view constraints. The former enforces the 3D geometric surfaces of moving\nobjects to be consistent over time, while the latter regularizes their\nappearances to be consistent across different viewpoints. Such a fine-grained\nmotion formulation can alleviate the learning difficulty for the network, thus\nenabling it to produce not only novel views with higher quality but also more\naccurate scene flows and depth than existing methods requiring extra\nsupervision.\n","authors":["Meng You","Junhui Hou"],"pdf_url":"https://arxiv.org/pdf/2304.01716v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12539v1","updated":"2023-11-21T11:33:15Z","published":"2023-11-21T11:33:15Z","title":"GMISeg: General Medical Image Segmentation without Re-Training","summary":" Although deep learning models have become the main method for medical image\nsegmentation, they often cannot be extended to unknown segmentation tasks\ninvolving new anatomical structures, image shapes, or labels. For new\nsegmentation tasks, researchers often have to retrain or fine-tune the model,\nwhich is time-consuming and poses a significant obstacle to clinical\nresearchers, who often lack the resources and professional knowledge to train\nneural networks. Therefore, we proposed a general method that can solve unknown\nmedical image segmentation tasks without requiring additional training. Given\nan example set of images and prompts for defining new segmentation tasks,\nGMISeg applies a novel low-rank fine-tuning strategy based on the proposed\napproach to the SAM (Segment Anything Model) image encoder, and works with the\nprompt encoder and mask decoder to fine-tune the labeled dataset without the\nneed for additional training. To achieve generalization of new tasks, we used\nmedical image datasets with different imaging modes for different parts. We\ntrained and generalized GMISeg on a different set of anatomical and imaging\nmodes using cardiac images on other site datasets. We have demonstrated that\nGMISeg outperforms the latest methods on unknown tasks and have conducted a\ncomprehensive analysis and summary of the important performance of the proposed\nmethod.\n","authors":["Jing Xu"],"pdf_url":"https://arxiv.org/pdf/2311.12539v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2304.06131 by other authors"},{"id":"http://arxiv.org/abs/2309.06255v2","updated":"2023-11-21T11:11:57Z","published":"2023-09-12T14:16:34Z","title":"Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation","summary":" One primary topic of multi-modal learning is to jointly incorporate\nheterogeneous information from different modalities. However, most models often\nsuffer from unsatisfactory multi-modal cooperation, which could not jointly\nutilize all modalities well. Some methods are proposed to identify and enhance\nthe worse learnt modality, but are often hard to provide the fine-grained\nobservation of multi-modal cooperation at sample-level with theoretical\nsupport. 
Hence, it is essential to reasonably observe and improve the\nfine-grained cooperation between modalities, especially when facing realistic\nscenarios where the modality discrepancy could vary across different samples.\nTo this end, we introduce a fine-grained modality valuation metric to evaluate\nthe contribution of each modality at sample-level. Via modality valuation, we\nregretfully observe that the multi-modal model tends to rely on one specific\nmodality, resulting in other modalities being low-contributing. We further\nanalyze this issue and improve cooperation between modalities by enhancing the\ndiscriminative ability of low-contributing modalities in a targeted manner.\nOverall, our methods reasonably observe the fine-grained uni-modal contribution\nat sample-level and achieve considerable improvement on different multi-modal\nmodels.\n","authors":["Yake Wei","Ruoxuan Feng","Zihe Wang","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2309.06255v2.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2302.10396v3","updated":"2023-11-21T10:56:24Z","published":"2023-02-21T02:07:13Z","title":"Assessing Domain Gap for Continual Domain Adaptation in Object Detection","summary":" To ensure reliable object detection in autonomous systems, the detector must\nbe able to adapt to changes in appearance caused by environmental factors such\nas time of day, weather, and seasons. Continually adapting the detector to\nincorporate these changes is a promising solution, but it can be\ncomputationally costly. Our proposed approach is to selectively adapt the\ndetector only when necessary, using new data that does not have the same\ndistribution as the current training data. To this end, we investigate three\npopular metrics for domain gap evaluation and find that there is a correlation\nbetween the domain gap and detection accuracy. Therefore, we apply the domain\ngap as a criterion to decide when to adapt the detector. Our experiments show\nthat our approach has the potential to improve the efficiency of the detector's\noperation in real-world scenarios, where environmental conditions change in a\ncyclical manner, without sacrificing the overall performance of the detector.\nOur code is publicly available at https://github.com/dadung/DGE-CDA.\n","authors":["Anh-Dzung Doan","Bach Long Nguyen","Surabhi Gupta","Ian Reid","Markus Wagner","Tat-Jun Chin"],"pdf_url":"https://arxiv.org/pdf/2302.10396v3.pdf","comment":"Accepted to CVIU"},{"id":"http://arxiv.org/abs/2310.00582v2","updated":"2023-11-21T10:32:13Z","published":"2023-10-01T05:53:15Z","title":"Pink: Unveiling the Power of Referential Comprehension for Multi-modal\n LLMs","summary":" Multi-modal Large Language Models (MLLMs) have shown remarkable capabilities\nin various multi-modal tasks. Nevertheless, their performance in fine-grained\nimage understanding tasks is still limited. To address this issue, this paper\nproposes a new framework to enhance the fine-grained image understanding\nabilities of MLLMs. Specifically, we present a new method for constructing the\ninstruction tuning dataset at a low cost by leveraging annotations in existing\ndatasets. A self-consistent bootstrapping method is also introduced to extend\nexisting dense object annotations into high-quality\nreferring-expression-bounding-box pairs. These methods enable the generation of\nhigh-quality instruction data which includes a wide range of fundamental\nabilities essential for fine-grained image perception. 
Moreover, we argue that\nthe visual encoder should be tuned during instruction tuning to mitigate the\ngap between full image perception and fine-grained image perception.\nExperimental results demonstrate the superior performance of our method. For\ninstance, our model exhibits a 5.2% accuracy improvement over Qwen-VL on GQA\nand surpasses the accuracy of Kosmos-2 by 24.7% on RefCOCO_val. We also attain\nthe top rank on the leaderboard of MMBench. This promising performance is\nachieved by training on only publicly available data, making it easily\nreproducible. The models, datasets, and codes are publicly available at\nhttps://github.com/SY-Xuan/Pink.\n","authors":["Shiyu Xuan","Qingpei Guo","Ming Yang","Shiliang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.00582v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.05258v3","updated":"2023-11-21T10:30:07Z","published":"2022-12-10T10:19:53Z","title":"Image augmentation with conformal mappings for a convolutional neural\n network","summary":" For augmentation of the square-shaped image data of a convolutional neural\nnetwork (CNN), we introduce a new method, in which the original images are\nmapped onto a disk with a conformal mapping, rotated around the center of this\ndisk and mapped under such a M\\\"obius transformation that preserves the disk,\nand then mapped back onto their original square shape. This process does not\nresult in the loss of information caused by removing areas from near the edges of\nthe original images, unlike the typical transformations used in the data\naugmentation for a CNN. We offer here the formulas of all the mappings needed,\ntogether with detailed instructions on how to write code for transforming the\nimages. The new method is also tested with simulated data and, according to the\nresults, using this method to augment the training data of 10 images into 40\nimages decreases the amount of error in the predictions by a CNN for a test\nset of 160 images in a statistically significant way (p-value=0.0360).\n","authors":["Oona Rainio","Mohamed M. S. Nasser","Matti Vuorinen","Riku Klén"],"pdf_url":"https://arxiv.org/pdf/2212.05258v3.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2305.05546v2","updated":"2023-11-21T10:02:03Z","published":"2023-05-09T15:32:50Z","title":"ColonMapper: topological mapping and localization for colonoscopy","summary":" We propose a topological mapping and localization system able to operate on\nreal human colonoscopies, despite significant shape and illumination changes.\nThe map is a graph where each node codes a colon location by a set of real\nimages, while edges represent traversability between nodes. For close-in-time\nimages, where scene changes are minor, place recognition can be successfully\nmanaged with recent transformer-based local feature matching algorithms.\nHowever, under long-term changes -- such as different colonoscopies of the same\npatient -- feature-based matching fails. To address this, we train on real\ncolonoscopies a deep global descriptor achieving high recall under significant\nchanges in the scene. The addition of a Bayesian filter boosts the accuracy of\nlong-term place recognition, enabling relocalization in a previously built map.\nOur experiments show that ColonMapper is able to autonomously build a map and\nlocalize against it in two important use cases: localization within the same\ncolonoscopy or within different colonoscopies of the same patient. Code will be\navailable upon acceptance.\n","authors":["Javier Morlana","Juan D. 
Tardós","J. M. M. Montiel"],"pdf_url":"https://arxiv.org/pdf/2305.05546v2.pdf","comment":"Under review. ICRA 2024"},{"id":"http://arxiv.org/abs/2311.12490v1","updated":"2023-11-21T10:01:08Z","published":"2023-11-21T10:01:08Z","title":"Hyb-NeRF: A Multiresolution Hybrid Encoding for Neural Radiance Fields","summary":" Recent advances in Neural radiance fields (NeRF) have enabled high-fidelity\nscene reconstruction for novel view synthesis. However, NeRF requires hundreds\nof network evaluations per pixel to approximate a volume rendering integral,\nmaking it slow to train. Caching NeRFs into explicit data structures can\neffectively enhance rendering speed but at the cost of higher memory usage. To\naddress these issues, we present Hyb-NeRF, a novel neural radiance field with a\nmulti-resolution hybrid encoding that achieves efficient neural modeling and\nfast rendering, which also allows for high-quality novel view synthesis. The\nkey idea of Hyb-NeRF is to represent the scene using different encoding\nstrategies from coarse-to-fine resolution levels. Hyb-NeRF exploits\nmemory-efficiency learnable positional features at coarse resolutions and the\nfast optimization speed and local details of hash-based feature grids at fine\nresolutions. In addition, to further boost performance, we embed cone\ntracing-based features in our learnable positional encoding that eliminates\nencoding ambiguity and reduces aliasing artifacts. Extensive experiments on\nboth synthetic and real-world datasets show that Hyb-NeRF achieves faster\nrendering speed with better rending quality and even a lower memory footprint\nin comparison to previous state-of-the-art methods.\n","authors":["Yifan Wang","Yi Gong","Yuan Zeng"],"pdf_url":"https://arxiv.org/pdf/2311.12490v1.pdf","comment":"WACV2024"},{"id":"http://arxiv.org/abs/2311.12486v1","updated":"2023-11-21T09:58:39Z","published":"2023-11-21T09:58:39Z","title":"HCA-Net: Hierarchical Context Attention Network for Intervertebral Disc\n Semantic Labeling","summary":" Accurate and automated segmentation of intervertebral discs (IVDs) in medical\nimages is crucial for assessing spine-related disorders, such as osteoporosis,\nvertebral fractures, or IVD herniation. We present HCA-Net, a novel contextual\nattention network architecture for semantic labeling of IVDs, with a special\nfocus on exploiting prior geometric information. Our approach excels at\nprocessing features across different scales and effectively consolidating them\nto capture the intricate spatial relationships within the spinal cord. To\nachieve this, HCA-Net models IVD labeling as a pose estimation problem, aiming\nto minimize the discrepancy between each predicted IVD location and its\ncorresponding actual joint location. In addition, we introduce a skeletal loss\nterm to reinforce the model's geometric dependence on the spine. This loss\nfunction is designed to constrain the model's predictions to a range that\nmatches the general structure of the human vertebral skeleton. As a result, the\nnetwork learns to reduce the occurrence of false predictions and adaptively\nimproves the accuracy of IVD location estimation. Through extensive\nexperimental evaluation on multi-center spine datasets, our approach\nconsistently outperforms previous state-of-the-art methods on both MRI T1w and\nT2w modalities. 
The codebase is accessible to the public on\n\\href{https://github.com/xmindflow/HCA-Net}{GitHub}.\n","authors":["Afshin Bozorgpour","Bobby Azad","Reza Azad","Yury Velichko","Ulas Bagci","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2311.12486v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04071v2","updated":"2023-11-21T09:58:34Z","published":"2023-11-07T15:35:56Z","title":"Energy-Calibrated VAE with Test Time Free Lunch","summary":" In this paper, we propose a novel generative model that utilizes a\nconditional Energy-Based Model (EBM) for enhancing Variational Autoencoder\n(VAE), termed Energy-Calibrated VAE (EC-VAE). Specifically, VAEs often suffer\nfrom blurry generated samples due to the lack of a tailored training on the\nsamples generated in the generative direction. On the other hand, EBMs can\ngenerate high-quality samples but require expensive Markov Chain Monte Carlo\n(MCMC) sampling. To address these issues, we introduce a conditional EBM for\ncalibrating the generative direction of VAE during training, without requiring\nit for the generation at test time. In particular, we train EC-VAE upon both\nthe input data and the calibrated samples with adaptive weight to enhance\nefficacy while avoiding MCMC sampling at test time. Furthermore, we extend the\ncalibration idea of EC-VAE to variational learning and normalizing flows, and\napply EC-VAE to an additional application of zero-shot image restoration via\nneural transport prior and range-null theory. We evaluate the proposed method\nwith two applications, including image generation and zero-shot image\nrestoration, and the experimental results show that our method achieves the\nstate-of-the-art performance over single-step non-adversarial generation.\n","authors":["Yihong Luo","Siya Qiu","Xingjian Tao","Yujun Cai","Jing Tang"],"pdf_url":"https://arxiv.org/pdf/2311.04071v2.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2311.12480v1","updated":"2023-11-21T09:44:33Z","published":"2023-11-21T09:44:33Z","title":"Speaker-Adapted End-to-End Visual Speech Recognition for Continuous\n Spanish","summary":" Different studies have shown the importance of visual cues throughout the\nspeech perception process. In fact, the development of audiovisual approaches\nhas led to advances in the field of speech technologies. However, although\nnoticeable results have recently been achieved, visual speech recognition\nremains an open research problem. It is a task in which, by dispensing with the\nauditory sense, challenges such as visual ambiguities and the complexity of\nmodeling silence must be faced. Nonetheless, some of these challenges can be\nalleviated when the problem is approached from a speaker-dependent perspective.\nThus, this paper studies, using the Spanish LIP-RTVE database, how the\nestimation of specialized end-to-end systems for a specific person could affect\nthe quality of speech recognition. First, different adaptation strategies based\non the fine-tuning technique were proposed. Then, a pre-trained CTC/Attention\narchitecture was used as a baseline throughout our experiments. Our findings\nshowed that a two-step fine-tuning process, where the VSR system is first\nadapted to the task domain, provided significant improvements when the speaker\nadaptation was addressed. Furthermore, results comparable to the current state\nof the art were reached even when only a limited amount of data was available.\n","authors":["David Gimeno-Gómez","Carlos-D. 
Martínez-Hinarejos"],"pdf_url":"https://arxiv.org/pdf/2311.12480v1.pdf","comment":"Accepted in Proceedings of IberSpeech 2022 (\n https://www.isca-speech.org/archive/iberspeech_2022/gimenogomez22_iberspeech.html\n )"},{"id":"http://arxiv.org/abs/2211.15513v2","updated":"2023-11-21T09:42:12Z","published":"2022-11-25T09:41:07Z","title":"Composite Score for Anomaly Detection in Imbalanced Real-World\n Industrial Dataset","summary":" In recent years, the industrial sector has evolved towards its fourth\nrevolution. The quality control domain is particularly interested in advanced\nmachine learning for computer vision anomaly detection. Nevertheless, several\nchallenges have to be faced, including imbalanced datasets, the image\ncomplexity, and the zero-false-negative (ZFN) constraint to guarantee the\nhigh-quality requirement. This paper illustrates a use case for an industrial\npartner, where Printed Circuit Board Assembly (PCBA) images are first\nreconstructed with a Vector Quantized Generative Adversarial Network (VQGAN)\ntrained on normal products. Then, several multi-level metrics are extracted on\na few normal and abnormal images, highlighting anomalies through reconstruction\ndifferences. Finally, a classifer is trained to build a composite anomaly score\nthanks to the metrics extracted. This three-step approach is performed on the\npublic MVTec-AD datasets and on the partner PCBA dataset, where it achieves a\nregular accuracy of 95.69% and 87.93% under the ZFN constraint.\n","authors":["Arnaud Bougaham","Mohammed El Adoui","Isabelle Linden","Benoît Frénay"],"pdf_url":"https://arxiv.org/pdf/2211.15513v2.pdf","comment":"This version of the article has been accepted for publication, after\n peer review and is subject to Springer Nature AM terms of use, but is not the\n Version of Record and does not reflect post-acceptance improvements, or any\n corrections. The Version of Record is available online at:\n https://doi.org/10.1007/s10994-023-06415-9"},{"id":"http://arxiv.org/abs/2306.00917v2","updated":"2023-11-21T09:38:21Z","published":"2023-06-01T17:19:43Z","title":"Vocabulary-free Image Classification","summary":" Recent advances in large vision-language models have revolutionized the image\nclassification paradigm. Despite showing impressive zero-shot capabilities, a\npre-defined set of categories, a.k.a. the vocabulary, is assumed at test time\nfor composing the textual prompts. However, such assumption can be impractical\nwhen the semantic context is unknown and evolving. We thus formalize a novel\ntask, termed as Vocabulary-free Image Classification (VIC), where we aim to\nassign to an input image a class that resides in an unconstrained\nlanguage-induced semantic space, without the prerequisite of a known\nvocabulary. VIC is a challenging task as the semantic space is extremely large,\ncontaining millions of concepts, with hard-to-discriminate fine-grained\ncategories. In this work, we first empirically verify that representing this\nsemantic space by means of an external vision-language database is the most\neffective way to obtain semantically relevant content for classifying the\nimage. We then propose Category Search from External Databases (CaSED), a\nmethod that exploits a pre-trained vision-language model and an external\nvision-language database to address VIC in a training-free manner. 
CaSED first\nextracts a set of candidate categories from captions retrieved from the\ndatabase based on their semantic similarity to the image, and then assigns to\nthe image the best matching candidate category according to the same\nvision-language model. Experiments on benchmark datasets validate that CaSED\noutperforms other complex vision-language frameworks, while being efficient\nwith much fewer parameters, paving the way for future research in this\ndirection.\n","authors":["Alessandro Conti","Enrico Fini","Massimiliano Mancini","Paolo Rota","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2306.00917v2.pdf","comment":"Accepted at NeurIPS2023, 19 pages, 8 figures, code is available at\n https://github.com/altndrr/vic"},{"id":"http://arxiv.org/abs/2311.12476v1","updated":"2023-11-21T09:37:49Z","published":"2023-11-21T09:37:49Z","title":"MaskFlow: Object-Aware Motion Estimation","summary":" We introduce a novel motion estimation method, MaskFlow, that is capable of\nestimating accurate motion fields, even in very challenging cases with small\nobjects, large displacements and drastic appearance changes. In addition to\nlower-level features, that are used in other Deep Neural Network (DNN)-based\nmotion estimation methods, MaskFlow draws from object-level features and\nsegmentations. These features and segmentations are used to approximate the\nobjects' translation motion field. We propose a novel and effective way of\nincorporating the incomplete translation motion field into a subsequent motion\nestimation network for refinement and completion. We also produced a new\nchallenging synthetic dataset with motion field ground truth, and also provide\nextra ground truth for the object-instance matchings and corresponding\nsegmentation masks. We demonstrate that MaskFlow outperforms state of the art\nmethods when evaluated on our new challenging dataset, whilst still producing\ncomparable results on the popular FlyingThings3D benchmark dataset.\n","authors":["Aria Ahmadi","David R. Walton","Tim Atherton","Cagatay Dikici"],"pdf_url":"https://arxiv.org/pdf/2311.12476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12468v1","updated":"2023-11-21T09:28:00Z","published":"2023-11-21T09:28:00Z","title":"Analysis of Visual Features for Continuous Lipreading in Spanish","summary":" During a conversation, our brain is responsible for combining information\nobtained from multiple senses in order to improve our ability to understand the\nmessage we are perceiving. Different studies have shown the importance of\npresenting visual information in these situations. Nevertheless, lipreading is\na complex task whose objective is to interpret speech when audio is not\navailable. By dispensing with a sense as crucial as hearing, it will be\nnecessary to be aware of the challenge that this lack presents. In this paper,\nwe propose an analysis of different speech visual features with the intention\nof identifying which of them is the best approach to capture the nature of lip\nmovements for natural Spanish and, in this way, dealing with the automatic\nvisual speech recognition task. In order to estimate our system, we present an\naudiovisual corpus compiled from a subset of the RTVE database, which has been\nused in the Albayz\\'in evaluations. We employ a traditional system based on\nHidden Markov Models with Gaussian Mixture Models. 
Results show that, although\nthe task is difficult, in restricted conditions we obtain recognition results\nwhich determine that using eigenlips in combination with deep features is the\nbest visual approach.\n","authors":["David Gimeno-Gómez","Carlos-D. Martínez-Hinarejos"],"pdf_url":"https://arxiv.org/pdf/2311.12468v1.pdf","comment":"Accepted in Proceedings of IberSpeech 2020 (\n https://www.isca-speech.org/archive/iberspeech_2021/gimenogomez21_iberspeech.html\n )"},{"id":"http://arxiv.org/abs/2311.12467v1","updated":"2023-11-21T09:27:30Z","published":"2023-11-21T09:27:30Z","title":"GLAD: Global-Local View Alignment and Background Debiasing for\n Unsupervised Video Domain Adaptation with Large Domain Gap","summary":" In this work, we tackle the challenging problem of unsupervised video domain\nadaptation (UVDA) for action recognition. We specifically focus on scenarios\nwith a substantial domain gap, in contrast to existing works primarily deal\nwith small domain gaps between labeled source domains and unlabeled target\ndomains. To establish a more realistic setting, we introduce a novel UVDA\nscenario, denoted as Kinetics->BABEL, with a more considerable domain gap in\nterms of both temporal dynamics and background shifts. To tackle the temporal\nshift, i.e., action duration difference between the source and target domains,\nwe propose a global-local view alignment approach. To mitigate the background\nshift, we propose to learn temporal order sensitive representations by temporal\norder learning and background invariant representations by background\naugmentation. We empirically validate that the proposed method shows\nsignificant improvement over the existing methods on the Kinetics->BABEL\ndataset with a large domain gap. The code is available at\nhttps://github.com/KHUVLL/GLAD.\n","authors":["Hyogun Lee","Kyungho Bae","Seongjong Ha","Yumin Ko","Gyeongmoon Park","Jinwoo Choi"],"pdf_url":"https://arxiv.org/pdf/2311.12467v1.pdf","comment":"This is an accepted WACV 2024 paper"},{"id":"http://arxiv.org/abs/2309.00018v2","updated":"2023-11-21T09:22:28Z","published":"2023-08-31T07:53:02Z","title":"Unsupervised discovery of Interpretable Visual Concepts","summary":" Providing interpretability of deep-learning models to non-experts, while\nfundamental for a responsible real-world usage, is challenging. Attribution\nmaps from xAI techniques, such as Integrated Gradients, are a typical example\nof a visualization technique containing a high level of information, but with\ndifficult interpretation. In this paper, we propose two methods, Maximum\nActivation Groups Extraction (MAGE) and Multiscale Interpretable Visualization\n(Ms-IV), to explain the model's decision, enhancing global interpretability.\nMAGE finds, for a given CNN, combinations of features which, globally, form a\nsemantic meaning, that we call concepts. We group these similar feature\npatterns by clustering in ``concepts'', that we visualize through Ms-IV. This\nlast method is inspired by Occlusion and Sensitivity analysis (incorporating\ncausality), and uses a novel metric, called Class-aware Order Correlation\n(CaOC), to globally evaluate the most important image regions according to the\nmodel's decision space. We compare our approach to xAI methods such as LIME and\nIntegrated Gradients. Experimental results evince the Ms-IV higher localization\nand faithfulness values. 
Finally, qualitative evaluation of combined MAGE and\nMs-IV demonstrates humans' ability to agree, based on the visualization, with\nthe decision of clusters' concepts; and to detect, among a given set of\nnetworks, the existence of bias.\n","authors":["Caroline Mazini Rodrigues","Nicolas Boutry","Laurent Najman"],"pdf_url":"https://arxiv.org/pdf/2309.00018v2.pdf","comment":null},{"id":"http://arxiv.org/abs/1810.12813v3","updated":"2023-11-21T09:19:21Z","published":"2018-10-30T15:33:47Z","title":"Contextual Hourglass Network for Semantic Segmentation of High\n Resolution Aerial Imagery","summary":" Semantic segmentation for aerial imagery is a challenging and important\nproblem in remotely sensed imagery analysis. In recent years, with the success\nof deep learning, various convolutional neural network (CNN) based models have\nbeen developed. However, due to the varying sizes of the objects and imbalanced\nclass labels, it can be challenging to obtain accurate pixel-wise semantic\nsegmentation results. To address those challenges, we develop a novel semantic\nsegmentation method and call it Contextual Hourglass Network. In our method, in\norder to improve the robustness of the prediction, we design a new contextual\nhourglass module which incorporates an attention mechanism on processed\nlow-resolution feature maps to exploit the contextual semantics. We further\nexploit the stacked encoder-decoder structure by connecting multiple contextual\nhourglass modules from end to end. This architecture can effectively extract\nrich multi-scale features and add more feedback loops for better learning\ncontextual semantics through intermediate supervision. To demonstrate the\nefficacy of our semantic segmentation method, we test it on the Potsdam and\nVaihingen datasets. Through comparisons to other baseline methods, our\nmethod yields the best overall performance.\n","authors":["Panfeng Li","Youzuo Lin","Emily Schultz-Fellenz"],"pdf_url":"https://arxiv.org/pdf/1810.12813v3.pdf","comment":"Accepted by ICIP 2019,\n https://cmsworkshops.com/ICIP2019/Papers/AcceptedPapers.asp"},{"id":"http://arxiv.org/abs/2301.05246v2","updated":"2023-11-21T09:18:52Z","published":"2023-01-12T19:00:27Z","title":"Online Class-Incremental Learning For Real-World Food Classification","summary":" Food image classification is essential for monitoring health and tracking\ndietary intake in image-based dietary assessment methods. However, conventional\nsystems often rely on static datasets with fixed classes and uniform\ndistribution. In contrast, real-world food consumption patterns, shaped by\ncultural, economic, and personal influences, involve dynamic and evolving data.\nThus, they require the classification system to cope with continuously evolving\ndata. Online Class Incremental Learning (OCIL) addresses the challenge of\nlearning continuously from a single-pass data stream while adapting to new\nknowledge and reducing catastrophic forgetting. Experience Replay (ER) based\nOCIL methods store a small portion of previous data and have shown encouraging\nperformance. However, most existing OCIL works assume that the distribution of\nencountered data is perfectly balanced, which rarely happens in real-world\nscenarios. In this work, we explore OCIL for real-world food image\nclassification by first introducing a probabilistic framework to simulate\nrealistic food consumption scenarios. 
Subsequently, we present an attachable\nDynamic Model Update (DMU) module designed for existing ER methods, which\nenables the selection of relevant images for model training, addressing\nchallenges arising from data repetition and imbalanced sample occurrences\ninherent in realistic food consumption patterns within the OCIL framework. Our\nperformance evaluation demonstrates significant enhancements compared to\nestablished ER methods, showing great potential for lifelong learning in\nreal-world food image classification scenarios. The code of our method is\npublicly accessible at\n\\href{https://gitlab.com/viper-purdue/OCIL-real-world-food-image-classification}{https://gitlab.com/viper-purdue/OCIL-real-world-food-image-classification}\n","authors":["Siddeshwar Raghavan","Jiangpeng He","Fengqing Zhu"],"pdf_url":"https://arxiv.org/pdf/2301.05246v2.pdf","comment":"Accepted at IEEE/CVF Winter Conference on Applications of Computer\n Vision (WACV 2024)"},{"id":"http://arxiv.org/abs/2311.11808v2","updated":"2023-11-21T09:18:14Z","published":"2023-11-20T14:41:44Z","title":"Robot Hand-Eye Calibration using Structure-from-Motion","summary":" In this paper we propose a new flexible method for hand-eye calibration. The\nvast majority of existing hand-eye calibration techniques requires a\ncalibration rig which is used in conjunction with camera pose estimation\nmethods. Instead, we combine structure-from-motion with known robot motions and\nwe show that the solution can be obtained in linear form. The latter solves for\nboth the hand-eye parameters and for the unknown scale factor inherent with\nstructure-from-motion methods. The algebraic analysis that is made possible\nwith such a linear formulation allows to investigate not only the well known\ncase of general screw motions but also such singular motions as pure\ntranslations, pure rotations, and planar motions. In essence, the robot-mounted\ncamera looks to an unknown rigid layout, tracks points over an image sequence\nand estimates the camera-to-robot relationship. Such a self calibration process\nis relevant for unmanned vehicles, robots working in remote places, and so\nforth. We conduct a large number of experiments which validate the quality of\nthe method by comparing it with existing ones.\n","authors":["Nicolas Andreff","Radu Horaud","Bernard Espiau"],"pdf_url":"https://arxiv.org/pdf/2311.11808v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12461v1","updated":"2023-11-21T09:15:24Z","published":"2023-11-21T09:15:24Z","title":"HiFi-Syn: Hierarchical Granularity Discrimination for High-Fidelity\n Synthesis of MR Images with Structure Preservation","summary":" Synthesizing medical images while preserving their structural information is\ncrucial in medical research. In such scenarios, the preservation of anatomical\ncontent becomes especially important. Although recent advances have been made\nby incorporating instance-level information to guide translation, these methods\noverlook the spatial coherence of structural-level representation and the\nanatomical invariance of content during translation. To address these issues,\nwe introduce hierarchical granularity discrimination, which exploits various\nlevels of semantic information present in medical images. 
Our strategy utilizes\nthree levels of discrimination granularity: pixel-level discrimination using a\nBrain Memory Bank, structure-level discrimination on each brain structure with\na re-weighting strategy to focus on hard samples, and global-level\ndiscrimination to ensure anatomical consistency during translation. The image\ntranslation performance of our strategy has been evaluated on three independent\ndatasets (UK Biobank, IXI, and BraTS 2018), and it has outperformed\nstate-of-the-art algorithms. In particular, our model excels not only in\nsynthesizing normal structures but also in handling abnormal (pathological)\nstructures, such as brain tumors, despite the variations in contrast observed\nacross different imaging modalities due to their pathological characteristics.\nThe diagnostic value of synthesized MR images containing brain tumors has been\nevaluated by radiologists. This indicates that our model may offer an\nalternative solution in scenarios where specific MR modalities of patients are\nunavailable. Extensive experiments further demonstrate the versatility of our\nmethod, providing unique insights into medical image translation.\n","authors":["Ziqi Yu","Botao Zhao","Shengjie Zhang","Xiang Chen","Jianfeng Feng","Tingying Peng","Xiao-Yong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12457v1","updated":"2023-11-21T09:12:21Z","published":"2023-11-21T09:12:21Z","title":"LIP-RTVE: An Audiovisual Database for Continuous Spanish in the Wild","summary":" Speech is considered a multi-modal process where hearing and vision are\ntwo fundamental pillars. In fact, several studies have demonstrated that the\nrobustness of Automatic Speech Recognition systems can be improved when audio\nand visual cues are combined to represent the nature of speech. In addition,\nVisual Speech Recognition, an open research problem whose purpose is to\ninterpret speech by reading the lips of the speaker, has been a focus of\ninterest in recent decades. Nevertheless, in order to estimate these systems\nin the current Deep Learning era, large-scale databases are required. On the\nother hand, while most of these databases are dedicated to English, other\nlanguages lack sufficient resources. Thus, this paper presents a\nsemi-automatically annotated audiovisual database to deal with unconstrained\nnatural Spanish, providing 13 hours of data extracted from Spanish television.\nFurthermore, baseline results for both speaker-dependent and\nspeaker-independent scenarios are reported using Hidden Markov Models, a\ntraditional paradigm that has been widely used in the field of Speech\nTechnologies.\n","authors":["David Gimeno-Gómez","Carlos-D. Martínez-Hinarejos"],"pdf_url":"https://arxiv.org/pdf/2311.12457v1.pdf","comment":"Accepted in Proceedings of LREC 2022 (\n https://aclanthology.org/2022.lrec-1.294 )"},{"id":"http://arxiv.org/abs/2305.18183v2","updated":"2023-11-21T09:11:38Z","published":"2023-05-29T16:20:23Z","title":"On Counterfactual Data Augmentation Under Confounding","summary":" Counterfactual data augmentation has recently emerged as a method to mitigate\nconfounding biases in the training data. These biases, such as spurious\ncorrelations, arise due to various observed and unobserved confounding\nvariables in the data generation process. In this paper, we formally analyze\nhow confounding biases impact downstream classifiers and present a causal\nviewpoint to the solutions based on counterfactual data augmentation. 
We\nexplore how removing confounding biases serves as a means to learn invariant\nfeatures, ultimately aiding in generalization beyond the observed data\ndistribution. Additionally, we present a straightforward yet powerful algorithm\nfor generating counterfactual images, which effectively mitigates the influence\nof confounding effects on downstream classifiers. Through experiments on MNIST\nvariants and the CelebA datasets, we demonstrate how our simple augmentation\nmethod helps existing state-of-the-art methods achieve good results.\n","authors":["Abbavaram Gowtham Reddy","Saketh Bachu","Saloni Dash","Charchit Sharma","Amit Sharma","Vineeth N Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2305.18183v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2109.13788v2","updated":"2023-11-21T09:08:50Z","published":"2021-09-28T15:07:43Z","title":"PFENet++: Boosting Few-shot Semantic Segmentation with the\n Noise-filtered Context-aware Prior Mask","summary":" In this work, we revisit the prior mask guidance proposed in ``Prior Guided\nFeature Enrichment Network for Few-Shot Segmentation''. The prior mask serves\nas an indicator that highlights the region of interests of unseen categories,\nand it is effective in achieving better performance on different frameworks of\nrecent studies. However, the current method directly takes the maximum\nelement-to-element correspondence between the query and support features to\nindicate the probability of belonging to the target class, thus the broader\ncontextual information is seldom exploited during the prior mask generation. To\naddress this issue, first, we propose the Context-aware Prior Mask (CAPM) that\nleverages additional nearby semantic cues for better locating the objects in\nquery images. Second, since the maximum correlation value is vulnerable to\nnoisy features, we take one step further by incorporating a lightweight Noise\nSuppression Module (NSM) to screen out the unnecessary responses, yielding\nhigh-quality masks for providing the prior knowledge. Both two contributions\nare experimentally shown to have substantial practical merit, and the new model\nnamed PFENet++ significantly outperforms the baseline PFENet as well as all\nother competitors on three challenging benchmarks PASCAL-5$^i$, COCO-20$^i$ and\nFSS-1000. The new state-of-the-art performance is achieved without compromising\nthe efficiency, manifesting the potential for being a new strong baseline in\nfew-shot semantic segmentation. Our code will be available at\nhttps://github.com/luoxiaoliu/PFENet2Plus.\n","authors":["Xiaoliu Luo","Zhuotao Tian","Taiping Zhang","Bei Yu","Yuan Yan Tang","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2109.13788v2.pdf","comment":"The first two authors contribute equally and are listed in\n alphabetical order"},{"id":"http://arxiv.org/abs/2310.08897v2","updated":"2023-11-21T08:51:03Z","published":"2023-10-13T06:58:52Z","title":"Self supervised convolutional kernel based handcrafted feature\n harmonization: Enhanced left ventricle hypertension disease phenotyping on\n echocardiography","summary":" Radiomics, a medical imaging technique, extracts quantitative handcrafted\nfeatures from images to predict diseases. Harmonization in those features\nensures consistent feature extraction across various imaging devices and\nprotocols. Methods for harmonization include standardized imaging protocols,\nstatistical adjustments, and evaluating feature robustness. 
Myocardial diseases\nsuch as Left Ventricular Hypertrophy (LVH) and Hypertensive Heart Disease (HHD)\nare diagnosed via echocardiography, but variable imaging settings pose\nchallenges. Harmonization techniques are crucial for applying handcrafted\nfeatures in disease diagnosis in such scenario. Self-supervised learning (SSL)\nenhances data understanding within limited datasets and adapts to diverse data\nsettings. ConvNeXt-V2 integrates convolutional layers into SSL, displaying\nsuperior performance in various tasks. This study focuses on convolutional\nfilters within SSL, using them as preprocessing to convert images into feature\nmaps for handcrafted feature harmonization. Our proposed method excelled in\nharmonization evaluation and exhibited superior LVH classification performance\ncompared to existing methods.\n","authors":["Jina Lee","Youngtaek Hong","Dawun Jeong","Yeonggul Jang","Sihyeon Jeong","Taekgeun Jung","Yeonyee E. Yoon","Inki Moon","Seung-Ah Lee","Hyuk-Jae Chang"],"pdf_url":"https://arxiv.org/pdf/2310.08897v2.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2310.19641v2","updated":"2023-11-21T08:49:00Z","published":"2023-10-30T15:29:48Z","title":"DistNet2D: Leveraging long-range temporal information for efficient\n segmentation and tracking","summary":" Extracting long tracks and lineages from videomicroscopy requires an\nextremely low error rate, which is challenging on complex datasets of dense or\ndeforming cells. Leveraging temporal context is key to overcoming this\nchallenge. We propose DistNet2D, a new deep neural network (DNN) architecture\nfor 2D cell segmentation and tracking that leverages both mid- and long-term\ntemporal information. DistNet2D considers seven frames at the input and uses a\npost-processing procedure that exploits information from the entire video to\ncorrect segmentation errors. DistNet2D outperforms two recent methods on two\nexperimental datasets, one containing densely packed bacterial cells and the\nother containing eukaryotic cells. It is integrated into an ImageJ-based\ngraphical user interface for 2D data visualization, curation, and training.\nFinally, we demonstrate the performance of DistNet2D on correlating the size\nand shape of cells with their transport properties over large statistics, for\nboth bacterial and eukaryotic cells.\n","authors":["Jean Ollion","Martin Maliet","Caroline Giuglaris","Elise Vacher","Maxime Deforet"],"pdf_url":"https://arxiv.org/pdf/2310.19641v2.pdf","comment":"40 pages, 5 figures, 18 supp figures"},{"id":"http://arxiv.org/abs/2311.12437v1","updated":"2023-11-21T08:47:08Z","published":"2023-11-21T08:47:08Z","title":"Learning Site-specific Styles for Multi-institutional Unsupervised\n Cross-modality Domain Adaptation","summary":" Unsupervised cross-modality domain adaptation is a challenging task in\nmedical image analysis, and it becomes more challenging when source and target\ndomain data are collected from multiple institutions. In this paper, we present\nour solution to tackle the multi-institutional unsupervised domain adaptation\nfor the crossMoDA 2023 challenge. First, we perform unpaired image translation\nto translate the source domain images to the target domain, where we design a\ndynamic network to generate synthetic target domain images with controllable,\nsite-specific styles. Afterwards, we train a segmentation model using the\nsynthetic images and further reduce the domain gap by self-training. 
Our\nsolution achieved the 1st place during both the validation and testing phases\nof the challenge.\n","authors":["Han Liu","Yubo Fan","Zhoubing Xu","Benoit M. Dawant","Ipek Oguz"],"pdf_url":"https://arxiv.org/pdf/2311.12437v1.pdf","comment":"crossMoDA 2023 challenge 1st place solution"},{"id":"http://arxiv.org/abs/2311.12430v1","updated":"2023-11-21T08:42:44Z","published":"2023-11-21T08:42:44Z","title":"AR Visualization System for Ship Detection and Recognition Based on AI","summary":" Augmented reality technology has been widely used in industrial design\ninteraction, exhibition guide, information retrieval and other fields. The\ncombination of artificial intelligence and augmented reality technology has\nalso become a future development trend. This project is an AR visualization\nsystem for ship detection and recognition based on AI, which mainly includes\nthree parts: artificial intelligence module, Unity development module and\nHololens2AR module. This project is based on R3Det algorithm to complete the\ndetection and recognition of ships in remote sensing images. The recognition\nrate of model detection trained on RTX 2080Ti can reach 96%. Then, the 3D model\nof the ship is obtained by ship categories and information and generated in the\nvirtual scene. At the same time, voice module and UI interaction module are\nadded. Finally, we completed the deployment of the project on Hololens2 through\nMRTK. The system realizes the fusion of computer vision and augmented reality\ntechnology, which maps the results of object detection to the AR field, and\nmakes a brave step toward the future technological trend and intelligent\napplication.\n","authors":["Ziqi Ye","Limin Huang","Yongji Wu","Min Hu"],"pdf_url":"https://arxiv.org/pdf/2311.12430v1.pdf","comment":"4 pages,7 figures,IEEE International Conference on Virtual Reality\n and Visualization"},{"id":"http://arxiv.org/abs/2311.07784v2","updated":"2023-11-21T08:23:31Z","published":"2023-11-13T22:21:27Z","title":"A Data-Free Approach to Mitigate Catastrophic Forgetting in Federated\n Class Incremental Learning for Vision Tasks","summary":" Deep learning models often suffer from forgetting previously learned\ninformation when trained on new data. This problem is exacerbated in federated\nlearning (FL), where the data is distributed and can change independently for\neach user. Many solutions are proposed to resolve this catastrophic forgetting\nin a centralized setting. However, they do not apply directly to FL because of\nits unique complexities, such as privacy concerns and resource limitations. To\novercome these challenges, this paper presents a framework for\n$\\textbf{federated class incremental learning}$ that utilizes a generative\nmodel to synthesize samples from past distributions. This data can be later\nexploited alongside the training data to mitigate catastrophic forgetting. To\npreserve privacy, the generative model is trained on the server using data-free\nmethods at the end of each task without requesting data from clients. Moreover,\nour solution does not demand the users to store old data or models, which gives\nthem the freedom to join/leave the training at any time. Additionally, we\nintroduce SuperImageNet, a new regrouping of the ImageNet dataset specifically\ntailored for federated continual learning. 
We demonstrate significant\nimprovements compared to existing baselines through extensive experiments on\nmultiple datasets.\n","authors":["Sara Babakniya","Zalan Fabian","Chaoyang He","Mahdi Soltanolkotabi","Salman Avestimehr"],"pdf_url":"https://arxiv.org/pdf/2311.07784v2.pdf","comment":"Accepted in NeurIPS 2023. arXiv admin note: text overlap with\n arXiv:2307.00497"},{"id":"http://arxiv.org/abs/2311.12421v1","updated":"2023-11-21T08:21:55Z","published":"2023-11-21T08:21:55Z","title":"Two Views Are Better than One: Monocular 3D Pose Estimation with\n Multiview Consistency","summary":" Deducing a 3D human pose from a single 2D image or 2D keypoints is inherently\nchallenging, given the fundamental ambiguity wherein multiple 3D poses can\ncorrespond to the same 2D representation. The acquisition of 3D data, while\ninvaluable for resolving pose ambiguity, is expensive and requires an intricate\nsetup, often restricting its applicability to controlled lab environments. We\nimprove performance of monocular human pose estimation models using multiview\ndata for fine-tuning. We propose a novel loss function, multiview consistency,\nto enable adding additional training data with only 2D supervision. This loss\nenforces that the inferred 3D pose from one view aligns with the inferred 3D\npose from another view under similarity transformations. Our consistency loss\nsubstantially improves performance for fine-tuning with no available 3D data.\nOur experiments demonstrate that two views offset by 90 degrees are enough to\nobtain good performance, with only marginal improvements by adding more views.\nThus, we enable the acquisition of domain-specific data by capturing activities\nwith off-the-shelf cameras, eliminating the need for elaborate calibration\nprocedures. This research introduces new possibilities for domain adaptation in\n3D pose estimation, providing a practical and cost-effective solution to\ncustomize models for specific applications. The used dataset, featuring\nadditional views, will be made publicly available.\n","authors":["Christian Keilstrup Ingwersen","Anders Bjorholm Dahl","Janus Nørtoft Jensen","Morten Rieger Hannemose"],"pdf_url":"https://arxiv.org/pdf/2311.12421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12419v1","updated":"2023-11-21T08:16:01Z","published":"2023-11-21T08:16:01Z","title":"Board-to-Board: Evaluating Moonboard Grade Prediction Generalization","summary":" Bouldering is a sport where athletes aim to climb up an obstacle using a set\nof defined holds called a route. Typically routes are assigned a grade to\ninform climbers of its difficulty and allow them to more easily track their\nprogression. However, the variation in individual climbers technical and\nphysical attributes and many nuances of an individual route make grading a\ndifficult and often biased task. In this work, we apply classical and\ndeep-learning modelling techniques to the 2016, 2017 and 2019 Moonboard\ndatasets, achieving state of the art grade prediction performance with 0.87 MAE\nand 1.12 RMSE. We achieve this performance on a feature-set that does not\nrequire decomposing routes into individual moves, which is a method common in\nliterature and introduces bias. We also demonstrate the generalization\ncapability of this model between editions and introduce a novel vision-based\nmethod of grade prediction. While the generalization performance of these\ntechniques is below human level performance currently, we propose these methods\nas a basis for future work. 
Such a tool could be implemented in pre-existing\nmobile applications and would allow climbers to better track their progress and\nassess new routes with reduced bias.\n","authors":["Daniel Petashvili","Matthew Rodda"],"pdf_url":"https://arxiv.org/pdf/2311.12419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12407v1","updated":"2023-11-21T07:54:40Z","published":"2023-11-21T07:54:40Z","title":"Learning Part Motion of Articulated Objects Using Spatially Continuous\n Neural Implicit Representations","summary":" Articulated objects (e.g., doors and drawers) exist everywhere in our daily lives.\nDifferent from rigid objects, articulated objects have higher degrees of\nfreedom and are rich in geometries, semantics, and part functions. Modeling\ndifferent kinds of parts and articulations with neural networks plays an\nessential role in articulated object understanding and manipulation, and will\nfurther benefit the 3D vision and robotics communities. To model articulated\nobjects, most previous works directly encode articulated objects into feature\nrepresentations, without specific designs for parts, articulations and part\nmotions. In this paper, we introduce a novel framework that explicitly\ndisentangles the part motion of articulated objects by predicting the\ntransformation matrix of points on the part surface, using spatially continuous\nneural implicit representations to model the part motion smoothly in the space.\nMore importantly, while many methods could only model a certain kind of joint\nmotion (such as the revolution in the clockwise order), our proposed framework\nis generic to different kinds of joint motions in that the transformation matrix\ncan model diverse kinds of joint motions in the space. Quantitative and\nqualitative results of experiments over diverse categories of articulated\nobjects demonstrate the effectiveness of our proposed framework.\n","authors":["Yushi Du","Ruihai Wu","Yan Shen","Hao Dong"],"pdf_url":"https://arxiv.org/pdf/2311.12407v1.pdf","comment":"10 pages, 6 figures. Accepted by BMVC 2023"},{"id":"http://arxiv.org/abs/2311.12401v1","updated":"2023-11-21T07:28:51Z","published":"2023-11-21T07:28:51Z","title":"CASR: Refining Action Segmentation via Marginalizing Frame-level Causal\n Relationships","summary":" Integrating deep learning and causal discovery has increased the\ninterpretability of Temporal Action Segmentation (TAS) tasks. However,\nframe-level causal relationships contain many complicated noises beyond the\nsegment level, making it infeasible to directly express macro action semantics.\nThus, we propose \\textit{\\textbf{Causal Abstraction Segmentation Refiner\n(CASR)}}, which can refine TAS results from various models by enhancing video\ncausality through marginalizing frame-level causal relationships. Specifically, we\ndefine the equivalent frame-level causal model and segment-level causal model,\nso that the causal adjacency matrix constructed from marginalized frame-level\ncausal relationships has the ability to represent the segment-level causal\nrelationships. CASR works by reducing the difference between the causal\nadjacency matrix we constructed and that derived from the pre-segmentation results of\nbackbone models. In addition, we propose a novel evaluation metric, Causal Edit\nDistance (CED), to evaluate the causal interpretability. Extensive experimental\nresults on mainstream datasets indicate that CASR significantly surpasses\nvarious existing methods in action segmentation performance, as well as in\ncausal explainability and generalization. 
Our code will be available soon.\n","authors":["Keqing Du","Xinyu Yang","Hang Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09112v2","updated":"2023-11-21T07:27:08Z","published":"2023-07-18T10:02:09Z","title":"NU-MCC: Multiview Compressive Coding with Neighborhood Decoder and\n Repulsive UDF","summary":" Remarkable progress has been made in 3D reconstruction from single-view RGB-D\ninputs. MCC is the current state-of-the-art method in this field, which\nachieves unprecedented success by combining vision Transformers with\nlarge-scale training. However, we identified two key limitations of MCC: 1) The\nTransformer decoder is inefficient in handling large number of query points; 2)\nThe 3D representation struggles to recover high-fidelity details. In this\npaper, we propose a new approach called NU-MCC that addresses these\nlimitations. NU-MCC includes two key innovations: a Neighborhood decoder and a\nRepulsive Unsigned Distance Function (Repulsive UDF). First, our Neighborhood\ndecoder introduces center points as an efficient proxy of input visual\nfeatures, allowing each query point to only attend to a small neighborhood.\nThis design not only results in much faster inference speed but also enables\nthe exploitation of finer-scale visual features for improved recovery of 3D\ntextures. Second, our Repulsive UDF is a novel alternative to the occupancy\nfield used in MCC, significantly improving the quality of 3D object\nreconstruction. Compared to standard UDFs that suffer from holes in results,\nour proposed Repulsive UDF can achieve more complete surface reconstruction.\nExperimental results demonstrate that NU-MCC is able to learn a strong 3D\nrepresentation, significantly advancing the state of the art in single-view 3D\nreconstruction. Particularly, it outperforms MCC by 9.7% in terms of the\nF1-score on the CO3D-v2 dataset with more than 5x faster running speed.\n","authors":["Stefan Lionar","Xiangyu Xu","Min Lin","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2307.09112v2.pdf","comment":"NeurIPS 2023. Project page: https://numcc.github.io/ Code:\n https://github.com/sail-sg/numcc"},{"id":"http://arxiv.org/abs/2311.11700v2","updated":"2023-11-21T07:26:16Z","published":"2023-11-20T12:08:23Z","title":"GS-SLAM: Dense Visual SLAM with 3D Gaussian Splatting","summary":" In this paper, we introduce $\\textbf{GS-SLAM}$ that first utilizes 3D\nGaussian representation in the Simultaneous Localization and Mapping (SLAM)\nsystem. It facilitates a better balance between efficiency and accuracy.\nCompared to recent SLAM methods employing neural implicit representations, our\nmethod utilizes a real-time differentiable splatting rendering pipeline that\noffers significant speedup to map optimization and RGB-D re-rendering.\nSpecifically, we propose an adaptive expansion strategy that adds new or\ndeletes noisy 3D Gaussian in order to efficiently reconstruct new observed\nscene geometry and improve the mapping of previously observed areas. This\nstrategy is essential to extend 3D Gaussian representation to reconstruct the\nwhole scene rather than synthesize a static object in existing methods.\nMoreover, in the pose tracking process, an effective coarse-to-fine technique\nis designed to select reliable 3D Gaussian representations to optimize camera\npose, resulting in runtime reduction and robust estimation. 
Our method achieves\ncompetitive performance compared with existing state-of-the-art real-time\nmethods on the Replica, TUM-RGBD datasets. The source code will be released\nsoon.\n","authors":["Chi Yan","Delin Qu","Dong Wang","Dan Xu","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2311.11700v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.14372v2","updated":"2023-11-21T07:22:04Z","published":"2022-06-29T02:36:53Z","title":"Formalizing and Evaluating Requirements of Perception Systems for\n Automated Vehicles using Spatio-Temporal Perception Logic","summary":" Automated vehicles (AV) heavily depend on robust perception systems. Current\nmethods for evaluating vision systems focus mainly on frame-by-frame\nperformance. Such evaluation methods appear to be inadequate in assessing the\nperformance of a perception subsystem when used within an AV. In this paper, we\npresent a logic -- referred to as Spatio-Temporal Perception Logic (STPL) --\nwhich utilizes both spatial and temporal modalities. STPL enables reasoning\nover perception data using spatial and temporal operators. One major advantage\nof STPL is that it facilitates basic sanity checks on the functional\nperformance of the perception system, even without ground-truth data in some\ncases. We identify a fragment of STPL which is efficiently monitorable offline\nin polynomial time. Finally, we present a range of specifications for AV\nperception systems to highlight the types of requirements that can be expressed\nand analyzed through offline monitoring with STPL.\n","authors":["Mohammad Hekmatnejad","Bardh Hoxha","Jyotirmoy V. Deshmukh","Yezhou Yang","Georgios Fainekos"],"pdf_url":"https://arxiv.org/pdf/2206.14372v2.pdf","comment":"32 pages, 11 figures, 6 tables, 4 algorithms, 2 appendixes"},{"id":"http://arxiv.org/abs/2304.07647v2","updated":"2023-11-21T07:21:50Z","published":"2023-04-15T22:24:05Z","title":"LASER: A Neuro-Symbolic Framework for Learning Spatial-Temporal Scene\n Graphs with Weak Supervision","summary":" We propose LASER, a neuro-symbolic approach to learn semantic video\nrepresentations that capture rich spatial and temporal properties in video data\nby leveraging high-level logic specifications. In particular, we formulate the\nproblem in terms of alignment between raw videos and spatio-temporal logic\nspecifications. The alignment algorithm leverages a differentiable symbolic\nreasoner and a combination of contrastive, temporal, and semantics losses. It\neffectively and efficiently trains low-level perception models to extract\nfine-grained video representation in the form of a spatio-temporal scene graph\nthat conforms to the desired high-level specification. In doing so, we explore\na novel methodology that weakly supervises the learning of video semantic\nrepresentations through logic specifications. We evaluate our method on two\ndatasets with rich spatial and temporal specifications:\n20BN-Something-Something and MUGEN. We demonstrate that our method learns\nbetter fine-grained video semantics than existing baselines.\n","authors":["Jiani Huang","Ziyang Li","Mayur Naik","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2304.07647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.09688v3","updated":"2023-11-21T07:20:50Z","published":"2022-08-20T14:15:35Z","title":"Learning Sub-Pixel Disparity Distribution for Light Field Depth\n Estimation","summary":" Light field (LF) depth estimation plays a crucial role in many LF-based\napplications. 
Existing LF depth estimation methods consider depth estimation as\na regression problem, where a pixel-wise L1 loss is employed to supervise the\ntraining process. However, the disparity map is only a sub-space projection\n(i.e., an expectation) of the disparity distribution, which is essential for\nmodels to learn. In this paper, we propose a simple yet effective method to\nlearn the sub-pixel disparity distribution by fully utilizing the power of deep\nnetworks, especially for LF of narrow baselines. We construct the cost volume\nat the sub-pixel level to produce a finer disparity distribution and design an\nuncertainty-aware focal loss to supervise the predicted disparity distribution\ntoward the ground truth. Extensive experimental results demonstrate the\neffectiveness of our method. Our method significantly outperforms recent\nstate-of-the-art LF depth algorithms on the HCI 4D LF Benchmark in terms of all\nfour accuracy metrics (i.e., BadPix 0.01, BadPix 0.03, BadPix 0.07, and MSE\n$\\times$100). The code and model of the proposed method are available at\n\\url{https://github.com/chaowentao/SubFocal}.\n","authors":["Wentao Chao","Xuechun Wang","Yingqian Wang","Guanghui Wang","Fuqing Duan"],"pdf_url":"https://arxiv.org/pdf/2208.09688v3.pdf","comment":"Accepted by IEEE Transactions on Computational Imaging"},{"id":"http://arxiv.org/abs/2311.12398v1","updated":"2023-11-21T07:19:47Z","published":"2023-11-21T07:19:47Z","title":"RFTrans: Leveraging Refractive Flow of Transparent Objects for Surface\n Normal Estimation and Manipulation","summary":" Transparent objects are widely used in our daily lives, making it important\nto teach robots to interact with them. However, this is not easy because the\nreflective and refractive effects can make RGB-D cameras fail to give accurate\ngeometry measurements. To solve this problem, this paper introduces RFTrans, an\nRGB-D-based method for surface normal estimation and manipulation of\ntransparent objects. By leveraging refractive flow as an intermediate\nrepresentation, RFTrans circumvents the drawbacks of directly predicting the\ngeometry (e.g. surface normal) from RGB images and helps bridge the sim-to-real\ngap. RFTrans integrates the RFNet, which predicts refractive flow, object mask,\nand boundaries, followed by the F2Net, which estimates surface normal from the\nrefractive flow. To make manipulation possible, a global optimization module\ntakes in the predictions, refines the raw depth, and constructs the point\ncloud with normals. An analytical grasp planning algorithm, ISF, then generates\nthe grasp poses. We build a synthetic dataset with physically\nplausible ray-tracing rendering techniques to train the networks. Results show\nthat RFTrans trained on the synthetic dataset can consistently outperform\nthe baseline ClearGrasp in both synthetic and real-world benchmarks by a large\nmargin. 
Finally, a real-world robot grasping task witnesses an 83% success\nrate, proving that refractive flow can help enable direct sim-to-real transfer.\nThe code, data, and supplementary materials are available at\nhttps://rftrans.robotflow.ai.\n","authors":["Tutian Tang","Jiyu Liu","Jieyi Zhang","Haoyuan Fu","Wenqiang Xu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2311.12398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09815v3","updated":"2023-11-21T07:19:03Z","published":"2023-07-19T08:03:53Z","title":"LDP: Language-driven Dual-Pixel Image Defocus Deblurring Network","summary":" Recovering sharp images from dual-pixel (DP) pairs with disparity-dependent\nblur is a challenging task.~Existing blur map-based deblurring methods have\ndemonstrated promising results. In this paper, we propose, to the best of our\nknowledge, the first framework that introduces the contrastive language-image\npre-training framework (CLIP) to accurately estimate the blur map from a DP\npair unsupervisedly. To achieve this, we first carefully design text prompts to\nenable CLIP to understand blur-related geometric prior knowledge from the DP\npair. Then, we propose a format to input a stereo DP pair to CLIP without any\nfine-tuning, despite the fact that CLIP is pre-trained on monocular images.\nGiven the estimated blur map, we introduce a blur-prior attention block, a\nblur-weighting loss, and a blur-aware loss to recover the all-in-focus image.\nOur method achieves state-of-the-art performance in extensive experiments (see\nFig.~\\ref{fig:teaser}).\n","authors":["Hao Yang","Liyuan Pan","Yan Yang","Richard Hartley","Miaomiao Liu"],"pdf_url":"https://arxiv.org/pdf/2307.09815v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12397v1","updated":"2023-11-21T07:12:40Z","published":"2023-11-21T07:12:40Z","title":"Rich and Poor Texture Contrast: A Simple yet Effective Approach for\n AI-generated Image Detection","summary":" Recent generative models show impressive performance in generating\nphotographic images. Humans can hardly distinguish such incredibly\nrealistic-looking AI-generated images from real ones. AI-generated images may\nlead to ubiquitous disinformation dissemination. Therefore, it is of utmost\nurgency to develop a detector to identify AI-generated images. Most existing\ndetectors suffer from sharp performance drops over unseen generative models. In\nthis paper, we propose a novel AI-generated image detector capable of\nidentifying fake images created by a wide range of generative models. Our\napproach leverages the inter-pixel correlation contrast between rich and poor\ntexture regions within an image. Pixels in rich texture regions exhibit more\nsignificant fluctuations than those in poor texture regions. This discrepancy\nreflects that the entropy of rich texture regions is larger than that of poor\nones. Consequently, synthesizing realistic rich texture regions proves to be\nmore challenging for existing generative models. Based on this principle, we\ndivide an image into multiple patches and reconstruct them into two images,\ncomprising rich-texture and poor-texture patches respectively. Subsequently, we\nextract the inter-pixel correlation discrepancy feature between rich and poor\ntexture regions. This feature serves as a universal fingerprint used for\nAI-generated image forensics across different generative models. 
In addition,\nwe build a comprehensive AI-generated image detection benchmark, which includes\n16 kinds of prevalent generative models, to evaluate the effectiveness of\nexisting baselines and our approach. Our benchmark provides a leaderboard for\nfollow-up studies. Extensive experimental results show that our approach\noutperforms state-of-the-art baselines by a significant margin. Our project:\nhttps://fdmas.github.io/AIGCDetect/\n","authors":["Nan Zhong","Yiran Xu","Zhenxing Qian","Xinpeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12397v1.pdf","comment":"Our project: https://fdmas.github.io/AIGCDetect/"},{"id":"http://arxiv.org/abs/2311.11439v2","updated":"2023-11-21T07:12:22Z","published":"2023-11-19T22:24:19Z","title":"Improved Defect Detection and Classification Method for Advanced IC\n Nodes by Using Slicing Aided Hyper Inference with Refinement Strategy","summary":" In semiconductor manufacturing, lithography has often been the manufacturing\nstep defining the smallest possible pattern dimensions. In recent years,\nprogress has been made towards high-NA (Numerical Aperture) EUVL\n(Extreme-Ultraviolet-Lithography) paradigm, which promises to advance pattern\nshrinking (2 nm node and beyond). However, a significant increase in stochastic\ndefects and the complexity of defect detection becomes more pronounced with\nhigh-NA. Present defect inspection techniques (both non-machine learning and\nmachine learning based), fail to achieve satisfactory performance at high-NA\ndimensions. In this work, we investigate the use of the Slicing Aided Hyper\nInference (SAHI) framework for improving upon current techniques. Using SAHI,\ninference is performed on size-increased slices of the SEM images. This leads\nto the object detector's receptive field being more effective in capturing\nsmall defect instances. First, the performance on previously investigated\nsemiconductor datasets is benchmarked across various configurations, and the\nSAHI approach is demonstrated to substantially enhance the detection of small\ndefects, by approx. 2x. Afterwards, we also demonstrated application of SAHI\nleads to flawless detection rates on a new test dataset, with scenarios not\nencountered during training, whereas previous trained models failed. Finally,\nwe formulate an extension of SAHI that does not significantly reduce\ntrue-positive predictions while eliminating false-positive predictions.\n","authors":["Vic De Ridder","Bappaditya Dey","Victor Blanco","Sandip Halder","Bartel Van Waeyenberge"],"pdf_url":"https://arxiv.org/pdf/2311.11439v2.pdf","comment":"12 pages, 9 figures, to be presented at International Conference on\n Machine Intelligence with Applications (ICMIA), and to be published in\n conference proceedings by AIP"},{"id":"http://arxiv.org/abs/2311.12391v1","updated":"2023-11-21T07:02:32Z","published":"2023-11-21T07:02:32Z","title":"From Wrong To Right: A Recursive Approach Towards Vision-Language\n Explanation","summary":" Addressing the challenge of adapting pre-trained vision-language models for\ngenerating insightful explanations for visual reasoning tasks with limited\nannotations, we present ReVisE: a $\\textbf{Re}$cursive $\\textbf{Vis}$ual\n$\\textbf{E}$xplanation algorithm. Our method iteratively computes visual\nfeatures (conditioned on the text input), an answer, and an explanation, to\nimprove the explanation quality step by step until the answer converges. 
We\nfind that this multi-step approach guides the model to correct its own answers\nand outperforms single-step explanation generation. Furthermore, explanations\ngenerated by ReVisE also serve as valuable annotations for few-shot\nself-training. Our approach outperforms previous methods while utilizing merely\n5% of the human-annotated explanations across 10 metrics, demonstrating up to a\n4.2 and 1.3 increase in BLEU-1 score on the VCR and VQA-X datasets,\nunderscoring the efficacy and data-efficiency of our method.\n","authors":["Jiaxin Ge","Sanjay Subramanian","Trevor Darrell","Boyi Li"],"pdf_url":"https://arxiv.org/pdf/2311.12391v1.pdf","comment":"EMNLP 2023 Main"},{"id":"http://arxiv.org/abs/2311.12386v1","updated":"2023-11-21T06:55:21Z","published":"2023-11-21T06:55:21Z","title":"Point, Segment and Count: A Generalized Framework for Object Counting","summary":" Class-agnostic object counting aims to count all objects in an image with\nrespect to example boxes or class names, \\emph{a.k.a} few-shot and zero-shot\ncounting. Current state-of-the-art methods highly rely on density maps to\npredict object counts, which lacks model interpretability. In this paper, we\npropose a generalized framework for both few-shot and zero-shot object counting\nbased on detection. Our framework combines the superior advantages of two\nfoundation models without compromising their zero-shot capability: (\\textbf{i})\nSAM to segment all possible objects as mask proposals, and (\\textbf{ii}) CLIP\nto classify proposals to obtain accurate object counts. However, this strategy\nmeets the obstacles of efficiency overhead and the small crowded objects that\ncannot be localized and distinguished. To address these issues, our framework,\ntermed PseCo, follows three steps: point, segment, and count. Specifically, we\nfirst propose a class-agnostic object localization to provide accurate but\nleast point prompts for SAM, which consequently not only reduces computation\ncosts but also avoids missing small objects. Furthermore, we propose a\ngeneralized object classification that leverages CLIP image/text embeddings as\nthe classifier, following a hierarchical knowledge distillation to obtain\ndiscriminative classifications among hierarchical mask proposals. Extensive\nexperimental results on FSC-147 dataset demonstrate that PseCo achieves\nstate-of-the-art performance in both few-shot/zero-shot object\ncounting/detection, with additional results on large-scale COCO and LVIS\ndatasets. The source code is available at\n\\url{https://github.com/Hzzone/PseCo}.\n","authors":["Huang Zhizhong","Dai Mingliang","Zhang Yi","Zhang Junping","Shan Hongming"],"pdf_url":"https://arxiv.org/pdf/2311.12386v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11013v2","updated":"2023-11-21T06:18:25Z","published":"2023-11-18T08:48:58Z","title":"Implicit Event-RGBD Neural SLAM","summary":" Implicit neural SLAM has achieved remarkable progress recently. Nevertheless,\nexisting methods face significant challenges in non-ideal scenarios, such as\nmotion blur or lighting variation, which often leads to issues like convergence\nfailures, localization drifts, and distorted mapping. To address these\nchallenges, we propose $\\textbf{EN-SLAM}$, the first event-RGBD implicit neural\nSLAM framework, which effectively leverages the high rate and high dynamic\nrange advantages of event data for tracking and mapping. 
Specifically, EN-SLAM\nproposes a differentiable CRF (Camera Response Function) rendering technique to\ngenerate distinct RGB and event camera data via a shared radiance field, which\nis optimized by learning a unified implicit representation with the captured\nevent and RGBD supervision. Moreover, based on the temporal difference property\nof events, we propose a temporal aggregating optimization strategy for the\nevent joint tracking and global bundle adjustment, capitalizing on the\nconsecutive difference constraints of events, significantly enhancing tracking\naccuracy and robustness. Finally, we construct the simulated dataset\n$\\textbf{DEV-Indoors}$ and real captured dataset $\\textbf{DEV-Reals}$\ncontaining 6 scenes, 17 sequences with practical motion blur and lighting\nchanges for evaluations. Experimental results show that our method outperforms\nthe SOTA methods in both tracking ATE and mapping ACC with a real-time $17$ FPS\nin various challenging environments. The code and dataset will be released\nsoon.\n","authors":["Delin Qu","Chi Yan","Dong Wang","Jie Yin","Dan Xu","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2311.11013v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12364v1","updated":"2023-11-21T05:55:39Z","published":"2023-11-21T05:55:39Z","title":"Semi-supervised Medical Image Segmentation via Query Distribution\n Consistency","summary":" Semi-supervised learning is increasingly popular in medical image\nsegmentation due to its ability to leverage large amounts of unlabeled data to\nextract additional information. However, most existing semi-supervised\nsegmentation methods focus only on extracting information from unlabeled data.\nIn this paper, we propose a novel Dual KMax UX-Net framework that leverages\nlabeled data to guide the extraction of information from unlabeled data. Our\napproach is based on a mutual learning strategy that incorporates two modules:\n3D UX-Net as our backbone meta-architecture and KMax decoder to enhance the\nsegmentation performance. Extensive experiments on the Atrial Segmentation\nChallenge dataset have shown that our method can significantly improve\nperformance by merging unlabeled data. Meanwhile, our framework outperforms\nstate-of-the-art semi-supervised learning methods on 10\\% and 20\\% labeled\nsettings. Code located at: https://github.com/Rows21/DK-UXNet.\n","authors":["Rong Wu","Dehua Li","Cong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12364v1.pdf","comment":"Submitted to IEEE ISBI 2024"},{"id":"http://arxiv.org/abs/2310.17949v2","updated":"2023-11-21T05:55:10Z","published":"2023-10-27T07:44:25Z","title":"Instance Segmentation under Occlusions via Location-aware Copy-Paste\n Data Augmentation","summary":" Occlusion is a long-standing problem in computer vision, particularly in\ninstance segmentation. ACM MMSports 2023 DeepSportRadar has introduced a\ndataset that focuses on segmenting human subjects within a basketball context\nand a specialized evaluation metric for occlusion scenarios. Given the modest\nsize of the dataset and the highly deformable nature of the objects to be\nsegmented, this challenge demands the application of robust data augmentation\ntechniques and wisely-chosen deep learning architectures. Our work (ranked 1st\nin the competition) first proposes a novel data augmentation technique, capable\nof generating more training samples with wider distribution. 
Then, we adopt a\nnew architecture - Hybrid Task Cascade (HTC) framework with CBNetV2 as backbone\nand MaskIoU head to improve segmentation performance. Furthermore, we employ a\nStochastic Weight Averaging (SWA) training strategy to improve the model's\ngeneralization. As a result, we achieve a remarkable occlusion score (OM) of\n0.533 on the challenge dataset, securing the top-1 position on the leaderboard.\nSource code is available at this\nhttps://github.com/nguyendinhson-kaist/MMSports23-Seg-AutoID.\n","authors":["Son Nguyen","Mikel Lainsa","Hung Dao","Daeyoung Kim","Giang Nguyen"],"pdf_url":"https://arxiv.org/pdf/2310.17949v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12359v1","updated":"2023-11-21T05:27:16Z","published":"2023-11-21T05:27:16Z","title":"Post-Training Quantization with Low-precision Minifloats and Integers on\n FPGAs","summary":" Post-Training Quantization (PTQ) is a powerful technique for model\ncompression, reducing the precision of neural networks without additional\ntraining overhead. Recent works have investigated adopting 8-bit floating-point\nquantization (FP8) in the context of PTQ for model inference. However, the\nexploration of floating-point formats smaller than 8 bits and their comparison\nwith integer quantization remains relatively limited. In this work, we present\nminifloats, which are reduced-precision floating-point formats capable of\nfurther reducing the memory footprint, latency, and energy cost of a model\nwhile approaching full-precision model accuracy. Our work presents a novel PTQ\ndesign-space exploration, comparing minifloat and integer quantization schemes\nacross a range of 3 to 8 bits for both weights and activations. We examine the\napplicability of various PTQ techniques to minifloats, including weight\nequalization, bias correction, SmoothQuant, gradient-based learned rounding,\nand the GPTQ method. Our experiments validate the effectiveness of\nlow-precision minifloats when compared to their integer counterparts across a\nspectrum of accuracy-precision trade-offs on a set of reference deep learning\nvision workloads. Finally, we evaluate our results against an FPGA-based\nhardware cost model, showing that integer quantization often remains the\nPareto-optimal option, given its relatively smaller hardware resource\nfootprint.\n","authors":["Shivam Aggarwal","Alessandro Pappalardo","Hans Jakob Damsgaard","Giuseppe Franco","Thomas B. Preußer","Michaela Blott","Tulika Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.12359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12345v1","updated":"2023-11-21T04:38:21Z","published":"2023-11-21T04:38:21Z","title":"Stable Diffusion For Aerial Object Detection","summary":" Aerial object detection is a challenging task, in which one major obstacle\nlies in the limitations of large-scale data collection and the long-tail\ndistribution of certain classes. Synthetic data offers a promising solution,\nespecially with recent advances in diffusion-based methods like stable\ndiffusion (SD). However, the direct application of diffusion methods to aerial\ndomains poses unique challenges: stable diffusion's optimization for rich\nground-level semantics doesn't align with the sparse nature of aerial objects,\nand the extraction of post-synthesis object coordinates remains problematic. To\naddress these challenges, we introduce a synthetic data augmentation framework\ntailored for aerial images. 
It encompasses sparse-to-dense region of interest\n(ROI) extraction to bridge the semantic gap, fine-tuning the diffusion model\nwith low-rank adaptation (LORA) to circumvent exhaustive retraining, and\nfinally, a Copy-Paste method to compose synthesized objects with backgrounds,\nproviding a nuanced approach to aerial object detection through synthetic data.\n","authors":["Yanan Jian","Fuxun Yu","Simranjit Singh","Dimitrios Stamoulis"],"pdf_url":"https://arxiv.org/pdf/2311.12345v1.pdf","comment":"Accepted at NeurIPS 2023 Synthetic Data Generation with Generative AI\n workshop"},{"id":"http://arxiv.org/abs/2311.12344v1","updated":"2023-11-21T04:32:28Z","published":"2023-11-21T04:32:28Z","title":"Modality Mixer Exploiting Complementary Information for Multi-modal\n Action Recognition","summary":" Due to the distinctive characteristics of sensors, each modality exhibits\nunique physical properties. For this reason, in the context of multi-modal\naction recognition, it is important to consider not only the overall action\ncontent but also the complementary nature of different modalities. In this\npaper, we propose a novel network, named Modality Mixer (M-Mixer) network,\nwhich effectively leverages and incorporates the complementary information\nacross modalities with the temporal context of actions for action recognition.\nA key component of our proposed M-Mixer is the Multi-modal Contextualization\nUnit (MCU), a simple yet effective recurrent unit. Our MCU is responsible for\ntemporally encoding a sequence of one modality (e.g., RGB) with action content\nfeatures of other modalities (e.g., depth and infrared modalities). This\nprocess encourages the M-Mixer network to exploit global action content and also to\nsupplement complementary information of other modalities. Furthermore, to\nextract appropriate complementary information regarding the given modality\nsettings, we introduce a new module, named Complementary Feature Extraction\nModule (CFEM). CFEM incorporates separate learnable query embeddings for each\nmodality, which guide CFEM to extract complementary information and global\naction content from the other modalities. As a result, our proposed method\noutperforms state-of-the-art methods on the NTU RGB+D 60, NTU RGB+D 120, and\nNW-UCLA datasets. Moreover, through comprehensive ablation studies, we further\nvalidate the effectiveness of our proposed method.\n","authors":["Sumin Lee","Sangmin Woo","Muhammad Adi Nugroho","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2311.12344v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2208.11314"},{"id":"http://arxiv.org/abs/2311.12342v1","updated":"2023-11-21T04:28:12Z","published":"2023-11-21T04:28:12Z","title":"LoCo: Locally Constrained Training-Free Layout-to-Image Synthesis","summary":" Recent text-to-image diffusion models have reached an unprecedented level in\ngenerating high-quality images. However, their exclusive reliance on textual\nprompts often falls short in accurately conveying fine-grained spatial\ncompositions. In this paper, we propose LoCo, a training-free approach for\nlayout-to-image synthesis that excels in producing high-quality images aligned\nwith both textual prompts and spatial layouts. Our method introduces a\nLocalized Attention Constraint to refine cross-attention for individual\nobjects, ensuring their precise placement in designated regions. 
We further\npropose a Padding Token Constraint to leverage the semantic information\nembedded in previously neglected padding tokens, thereby preventing the\nundesired fusion of synthesized objects. LoCo seamlessly integrates into\nexisting text-to-image and layout-to-image models, significantly amplifying\ntheir performance and effectively addressing semantic failures observed in\nprior methods. Through extensive experiments, we showcase the superiority of\nour approach, surpassing existing state-of-the-art training-free\nlayout-to-image methods both qualitatively and quantitatively across multiple\nbenchmarks.\n","authors":["Peiang Zhao","Han Li","Ruiyang Jin","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.12342v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2206.10996v4","updated":"2023-11-21T04:18:38Z","published":"2022-06-22T11:55:53Z","title":"ProtoCLIP: Prototypical Contrastive Language Image Pretraining","summary":" Contrastive Language Image Pretraining (CLIP) has received widespread\nattention, since its learned representations can be transferred well to various\ndownstream tasks. During the training process of the CLIP model, the InfoNCE\nobjective aligns positive image-text pairs and separates negative ones. We show\nan underlying representation grouping effect during this process: the InfoNCE\nobjective indirectly groups semantically similar representations together via\nrandomly emerged within-modal anchors. Based on this understanding, in this\npaper, Prototypical Contrastive Language Image Pretraining (ProtoCLIP) is\nintroduced to enhance such grouping by boosting its efficiency and increasing\nits robustness against the modality gap. Specifically, ProtoCLIP sets up\nprototype-level discrimination between image and text spaces, which efficiently\ntransfers higher-level structural knowledge. Further, Prototypical Back\nTranslation (PBT) is proposed to decouple representation grouping from\nrepresentation alignment, resulting in effective learning of meaningful\nrepresentations under a large modality gap. PBT also enables us to introduce\nadditional external teachers with richer prior language knowledge. ProtoCLIP is\ntrained with an online episodic training strategy, which allows it to be scaled\nup to unlimited amounts of data. We train our ProtoCLIP on Conceptual Captions\nand achieve a +5.81% ImageNet linear probing improvement and a +2.01%\nImageNet zero-shot classification improvement. On the larger YFCC-15M dataset,\nProtoCLIP matches the performance of CLIP with 33% of the training time. Codes are\navailable at https://github.com/megvii-research/protoclip.\n","authors":["Delong Chen","Zhao Wu","Fan Liu","Zaiquan Yang","Huaxi Huang","Ying Tan","Erjin Zhou"],"pdf_url":"https://arxiv.org/pdf/2206.10996v4.pdf","comment":"Accepted by IEEE Transactions on Neural Networks and Learning Systems\n (TNNLS)"},{"id":"http://arxiv.org/abs/2309.04342v3","updated":"2023-11-21T04:10:20Z","published":"2023-09-08T14:12:03Z","title":"Revealing the preference for correcting separated aberrations in joint\n optic-image design","summary":" The joint design of the optical system and the downstream algorithm is a\nchallenging and promising task. Due to the demand for balancing the global\noptimum of imaging systems and the computational cost of physical simulation,\nexisting methods cannot achieve efficient joint design of complex systems such\nas smartphones and drones. 
In this work, starting from the perspective of the\noptical design, we characterize the optics with separated aberrations.\nAdditionally, to bridge the hardware and software without gradients, an image\nsimulation system is presented to reproduce the genuine imaging procedure of\nlenses with large field-of-views. As for aberration correction, we propose a\nnetwork to perceive and correct the spatially varying aberrations and validate\nits superiority over state-of-the-art methods. Comprehensive experiments reveal\nthat the preference for correcting separated aberrations in joint design is as\nfollows: longitudinal chromatic aberration, lateral chromatic aberration,\nspherical aberration, field curvature, and coma, with astigmatism coming last.\nDrawing from the preference, a 10% reduction in the total track length of the\nconsumer-level mobile phone lens module is accomplished. Moreover, this\nprocedure spares more space for manufacturing deviations, realizing\nextreme-quality enhancement of computational photography. The optimization\nparadigm provides innovative insight into the practical joint design of\nsophisticated optical systems and post-processing algorithms.\n","authors":["Jingwen Zhou","Shiqi Chen","Zheng Ren","Wenguan Zhang","Jiapu Yan","Huajun Feng","Qi Li","Yueting Chen"],"pdf_url":"https://arxiv.org/pdf/2309.04342v3.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2311.11354v2","updated":"2023-11-21T03:45:32Z","published":"2023-11-19T15:35:15Z","title":"Scale-aware competition network for palmprint recognition","summary":" Palmprint biometrics garner heightened attention in palm-scanning payment and\nsocial security due to their distinctive attributes. However, prevailing\nmethodologies singularly prioritize texture orientation, neglecting the\nsignificant texture scale dimension. We design an innovative network for\nconcurrently extracting intra-scale and inter-scale features to redress this\nlimitation. This paper proposes a scale-aware competitive network (SAC-Net),\nwhich includes the Inner-Scale Competition Module (ISCM) and the Across-Scale\nCompetition Module (ASCM) to capture texture characteristics related to\norientation and scale. ISCM efficiently integrates learnable Gabor filters and\na self-attention mechanism to extract rich orientation data and discern\ntextures with long-range discriminative properties. Subsequently, ASCM\nleverages a competitive strategy across various scales to effectively\nencapsulate the competitive texture scale elements. By synergizing ISCM and\nASCM, our method adeptly characterizes palmprint features. Rigorous\nexperimentation across three benchmark datasets unequivocally demonstrates our\nproposed approach's exceptional recognition performance and resilience relative\nto state-of-the-art alternatives.\n","authors":["Chengrui Gao","Ziyuan Yang","Min Zhu","Andrew Beng Jin Teoh"],"pdf_url":"https://arxiv.org/pdf/2311.11354v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12327v1","updated":"2023-11-21T03:40:09Z","published":"2023-11-21T03:40:09Z","title":"ViLaM: A Vision-Language Model with Enhanced Visual Grounding and\n Generalization Capability","summary":" Vision-language models have revolutionized human-computer interaction and\nshown significant progress in multi-modal tasks. However, applying these models\nto complex visual tasks like medical image analysis remains challenging. 
In\nthis study, we propose ViLaM, a unified Vision-Language transformer model that\nintegrates instruction tuning predicated on a large language model. This\napproach enables us to optimally utilize the knowledge and reasoning capacities\nof large pre-trained language models for an array of tasks encompassing both\nlanguage and vision. We employ frozen pre-trained encoders to encode and align\nboth image and text features, enabling ViLaM to handle a variety of visual\ntasks following textual instructions. Besides, we've designed cycle training\nfor referring expressions to address the need for high-quality, paired\nreferring expression datasets for training large models in terms of both\nquantity and quality. We evaluated ViLaM's exceptional performance on public\ngeneral datasets and further confirmed its generalizability on medical\ndatasets. Importantly, we've observed the model's impressive zero-shot learning\nability, indicating the potential future application of ViLaM in the medical\nfield.\n","authors":["Xiaoyu Yang","Lijian Xu","Hongsheng Li","Shaoting Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12327v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12316v1","updated":"2023-11-21T03:25:51Z","published":"2023-11-21T03:25:51Z","title":"Overcoming Pathology Image Data Deficiency: Generating Images from\n Pathological Transformation Process","summary":" Histopathology serves as the gold standard for medical diagnosis but faces\napplication limitations due to the shortage of medical resources. Leveraging\ndeep learning, computer-aided diagnosis has the potential to alleviate the\npathologist scarcity and provide timely clinical analysis. However, developing\na reliable model generally necessitates substantial data for training, which is\nchallenging in pathological field. In response, we propose an adaptive\ndepth-controlled bidirectional diffusion (ADBD) network for image data\ngeneration. The domain migration approach can work with small trainset and\novercome the diffusion overfitting by source information guidance.\nSpecifically, we developed a hybrid attention strategy to blend global and\nlocal attention priorities, which guides the bidirectional diffusion and\nensures the migration success. In addition, we developed the adaptive\ndepth-controlled strategy to simulate physiological transformations, capable of\nyielding unlimited cross-domain intermediate images with corresponding soft\nlabels. ADBD is effective for overcoming pathological image data deficiency and\nsupportable for further pathology-related research.\n","authors":["Zeyu Liu","Yufang He","Yu Zhao","Yunlu Feng","Guanglei Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18639v2","updated":"2023-11-21T03:23:39Z","published":"2023-10-28T08:48:44Z","title":"Towards Plastic and Stable Exemplar-Free Incremental Learning: A\n Dual-Learner Framework with Cumulative Parameter Averaging","summary":" The dilemma between plasticity and stability presents a significant challenge\nin Incremental Learning (IL), especially in the exemplar-free scenario where\naccessing old-task samples is strictly prohibited during the learning of a new\ntask. A straightforward solution to this issue is learning and storing an\nindependent model for each task, known as Single Task Learning (STL). 
Despite\nthe linear growth in model storage with the number of tasks in STL, we\nempirically discover that averaging these model parameters can potentially\npreserve knowledge across all tasks. Inspired by this observation, we propose a\nDual-Learner framework with Cumulative Parameter Averaging (DLCPA). DLCPA\nemploys a dual-learner design: a plastic learner focused on acquiring new-task\nknowledge and a stable learner responsible for accumulating all learned\nknowledge. The knowledge from the plastic learner is transferred to the stable\nlearner via cumulative parameter averaging. Additionally, several task-specific\nclassifiers work in cooperation with the stable learner to yield the final\nprediction. Specifically, when learning a new task, these modules are updated\nin a cyclic manner: i) the plastic learner is initially optimized using a\nself-supervised loss besides the supervised loss to enhance the feature\nextraction robustness; ii) the stable learner is then updated with respect to\nthe plastic learner in a cumulative parameter averaging manner to maintain its\ntask-wise generalization; iii) the task-specific classifier is accordingly\noptimized to align with the stable learner. Experimental results on CIFAR-100\nand Tiny-ImageNet show that DLCPA outperforms several state-of-the-art\nexemplar-free baselines in both Task-IL and Class-IL settings.\n","authors":["Wenju Sun","Qingyong Li","Wen Wang","Yangli-ao Geng"],"pdf_url":"https://arxiv.org/pdf/2310.18639v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12311v1","updated":"2023-11-21T03:03:22Z","published":"2023-11-21T03:03:22Z","title":"ABFL: Angular Boundary Discontinuity Free Loss for Arbitrary Oriented\n Object Detection in Aerial Images","summary":" Arbitrary oriented object detection (AOOD) in aerial images is a widely\nconcerned and highly challenging task, and plays an important role in many\nscenarios. The core of AOOD involves the representation, encoding, and feature\naugmentation of oriented bounding-boxes (Bboxes). Existing methods lack\nintuitive modeling of angle difference measurement in oriented Bbox\nrepresentations. Oriented Bboxes under different representations exhibit\nrotational symmetry with varying periods due to angle periodicity. The angular\nboundary discontinuity (ABD) problem at periodic boundary positions is caused\nby rotational symmetry in measuring angular differences. In addition, existing\nmethods also use additional encoding-decoding structures for oriented Bboxes.\nIn this paper, we design an angular boundary free loss (ABFL) based on the von\nMises distribution. The ABFL aims to solve the ABD problem when detecting\noriented objects. Specifically, ABFL proposes to treat angles as circular data\nrather than linear data when measuring angle differences, aiming to introduce\nangle periodicity to alleviate the ABD problem and improve the accuracy of\nangle difference measurement. In addition, ABFL provides a simple and effective\nsolution for various periodic boundary discontinuities caused by rotational\nsymmetry in AOOD tasks, as it does not require additional encoding-decoding\nstructures for oriented Bboxes. 
Extensive experiments on the DOTA and HRSC2016\ndatasets show that the proposed ABFL loss outperforms some state-of-the-art\nmethods focused on addressing the ABD problem.\n","authors":["Zifei Zhao","Shengyang Li"],"pdf_url":"https://arxiv.org/pdf/2311.12311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12685v3","updated":"2023-11-21T02:57:09Z","published":"2023-06-22T06:12:23Z","title":"Rethinking the Backward Propagation for Adversarial Transferability","summary":" Transfer-based attacks generate adversarial examples on the surrogate model,\nwhich can mislead other black-box models without access, making it promising to\nattack real-world applications. Recently, several works have been proposed to\nboost adversarial transferability, in which the surrogate model is usually\noverlooked. In this work, we identify that non-linear layers (e.g., ReLU,\nmax-pooling, etc.) truncate the gradient during backward propagation, making\nthe gradient w.r.t. input image imprecise to the loss function. We hypothesize\nand empirically validate that such truncation undermines the transferability of\nadversarial examples. Based on these findings, we propose a novel method called\nBackward Propagation Attack (BPA) to increase the relevance between the\ngradient w.r.t. input image and loss function so as to generate adversarial\nexamples with higher transferability. Specifically, BPA adopts a non-monotonic\nfunction as the derivative of ReLU and incorporates softmax with temperature to\nsmooth the derivative of max-pooling, thereby mitigating the information loss\nduring the backward propagation of gradients. Empirical results on the ImageNet\ndataset demonstrate that not only does our method substantially boost the\nadversarial transferability, but it is also general to existing transfer-based\nattacks. Code is available at https://github.com/Trustworthy-AI-Group/RPA.\n","authors":["Xiaosen Wang","Kangheng Tong","Kun He"],"pdf_url":"https://arxiv.org/pdf/2306.12685v3.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.12300v1","updated":"2023-11-21T02:36:47Z","published":"2023-11-21T02:36:47Z","title":"Challenges in Video-Based Infant Action Recognition: A Critical\n Examination of the State of the Art","summary":" Automated human action recognition, a burgeoning field within computer\nvision, boasts diverse applications spanning surveillance, security,\nhuman-computer interaction, tele-health, and sports analysis. Precise action\nrecognition in infants serves a multitude of pivotal purposes, encompassing\nsafety monitoring, developmental milestone tracking, early intervention for\ndevelopmental delays, fostering parent-infant bonds, advancing computer-aided\ndiagnostics, and contributing to the scientific comprehension of child\ndevelopment. This paper delves into the intricacies of infant action\nrecognition, a domain that has remained relatively uncharted despite the\naccomplishments in adult action recognition. In this study, we introduce a\ngroundbreaking dataset called ``InfActPrimitive'', encompassing five\nsignificant infant milestone action categories, and we incorporate specialized\npreprocessing for infant data. We conducted an extensive comparative analysis\nemploying cutting-edge skeleton-based action recognition models using this\ndataset. Our findings reveal that, although the PoseC3D model achieves the\nhighest accuracy at approximately 71%, the remaining models struggle to\naccurately capture the dynamics of infant actions. 
This highlights a\nsubstantial knowledge gap between infant and adult action recognition domains\nand the urgent need for data-efficient pipeline models.\n","authors":["Elaheh Hatamimajoumerd","Pooria Daneshvar Kakhaki","Xiaofei Huang","Lingfei Luan","Somaieh Amraee","Sarah Ostadabbas"],"pdf_url":"https://arxiv.org/pdf/2311.12300v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10899v2","updated":"2023-11-21T02:16:27Z","published":"2023-11-17T22:44:05Z","title":"Extraction and Summarization of Explicit Video Content using Multi-Modal\n Deep Learning","summary":" With the increase in video-sharing platforms across the internet, it is\ndifficult for humans to moderate the data for explicit content. Hence, an\nautomated pipeline to scan through video data for explicit content has become\nthe need of the hour. We propose a novel pipeline that uses multi-modal deep\nlearning to first extract the explicit segments of input videos and then\nsummarize their content using text to determine its age appropriateness and age\nrating. We also evaluate our pipeline's effectiveness in the end using standard\nmetrics.\n","authors":["Shaunak Joshi","Raghav Gaggar"],"pdf_url":"https://arxiv.org/pdf/2311.10899v2.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.12291v1","updated":"2023-11-21T02:14:16Z","published":"2023-11-21T02:14:16Z","title":"Instance-aware 3D Semantic Segmentation powered by Shape Generators and\n Classifiers","summary":" Existing 3D semantic segmentation methods rely on point-wise or voxel-wise\nfeature descriptors to output segmentation predictions. However, these\ndescriptors are often supervised at point or voxel level, leading to\nsegmentation models that can behave poorly at instance-level. In this paper, we\nproposed a novel instance-aware approach for 3D semantic segmentation. Our\nmethod combines several geometry processing tasks supervised at instance-level\nto promote the consistency of the learned feature representation. Specifically,\nour methods use shape generators and shape classifiers to perform shape\nreconstruction and classification tasks for each shape instance. This enforces\nthe feature representation to faithfully encode both structural and local shape\ninformation, with an awareness of shape instances. In the experiments, our\nmethod significantly outperform existing approaches in 3D semantic segmentation\non several public benchmarks, such as Waymo Open Dataset, SemanticKITTI and\nScanNetV2.\n","authors":["Bo Sun","Qixing Huang","Xiangru Huang"],"pdf_url":"https://arxiv.org/pdf/2311.12291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12272v1","updated":"2023-11-21T01:25:24Z","published":"2023-11-21T01:25:24Z","title":"Procedural Generation of Grain Orientations using the Wave Function\n Collapse Algorithm","summary":" Statistics of grain sizes and orientations in metals correlate to the\nmaterial's mechanical properties. Reproducing representative volume elements\nfor further analysis of deformation and failure in metals, like 316L stainless\nsteel, is particularly important due to their wide use in manufacturing goods\ntoday. Two approaches, initially created for video games, were considered for\nthe procedural generation of representative grain microstructures. The first is\nthe Wave Function Collapse (WFC) algorithm, and the second is constraint\npropagation and probabilistic inference through Markov Junior, a free and\nopen-source software. 
This study aimed to investigate these two algorithms'\neffectiveness in using reference electron backscatter diffraction (EBSD) maps\nand recreating a statistically similar one that could be used in further\nresearch. It utilized two stainless steel EBSD maps as references to test both\nalgorithms. First, the WFC algorithm was too constricting and, thus, incapable\nof producing images that resembled EBSDs. The second, MarkovJunior, was much\nmore effective in creating a Voronoi tessellation that could be used to create\nan EBSD map in Python. When comparing the results between the reference and the\ngenerated EBSD, we discovered that the orientation and volume fractions were\nextremely similar. With the study, it was concluded that MarkovJunior is an\neffective machine learning tool that can reproduce representative grain\nmicrostructures.\n","authors":["G. Magny-Fokam","D. Madisetti","J. El-Awady"],"pdf_url":"https://arxiv.org/pdf/2311.12272v1.pdf","comment":"6 pages, 18 figures"},{"id":"http://arxiv.org/abs/2311.12268v1","updated":"2023-11-21T01:18:23Z","published":"2023-11-21T01:18:23Z","title":"Boosting Audio-visual Zero-shot Learning with Large Language Models","summary":" Audio-visual zero-shot learning aims to recognize unseen categories based on\npaired audio-visual sequences. Recent methods mainly focus on learning aligned\nand discriminative multi-modal features to boost generalization towards unseen\ncategories. However, these approaches ignore the obscure action concepts in\ncategory names and may inevitably introduce complex network structures with\ndifficult training objectives. In this paper, we propose a simple yet effective\nframework named Knowledge-aware Distribution Adaptation (KDA) to help the model\nbetter grasp the novel action contents with an external knowledge base.\nSpecifically, we first propose using large language models to generate rich\ndescriptions from category names, which leads to a better understanding of\nunseen categories. Additionally, we propose a distribution alignment loss as\nwell as a knowledge-aware adaptive margin loss to further improve the\ngeneralization ability towards unseen categories. Extensive experimental\nresults demonstrate that our proposed KDA can outperform state-of-the-art\nmethods on three popular audio-visual zero-shot learning datasets. Our code\nwill be available at \\url{https://github.com/chenhaoxing/KDA}.\n","authors":["Haoxing Chen","Yaohui Li","Yan Hong","Zizheng Huang","Zhuoer Xu","Zhangxuan Gu","Jun Lan","Huijia Zhu","Weiqiang Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12265v1","updated":"2023-11-21T01:01:08Z","published":"2023-11-21T01:01:08Z","title":"Virtual Home Staging: Inverse Rendering and Editing an Indoor Panorama\n under Natural Illumination","summary":" We propose a novel inverse rendering method that enables the transformation\nof existing indoor panoramas with new indoor furniture layouts under natural\nillumination. To achieve this, we captured indoor HDR panoramas along with\nreal-time outdoor hemispherical HDR photographs. Indoor and outdoor HDR images\nwere linearly calibrated with measured absolute luminance values for accurate\nscene relighting. Our method consists of three key components: (1) panoramic\nfurniture detection and removal, (2) automatic floor layout design, and (3)\nglobal rendering with scene geometry, new furniture objects, and a real-time\noutdoor photograph. 
We demonstrate the effectiveness of our workflow in\nrendering indoor scenes under different outdoor illumination conditions.\nAdditionally, we contribute a new calibrated HDR (Cali-HDR) dataset that\nconsists of 137 calibrated indoor panoramas and their associated outdoor\nphotographs. The source code and dataset are available:\nhttps://github.com/Gzhji/Cali-HDR-Dataset.\n","authors":["Guanzhou Ji","Azadeh O. Sawyer","Srinivasa G. Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2311.12265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08420v2","updated":"2023-11-21T23:40:09Z","published":"2023-10-12T15:39:54Z","title":"Visual Attention-Prompted Prediction and Learning","summary":" Explanation(attention)-guided learning is a method that enhances a model's\npredictive power by incorporating human understanding during the training\nphase. While attention-guided learning has shown promising results, it often\ninvolves time-consuming and computationally expensive model retraining. To\naddress this issue, we introduce the attention-prompted prediction technique,\nwhich enables direct prediction guided by the attention prompt without the need\nfor model retraining. However, this approach presents several challenges,\nincluding: 1) How to incorporate the visual attention prompt into the model's\ndecision-making process and leverage it for future predictions even in the\nabsence of a prompt? and 2) How to handle the incomplete information from the\nvisual attention prompt? To tackle these challenges, we propose a novel\nframework called Visual Attention-Prompted Prediction and Learning, which\nseamlessly integrates visual attention prompts into the model's decision-making\nprocess and adapts to images both with and without attention prompts for\nprediction. To address the incomplete information of the visual attention\nprompt, we introduce a perturbation-based attention map modification method.\nAdditionally, we propose an optimization-based mask aggregation method with a\nnew weight learning function for adaptive perturbed annotation aggregation in\nthe attention map modification process. Our overall framework is designed to\nlearn in an attention-prompt guided multi-task manner to enhance future\npredictions even for samples without attention prompts and trained in an\nalternating manner for better convergence. Extensive experiments conducted on\ntwo datasets demonstrate the effectiveness of our proposed framework in\nenhancing predictions for samples, both with and without provided prompts.\n","authors":["Yifei Zhang","Siyi Gu","Bo Pan","Guangji Bai","Xiaofeng Yang","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.08420v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13052v1","updated":"2023-11-21T23:25:04Z","published":"2023-11-21T23:25:04Z","title":"Novel OCT mosaicking pipeline with Feature- and Pixel-based registration","summary":" High-resolution Optical Coherence Tomography (OCT) images are crucial for\nophthalmology studies but are limited by their relatively narrow field of view\n(FoV). Image mosaicking is a technique for aligning multiple overlapping images\nto obtain a larger FoV. Current mosaicking pipelines often struggle with\nsubstantial noise and considerable displacement between the input sub-fields.\nIn this paper, we propose a versatile pipeline for stitching multi-view\nOCT/OCTA \\textit{en face} projection images. Our method combines the strengths\nof learning-based feature matching and robust pixel-based registration to align\nmultiple images effectively. 
Furthermore, we advance the application of a\ntrained foundational model, Segment Anything Model (SAM), to validate\nmosaicking results in an unsupervised manner. The efficacy of our pipeline is\nvalidated using an in-house dataset and a large public dataset, where our\nmethod shows superior performance in terms of both accuracy and computational\nefficiency. We also made our evaluation tool for image mosaicking and the\ncorresponding pipeline publicly available at\n\\url{https://github.com/MedICL-VU/OCT-mosaicking}.\n","authors":["Jiacheng Wang","Hao Li","Dewei Hu","Yuankai K. Tao","Ipek Oguz"],"pdf_url":"https://arxiv.org/pdf/2311.13052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16020v2","updated":"2023-11-21T23:23:08Z","published":"2023-09-27T20:54:56Z","title":"GeoCLIP: Clip-Inspired Alignment between Locations and Images for\n Effective Worldwide Geo-localization","summary":" Worldwide Geo-localization aims to pinpoint the precise location of images\ntaken anywhere on Earth. This task has considerable challenges due to immense\nvariation in geographic landscapes. The image-to-image retrieval-based\napproaches fail to solve this problem on a global scale as it is not feasible\nto construct a large gallery of images covering the entire world. Instead,\nexisting approaches divide the globe into discrete geographic cells,\ntransforming the problem into a classification task. However, their performance\nis limited by the predefined classes and often results in inaccurate\nlocalizations when an image's location significantly deviates from its class\ncenter. To overcome these limitations, we propose GeoCLIP, a novel\nCLIP-inspired Image-to-GPS retrieval approach that enforces alignment between\nthe image and its corresponding GPS locations. GeoCLIP's location encoder\nmodels the Earth as a continuous function by employing positional encoding\nthrough random Fourier features and constructing a hierarchical representation\nthat captures information at varying resolutions to yield a semantically rich\nhigh-dimensional feature suitable to use even beyond geo-localization. To the\nbest of our knowledge, this is the first work employing GPS encoding for\ngeo-localization. We demonstrate the efficacy of our method via extensive\nexperiments and ablations on benchmark datasets. We achieve competitive\nperformance with just 20% of training data, highlighting its effectiveness even\nin limited-data settings. Furthermore, we qualitatively demonstrate\ngeo-localization using a text query by leveraging CLIP backbone of our image\nencoder. The project webpage is available at:\nhttps://vicentevivan.github.io/GeoCLIP\n","authors":["Vicente Vivanco Cepeda","Gaurav Kumar Nayak","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2309.16020v2.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.13045v1","updated":"2023-11-21T23:14:42Z","published":"2023-11-21T23:14:42Z","title":"Camera-Independent Single Image Depth Estimation from Defocus Blur","summary":" Monocular depth estimation is an important step in many downstream tasks in\nmachine vision. We address the topic of estimating monocular depth from defocus\nblur which can yield more accurate results than the semantic based depth\nestimation methods. The existing monocular depth from defocus techniques are\nsensitive to the particular camera that the images are taken from. 
We show how\nseveral camera-related parameters affect the defocus blur using optical physics\nequations and how they make the defocus blur depend on these parameters. The\nsimple correction procedure we propose can alleviate this problem which does\nnot require any retraining of the original model. We created a synthetic\ndataset which can be used to test the camera independent performance of depth\nfrom defocus blur models. We evaluate our model on both synthetic and real\ndatasets (DDFF12 and NYU depth V2) obtained with different cameras and show\nthat our methods are significantly more robust to the changes of cameras. Code:\nhttps://github.com/sleekEagle/defocus_camind.git\n","authors":["Lahiru Wijayasingha","Homa Alemzadeh","John A. Stankovic"],"pdf_url":"https://arxiv.org/pdf/2311.13045v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13022v1","updated":"2023-11-21T22:05:00Z","published":"2023-11-21T22:05:00Z","title":"Unsupervised Multimodal Surface Registration with Geometric Deep\n Learning","summary":" This paper introduces GeoMorph, a novel geometric deep-learning framework\ndesigned for image registration of cortical surfaces. The registration process\nconsists of two main steps. First, independent feature extraction is performed\non each input surface using graph convolutions, generating low-dimensional\nfeature representations that capture important cortical surface\ncharacteristics. Subsequently, features are registered in a deep-discrete\nmanner to optimize the overlap of common structures across surfaces by learning\ndisplacements of a set of control points. To ensure smooth and biologically\nplausible deformations, we implement regularization through a deep conditional\nrandom field implemented with a recurrent neural network. Experimental results\ndemonstrate that GeoMorph surpasses existing deep-learning methods by achieving\nimproved alignment with smoother deformations. Furthermore, GeoMorph exhibits\ncompetitive performance compared to classical frameworks. Such versatility and\nrobustness suggest strong potential for various neuroscience applications.\n","authors":["Mohamed A. Suliman","Logan Z. J. Williams","Abdulah Fawaz","Emma C. Robinson"],"pdf_url":"https://arxiv.org/pdf/2311.13022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13018v1","updated":"2023-11-21T21:48:51Z","published":"2023-11-21T21:48:51Z","title":"Attention: Large Multimodal Model is Watching your Geo-privacy","summary":" Geographic privacy, a crucial aspect of personal security, often goes\nunnoticed in daily activities. This paper addresses the underestimation of this\nprivacy in the context of increasing online data sharing and the advancements\nin information gathering technologies. With the surge in the use of Large\nMultimodal Models, such as GPT-4, for Open Source Intelligence (OSINT), the\npotential risks associated with geographic privacy breaches have intensified.\nThis study highlights the criticality of these developments, focusing on their\nimplications for individual privacy. The primary objective is to demonstrate\nthe capabilities of advanced AI tools, specifically a GPT-4 based model named\n\"Dr. Watson,\" in identifying and potentially compromising geographic privacy\nthrough online shared content. We developed \"Dr. Watson\" to analyze and extract\ngeographic information from publicly available data sources. 
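The depth-from-defocus summary above notes that camera parameters enter the blur through optical physics. As a point of reference only (not the paper's correction procedure), the standard thin-lens circle-of-confusion diameter can be computed as follows; all numeric values are illustrative:

```python
def circle_of_confusion(focal_length_mm, f_number, focus_dist_mm, object_dist_mm):
    """Blur-circle diameter (mm) on the sensor for a thin lens.

    c = A * |s2 - s1| / s2 * f / (s1 - f), with aperture diameter A = f / N,
    focus distance s1 and object distance s2 (both measured from the lens).
    """
    aperture_mm = focal_length_mm / f_number
    s1, s2, f = focus_dist_mm, object_dist_mm, focal_length_mm
    return aperture_mm * abs(s2 - s1) / s2 * f / (s1 - f)

# A 50 mm f/1.8 lens focused at 2 m, object at 3 m (illustrative values).
print(circle_of_confusion(50.0, 1.8, 2000.0, 3000.0))  # blur diameter in mm
```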
The study involved\nfive experimental cases, each offering different perspectives on the tool's\napplication in extracting precise location data from partial images and social\nmedia content. The experiments revealed that \"Dr. Watson\" could successfully\nidentify specific geographic details, thereby exposing the vulnerabilities in\ncurrent geo-privacy measures. These findings underscore the ease with which\ngeographic information can be unintentionally disclosed. The paper concludes\nwith a discussion on the broader implications of these findings for individuals\nand the community at large. It emphasizes the urgency for enhanced awareness\nand protective measures against geo-privacy leakage in the era of advanced AI\nand widespread social media usage.\n","authors":["Yifan Yang","Yixian Zhang","Daoyang Li","Shuju Sun","Junhong Duan","Junzhou He","Qingyang Wu","Hao Liu"],"pdf_url":"https://arxiv.org/pdf/2311.13018v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.13016v1","updated":"2023-11-21T21:44:45Z","published":"2023-11-21T21:44:45Z","title":"Image-Based Soil Organic Carbon Remote Sensing from Satellite Images\n with Fourier Neural Operator and Structural Similarity","summary":" Soil organic carbon (SOC) sequestration is the transfer and storage of\natmospheric carbon dioxide in soils, which plays an important role in climate\nchange mitigation. SOC concentration can be improved by proper land use, thus\nit is beneficial if SOC can be estimated at a regional or global scale. As\nmultispectral satellite data can provide SOC-related information such as\nvegetation and soil properties at a global scale, estimation of SOC through\nsatellite data has been explored as an alternative to manual soil sampling.\nAlthough existing studies show promising results, they are mainly based on\npixel-based approaches with traditional machine learning methods, and\nconvolutional neural networks (CNNs) are uncommon. To study the use of CNNs on\nSOC remote sensing, here we propose the FNO-DenseNet based on the Fourier\nneural operator (FNO). By combining the advantages of the FNO and DenseNet, the\nFNO-DenseNet outperformed the FNO in our experiments with hundreds of times\nfewer parameters. The FNO-DenseNet also outperformed a pixel-based random\nforest by 18% in the mean absolute percentage error.\n","authors":["Ken C. L. Wong","Levente Klein","Ademir Ferreira da Silva","Hongzhi Wang","Jitendra Singh","Tanveer Syeda-Mahmood"],"pdf_url":"https://arxiv.org/pdf/2311.13016v1.pdf","comment":"This paper was accepted by the 2023 IEEE International Geoscience and\n Remote Sensing Symposium (IGARSS 2023)"},{"id":"http://arxiv.org/abs/2311.13009v1","updated":"2023-11-21T21:36:09Z","published":"2023-11-21T21:36:09Z","title":"3D Compression Using Neural Fields","summary":" Neural Fields (NFs) have gained momentum as a tool for compressing various\ndata modalities - e.g. images and videos. This work leverages previous advances\nand proposes a novel NF-based compression algorithm for 3D data. We derive two\nversions of our approach - one tailored to watertight shapes based on Signed\nDistance Fields (SDFs) and, more generally, one for arbitrary non-watertight\nshapes using Unsigned Distance Fields (UDFs). We demonstrate that our method\nexcels at geometry compression on 3D point clouds as well as meshes. Moreover,\nwe show that, due to the NF formulation, it is straightforward to extend our\ncompression algorithm to compress both geometry and attribute (e.g. 
color) of\n3D data.\n","authors":["Janis Postels","Yannick Strümpler","Klara Reichard","Luc Van Gool","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2311.13009v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.01121v2","updated":"2023-11-21T21:04:24Z","published":"2023-07-03T15:51:39Z","title":"Artifacts Mapping: Multi-Modal Semantic Mapping for Object Detection and\n 3D Localization","summary":" Geometric navigation is nowadays a well-established field of robotics and the\nresearch focus is shifting towards higher-level scene understanding, such as\nSemantic Mapping. When a robot needs to interact with its environment, it must\nbe able to comprehend the contextual information of its surroundings. This work\nfocuses on classifying and localising objects within a map, which is under\nconstruction (SLAM) or already built. To further explore this direction, we\npropose a framework that can autonomously detect and localize predefined\nobjects in a known environment using a multi-modal sensor fusion approach\n(combining RGB and depth data from an RGB-D camera and a lidar). The framework\nconsists of three key elements: understanding the environment through RGB data,\nestimating depth through multi-modal sensor fusion, and managing artifacts\n(i.e., filtering and stabilizing measurements). The experiments show that the\nproposed framework can accurately detect 98% of the objects in the real sample\nenvironment, without post-processing, while 85% and 80% of the objects were\nmapped using the single RGBD camera or RGB + lidar setup respectively. The\ncomparison with single-sensor (camera or lidar) experiments is performed to\nshow that sensor fusion allows the robot to accurately detect near and far\nobstacles, which would have been noisy or imprecise in a purely visual or\nlaser-based approach.\n","authors":["Federico Rollo","Gennaro Raiola","Andrea Zunino","Nikolaos Tsagarakis","Arash Ajoudani"],"pdf_url":"https://arxiv.org/pdf/2307.01121v2.pdf","comment":"Accepted to the 11th European Conference on Mobile Robots (ECMR) 2023"},{"id":"http://arxiv.org/abs/2311.12993v1","updated":"2023-11-21T21:00:42Z","published":"2023-11-21T21:00:42Z","title":"AI for Agriculture: the Comparison of Semantic Segmentation Methods for\n Crop Mapping with Sentinel-2 Imagery","summary":" Crop mapping is one of the most common tasks in artificial intelligence for\nagriculture due to higher food demands from a growing population and increased\nawareness of climate change. In case of vineyards, the texture is very\nimportant for crop segmentation: with higher resolution satellite imagery the\ntexture is easily detected by majority of state-of-the-art algorithms. However,\nthis task becomes increasingly more difficult as the resolution of satellite\nimagery decreases and the information about the texture becomes unavailable. In\nthis paper we aim to explore the main machine learning methods that can be used\nwith freely available satellite imagery and discuss how and when they can be\napplied for vineyard segmentation problem. 
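The FNO-DenseNet summary above builds on the Fourier neural operator. A minimal PyTorch sketch of one 2D spectral-convolution layer, the core FNO building block (mode counts, initialization, and the single-weight simplification are assumptions, not the paper's architecture):

```python
import torch
import torch.nn as nn

class SpectralConv2d(nn.Module):
    """One FNO layer: FFT -> keep a few low-frequency modes -> learned complex
    channel mixing -> inverse FFT back to the spatial domain."""

    def __init__(self, in_ch, out_ch, modes1, modes2):
        super().__init__()
        self.modes1, self.modes2 = modes1, modes2
        scale = 1.0 / (in_ch * out_ch)
        self.weight = nn.Parameter(
            scale * torch.randn(in_ch, out_ch, modes1, modes2, dtype=torch.cfloat))

    def forward(self, x):                              # x: (batch, in_ch, H, W)
        x_ft = torch.fft.rfft2(x)                      # complex, (batch, in_ch, H, W//2+1)
        out_ft = torch.zeros(x.size(0), self.weight.size(1), x_ft.size(-2),
                             x_ft.size(-1), dtype=torch.cfloat, device=x.device)
        out_ft[:, :, :self.modes1, :self.modes2] = torch.einsum(
            "bixy,ioxy->boxy",
            x_ft[:, :, :self.modes1, :self.modes2], self.weight)
        return torch.fft.irfft2(out_ft, s=x.shape[-2:])

# Example: a 4-band satellite patch (sizes are illustrative).
layer = SpectralConv2d(in_ch=4, out_ch=8, modes1=12, modes2=12)
print(layer(torch.randn(2, 4, 64, 64)).shape)          # torch.Size([2, 8, 64, 64])
```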
We assess the effectiveness of\nvarious widely-used machine learning techniques and offer guidance on selecting\nthe most suitable model for specific scenarios.\n","authors":["Irina Korotkova","Natalia Efremova"],"pdf_url":"https://arxiv.org/pdf/2311.12993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12992v1","updated":"2023-11-21T20:59:27Z","published":"2023-11-21T20:59:27Z","title":"FollowMe: a Robust Person Following Framework Based on Re-Identification\n and Gestures","summary":" Human-robot interaction (HRI) has become a crucial enabler in houses and\nindustries for facilitating operational flexibility. When it comes to mobile\ncollaborative robots, this flexibility can be further increased due to the\nautonomous mobility and navigation capacity of the robotic agents, expanding\ntheir workspace and consequently, the personalizable assistance they can\nprovide to the human operators. This however requires that the robot is capable\nof detecting and identifying the human counterpart in all stages of the\ncollaborative task, and in particular while following a human in crowded\nworkplaces. To respond to this need, we developed a unified perception and\nnavigation framework, which enables the robot to identify and follow a target\nperson using a combination of visual Re-Identification (Re-ID), hand gestures\ndetection, and collision-free navigation. The Re-ID module can autonomously\nlearn the features of a target person and use the acquired knowledge to\nvisually re-identify the target. The navigation stack is used to follow the\ntarget avoiding obstacles and other individuals in the environment. Experiments\nare conducted with few subjects in a laboratory setting where some unknown\ndynamic obstacles are introduced.\n","authors":["Federico Rollo","Andrea Zunino","Gennaro Raiola","Fabio Amadio","Arash Ajoudani","Nikolaos Tsagarakis"],"pdf_url":"https://arxiv.org/pdf/2311.12992v1.pdf","comment":"published in \"2023 IEEE International Conference on Advanced Robotics\n and Its Social Impacts (ARSO)\""},{"id":"http://arxiv.org/abs/2311.11819v2","updated":"2023-11-21T20:45:51Z","published":"2023-11-20T14:55:40Z","title":"Generalized super-resolution 4D Flow MRI $\\unicode{x2013}$ using\n ensemble learning to extend across the cardiovascular system","summary":" 4D Flow Magnetic Resonance Imaging (4D Flow MRI) is a non-invasive\nmeasurement technique capable of quantifying blood flow across the\ncardiovascular system. While practical use is limited by spatial resolution and\nimage noise, incorporation of trained super-resolution (SR) networks has\npotential to enhance image quality post-scan. However, these efforts have\npredominantly been restricted to narrowly defined cardiovascular domains, with\nlimited exploration of how SR performance extends across the cardiovascular\nsystem; a task aggravated by contrasting hemodynamic conditions apparent across\nthe cardiovasculature. The aim of our study was to explore the generalizability\nof SR 4D Flow MRI using a combination of heterogeneous training sets and\ndedicated ensemble learning. With synthetic training data generated across\nthree disparate domains (cardiac, aortic, cerebrovascular), varying\nconvolutional base and ensemble learners were evaluated as a function of domain\nand architecture, quantifying performance on both in-silico and acquired\nin-vivo data from the same three domains. 
Results show that both bagging and\nstacking ensembling enhance SR performance across domains, accurately\npredicting high-resolution velocities from low-resolution input data in-silico.\nLikewise, optimized networks successfully recover native resolution velocities\nfrom downsampled in-vivo data, as well as show qualitative potential in\ngenerating denoised SR-images from clinical level input data. In conclusion,\nour work presents a viable approach for generalized SR 4D Flow MRI, with\nensemble learning extending utility across various clinical areas of interest.\n","authors":["Leon Ericsson","Adam Hjalmarsson","Muhammad Usman Akbar","Edward Ferdian","Mia Bonini","Brandon Hardy","Jonas Schollenberger","Maria Aristova","Patrick Winter","Nicholas Burris","Alexander Fyrdahl","Andreas Sigfridsson","Susanne Schnell","C. Alberto Figueroa","David Nordsletten","Alistair A. Young","David Marlevi"],"pdf_url":"https://arxiv.org/pdf/2311.11819v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.12981v1","updated":"2023-11-21T20:33:17Z","published":"2023-11-21T20:33:17Z","title":"SD-NAE: Generating Natural Adversarial Examples with Stable Diffusion","summary":" Robustly evaluating deep learning image classifiers is challenging due to\nsome limitations of standard datasets. Natural Adversarial Examples (NAEs),\narising naturally from the environment and capable of deceiving classifiers,\nare instrumental in identifying vulnerabilities in trained models. Existing\nworks collect such NAEs by filtering from a huge set of real images, a process\nthat is passive and lacks control. In this work, we propose to actively\nsynthesize NAEs with the state-of-the-art Stable Diffusion. Specifically, our\nmethod formulates a controlled optimization process, where we perturb the token\nembedding that corresponds to a specified class to synthesize NAEs. The\ngeneration is guided by the gradient of loss from the target classifier so that\nthe created image closely mimics the ground-truth class yet fools the\nclassifier. Named SD-NAE (Stable Diffusion for Natural Adversarial Examples),\nour innovative method is effective in producing valid and useful NAEs, which is\ndemonstrated through a meticulously designed experiment. Our work thereby\nprovides a valuable method for obtaining challenging evaluation data, which in\nturn can potentially advance the development of more robust deep learning\nmodels. Code is available at https://github.com/linyueqian/SD-NAE.\n","authors":["Yueqian Lin","Jingyang Zhang","Yiran Chen","Hai Li"],"pdf_url":"https://arxiv.org/pdf/2311.12981v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12967v1","updated":"2023-11-21T20:12:29Z","published":"2023-11-21T20:12:29Z","title":"Robustifying Generalizable Implicit Shape Networks with a Tunable\n Non-Parametric Model","summary":" Feedforward generalizable models for implicit shape reconstruction from\nunoriented point cloud present multiple advantages, including high performance\nand inference speed. However, they still suffer from generalization issues,\nranging from underfitting the input point cloud, to misrepresenting samples\noutside of the training data distribution, or with toplogies unseen at\ntraining. We propose here an efficient mechanism to remedy some of these\nlimitations at test time. We combine the inter-shape data prior of the network\nwith an intra-shape regularization prior of a Nystr\\\"om Kernel Ridge\nRegression, that we further adapt by fitting its hyperprameters to the current\nshape. 
The resulting shape function defined in a shape specific Reproducing\nKernel Hilbert Space benefits from desirable stability and efficiency\nproperties and grants a shape adaptive expressiveness-robustness trade-off. We\ndemonstrate the improvement obtained through our method with respect to\nbaselines and the state-of-the-art using synthetic and real data.\n","authors":["Amine Ouasfi","Adnane Boukhayma"],"pdf_url":"https://arxiv.org/pdf/2311.12967v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.12956v1","updated":"2023-11-21T19:49:13Z","published":"2023-11-21T19:49:13Z","title":"Innovative Horizons in Aerial Imagery: LSKNet Meets DiffusionDet for\n Advanced Object Detection","summary":" In the realm of aerial image analysis, object detection plays a pivotal role,\nwith significant implications for areas such as remote sensing, urban planning,\nand disaster management. This study addresses the inherent challenges in this\ndomain, notably the detection of small objects, managing densely packed\nelements, and accounting for diverse orientations. We present an in-depth\nevaluation of an object detection model that integrates the Large Selective\nKernel Network (LSKNet)as its backbone with the DiffusionDet head, utilizing\nthe iSAID dataset for empirical analysis. Our approach encompasses the\nintroduction of novel methodologies and extensive ablation studies. These\nstudies critically assess various aspects such as loss functions, box\nregression techniques, and classification strategies to refine the model's\nprecision in object detection. The paper details the experimental application\nof the LSKNet backbone in synergy with the DiffusionDet heads, a combination\ntailored to meet the specific challenges in aerial image object detection. The\nfindings of this research indicate a substantial enhancement in the model's\nperformance, especially in the accuracy-time tradeoff. The proposed model\nachieves a mean average precision (MAP) of approximately 45.7%, which is a\nsignificant improvement, outperforming the RCNN model by 4.7% on the same\ndataset. This advancement underscores the effectiveness of the proposed\nmodifications and sets a new benchmark in aerial image analysis, paving the way\nfor more accurate and efficient object detection methodologies. The code is\npublicly available at https://github.com/SashaMatsun/LSKDiffDet\n","authors":["Ahmed Sharshar","Aleksandr Matsun"],"pdf_url":"https://arxiv.org/pdf/2311.12956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.05705v2","updated":"2023-11-21T19:24:43Z","published":"2023-06-09T06:54:58Z","title":"On the Challenges and Perspectives of Foundation Models for Medical\n Image Analysis","summary":" This article discusses the opportunities, applications and future directions\nof large-scale pre-trained models, i.e., foundation models, for analyzing\nmedical images. Medical foundation models have immense potential in solving a\nwide range of downstream tasks, as they can help to accelerate the development\nof accurate and robust models, reduce the large amounts of required labeled\ndata, preserve the privacy and confidentiality of patient data. Specifically,\nwe illustrate the \"spectrum\" of medical foundation models, ranging from general\nvision models, modality-specific models, to organ/task-specific models,\nhighlighting their challenges, opportunities and applications. 
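Returning to the implicit-shape reconstruction summary above, which regularizes a learned prior with a Nyström Kernel Ridge Regression: a minimal NumPy sketch of Nyström-approximated KRR itself (Gaussian kernel, landmark selection, and regularization strength are assumptions; the paper's shape-specific adaptation is not reproduced):

```python
import numpy as np

def rbf(a, b, gamma=1.0):
    """Gaussian kernel matrix between the rows of a and b."""
    d2 = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * d2)

def nystrom_krr_fit(X, y, num_landmarks=50, lam=1e-3, gamma=1.0, seed=0):
    """'Subset of regressors' Nystrom KRR: f(x) = k(x, Z) @ alpha with
    alpha = (K_nm^T K_nm + lam * K_mm)^{-1} K_nm^T y."""
    rng = np.random.default_rng(seed)
    Z = X[rng.choice(len(X), size=min(num_landmarks, len(X)), replace=False)]
    K_nm, K_mm = rbf(X, Z, gamma), rbf(Z, Z, gamma)
    alpha = np.linalg.solve(K_nm.T @ K_nm + lam * K_mm, K_nm.T @ y)
    return Z, alpha

def nystrom_krr_predict(Z, alpha, X_new, gamma=1.0):
    return rbf(X_new, Z, gamma) @ alpha

# Fit a signed-distance-like scalar field on random 3D points (toy data).
X = np.random.rand(500, 3); y = np.linalg.norm(X - 0.5, axis=1) - 0.25
Z, alpha = nystrom_krr_fit(X, y)
print(nystrom_krr_predict(Z, alpha, X[:5]))
```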
We also discuss\nhow foundation models can be leveraged in downstream medical tasks to enhance\nthe accuracy and efficiency of medical image analysis, leading to more precise\ndiagnosis and treatment decisions.\n","authors":["Shaoting Zhang","Dimitris Metaxas"],"pdf_url":"https://arxiv.org/pdf/2306.05705v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12919v1","updated":"2023-11-21T18:43:07Z","published":"2023-11-21T18:43:07Z","title":"SPOT! Revisiting Video-Language Models for Event Understanding","summary":" Understanding videos is an important research topic for multimodal learning.\nLeveraging large-scale datasets of web-crawled video-text pairs as weak\nsupervision has become a pre-training paradigm for learning joint\nrepresentations and showcased remarkable potential in video understanding\ntasks. However, videos can be multi-event and multi-grained, while these\nvideo-text pairs usually contain only broad-level video captions. This raises a\nquestion: with such weak supervision, can video representation in\nvideo-language models gain the ability to distinguish even factual\ndiscrepancies in textual description and understand fine-grained events? To\naddress this, we introduce SPOT Prober, to benchmark existing video-language\nmodels's capacities of distinguishing event-level discrepancies as an indicator\nof models' event understanding ability. Our approach involves extracting events\nas tuples () from videos and\ngenerating false event tuples by manipulating tuple components systematically.\nWe reevaluate the existing video-language models with these positive and\nnegative captions and find they fail to distinguish most of the manipulated\nevents. Based on our findings, we propose to plug in these manipulated event\ncaptions as hard negative samples and find them effective in enhancing models\nfor event understanding.\n","authors":["Gengyuan Zhang","Jinhe Bi","Jindong Gu","Volker Tresp"],"pdf_url":"https://arxiv.org/pdf/2311.12919v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2311.12914v1","updated":"2023-11-21T17:55:46Z","published":"2023-11-21T17:55:46Z","title":"Attention Deficit is Ordered! Fooling Deformable Vision Transformers\n with Collaborative Adversarial Patches","summary":" The latest generation of transformer-based vision models have proven to be\nsuperior to Convolutional Neural Network (CNN)-based models across several\nvision tasks, largely attributed to their remarkable prowess in relation\nmodeling. Deformable vision transformers significantly reduce the quadratic\ncomplexity of modeling attention by using sparse attention structures, enabling\nthem to be used in larger scale applications such as multi-view vision systems.\nRecent work demonstrated adversarial attacks against transformers; we show that\nthese attacks do not transfer to deformable transformers due to their sparse\nattention structure. Specifically, attention in deformable transformers is\nmodeled using pointers to the most relevant other tokens. In this work, we\ncontribute for the first time adversarial attacks that manipulate the attention\nof deformable transformers, distracting them to focus on irrelevant parts of\nthe image. We also develop new collaborative attacks where a source patch\nmanipulates attention to point to a target patch that adversarially attacks the\nsystem. In our experiments, we find that only 1% patched area of the input\nfield can lead to 0% AP. 
We also show that the attacks provide substantial\nversatility to support different attacker scenarios because of their ability to\nredirect attention under the attacker control.\n","authors":["Quazi Mishkatul Alam","Bilel Tarchoun","Ihsen Alouani","Nael Abu-Ghazaleh"],"pdf_url":"https://arxiv.org/pdf/2311.12914v1.pdf","comment":"9 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.12912v1","updated":"2023-11-21T17:27:20Z","published":"2023-11-21T17:27:20Z","title":"Q-Seg: Quantum Annealing-based Unsupervised Image Segmentation","summary":" In this study, we present Q-Seg, a novel unsupervised image segmentation\nmethod based on quantum annealing, tailored for existing quantum hardware. We\nformulate the pixel-wise segmentation problem, which assimilates spectral and\nspatial information of the image, as a graph-cut optimization task. Our method\nefficiently leverages the interconnected qubit topology of the D-Wave Advantage\ndevice, offering superior scalability over existing quantum approaches and\noutperforming state-of-the-art classical methods. Our empirical evaluations on\nsynthetic datasets reveal that Q-Seg offers better runtime performance against\nthe classical optimizer Gurobi. Furthermore, we evaluate our method on\nsegmentation of Earth Observation images, an area of application where the\namount of labeled data is usually very limited. In this case, Q-Seg\ndemonstrates near-optimal results in flood mapping detection with respect to\nclassical supervised state-of-the-art machine learning methods. Also, Q-Seg\nprovides enhanced segmentation for forest coverage compared to existing\nannotated masks. Thus, Q-Seg emerges as a viable alternative for real-world\napplications using available quantum hardware, particularly in scenarios where\nthe lack of labeled data and computational runtime are critical.\n","authors":["Supreeth Mysore Venkatesh","Antonio Macaluso","Marlon Nuske","Matthias Klusch","Andreas Dengel"],"pdf_url":"https://arxiv.org/pdf/2311.12912v1.pdf","comment":"12 pages, 9 figures, 1 table"},{"id":"http://arxiv.org/abs/2311.12908v1","updated":"2023-11-21T15:24:05Z","published":"2023-11-21T15:24:05Z","title":"Diffusion Model Alignment Using Direct Preference Optimization","summary":" Large language models (LLMs) are fine-tuned using human comparison data with\nReinforcement Learning from Human Feedback (RLHF) methods to make them better\naligned with users' preferences. In contrast to LLMs, human preference learning\nhas not been widely explored in text-to-image diffusion models; the best\nexisting approach is to fine-tune a pretrained model using carefully curated\nhigh quality images and captions to improve visual appeal and text alignment.\nWe propose Diffusion-DPO, a method to align diffusion models to human\npreferences by directly optimizing on human comparison data. Diffusion-DPO is\nadapted from the recently developed Direct Preference Optimization (DPO), a\nsimpler alternative to RLHF which directly optimizes a policy that best\nsatisfies human preferences under a classification objective. We re-formulate\nDPO to account for a diffusion model notion of likelihood, utilizing the\nevidence lower bound to derive a differentiable objective. Using the Pick-a-Pic\ndataset of 851K crowdsourced pairwise preferences, we fine-tune the base model\nof the state-of-the-art Stable Diffusion XL (SDXL)-1.0 model with\nDiffusion-DPO. 
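The Diffusion-DPO summary above adapts Direct Preference Optimization to diffusion models. A minimal sketch of the generic DPO objective it starts from (PyTorch; in the diffusion variant the exact log-likelihoods are replaced by the ELBO-based terms the abstract mentions, which is not shown here):

```python
import torch
import torch.nn.functional as F

def dpo_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, beta=0.1):
    """Pairwise DPO objective on (preferred, dispreferred) samples.

    logp_* are log-likelihoods under the model being fine-tuned and
    ref_logp_* under the frozen reference model; beta scales the implicit
    reward and its value here is an arbitrary placeholder.
    """
    margin = (logp_w - ref_logp_w) - (logp_l - ref_logp_l)
    return -F.logsigmoid(beta * margin).mean()

# Toy batch of 4 preference pairs (random numbers stand in for real log-probs).
lw, ll = torch.randn(4), torch.randn(4)
print(dpo_loss(lw, ll, lw.detach() - 0.1, ll.detach() + 0.1))
```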
Our fine-tuned base model significantly outperforms both base\nSDXL-1.0 and the larger SDXL-1.0 model consisting of an additional refinement\nmodel in human evaluation, improving visual appeal and prompt alignment. We\nalso develop a variant that uses AI feedback and has comparable performance to\ntraining on human preferences, opening the door for scaling of diffusion model\nalignment methods.\n","authors":["Bram Wallace","Meihua Dang","Rafael Rafailov","Linqi Zhou","Aaron Lou","Senthil Purushwalkam","Stefano Ermon","Caiming Xiong","Shafiq Joty","Nikhil Naik"],"pdf_url":"https://arxiv.org/pdf/2311.12908v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2205.10852v6","updated":"2023-11-21T16:36:43Z","published":"2022-05-22T15:30:18Z","title":"Relphormer: Relational Graph Transformer for Knowledge Graph\n Representations","summary":" Transformers have achieved remarkable performance in widespread fields,\nincluding natural language processing, computer vision and graph mining.\nHowever, vanilla Transformer architectures have not yielded promising\nimprovements in the Knowledge Graph (KG) representations, where the\ntranslational distance paradigm dominates this area. Note that vanilla\nTransformer architectures struggle to capture the intrinsically heterogeneous\nstructural and semantic information of knowledge graphs. To this end, we\npropose a new variant of Transformer for knowledge graph representations dubbed\nRelphormer. Specifically, we introduce Triple2Seq which can dynamically sample\ncontextualized sub-graph sequences as the input to alleviate the heterogeneity\nissue. We propose a novel structure-enhanced self-attention mechanism to encode\nthe relational information and keep the semantic information within entities\nand relations. Moreover, we utilize masked knowledge modeling for general\nknowledge graph representation learning, which can be applied to various\nKG-based tasks including knowledge graph completion, question answering, and\nrecommendation. Experimental results on six datasets show that Relphormer can\nobtain better performance compared with baselines. Code is available in\nhttps://github.com/zjunlp/Relphormer.\n","authors":["Zhen Bi","Siyuan Cheng","Jing Chen","Xiaozhuan Liang","Feiyu Xiong","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2205.10852v6.pdf","comment":"Neurocomputing 2023"},{"id":"http://arxiv.org/abs/2311.12474v1","updated":"2023-11-21T09:36:11Z","published":"2023-11-21T09:36:11Z","title":"CSMeD: Bridging the Dataset Gap in Automated Citation Screening for\n Systematic Literature Reviews","summary":" Systematic literature reviews (SLRs) play an essential role in summarising,\nsynthesising and validating scientific evidence. In recent years, there has\nbeen a growing interest in using machine learning techniques to automate the\nidentification of relevant studies for SLRs. However, the lack of standardised\nevaluation datasets makes comparing the performance of such automated\nliterature screening systems difficult. In this paper, we analyse the citation\nscreening evaluation datasets, revealing that many of the available datasets\nare either too small, suffer from data leakage or have limited applicability to\nsystems treating automated literature screening as a classification task, as\nopposed to, for example, a retrieval or question-answering task. 
To address\nthese challenges, we introduce CSMeD, a meta-dataset consolidating nine\npublicly released collections, providing unified access to 325 SLRs from the\nfields of medicine and computer science. CSMeD serves as a comprehensive\nresource for training and evaluating the performance of automated citation\nscreening models. Additionally, we introduce CSMeD-FT, a new dataset designed\nexplicitly for evaluating the full text publication screening task. To\ndemonstrate the utility of CSMeD, we conduct experiments and establish\nbaselines on new datasets.\n","authors":["Wojciech Kusa","Oscar E. Mendoza","Matthias Samwald","Petr Knoth","Allan Hanbury"],"pdf_url":"https://arxiv.org/pdf/2311.12474v1.pdf","comment":"Accepted at NeurIPS 2023 Datasets and Benchmarks Track"},{"id":"http://arxiv.org/abs/2311.12404v1","updated":"2023-11-21T07:43:50Z","published":"2023-11-21T07:43:50Z","title":"InterPrompt: Interpretable Prompting for Interrelated Interpersonal Risk\n Factors in Reddit Posts","summary":" Mental health professionals and clinicians have observed the upsurge of\nmental disorders due to Interpersonal Risk Factors (IRFs). To simulate the\nhuman-in-the-loop triaging scenario for early detection of mental health\ndisorders, we recognized textual indications to ascertain these IRFs : Thwarted\nBelongingness (TBe) and Perceived Burdensomeness (PBu) within personal\nnarratives. In light of this, we use N-shot learning with GPT-3 model on the\nIRF dataset, and underscored the importance of fine-tuning GPT-3 model to\nincorporate the context-specific sensitivity and the interconnectedness of\ntextual cues that represent both IRFs.\n In this paper, we introduce an Interpretable Prompting (InterPrompt)} method\nto boost the attention mechanism by fine-tuning the GPT-3 model. This allows a\nmore sophisticated level of language modification by adjusting the pre-trained\nweights. Our model learns to detect usual patterns and underlying connections\nacross both the IRFs, which leads to better system-level explainability and\ntrustworthiness. The results of our research demonstrate that all four variants\nof GPT-3 model, when fine-tuned with InterPrompt, perform considerably better\nas compared to the baseline methods, both in terms of classification and\nexplanation generation.\n","authors":["MSVPJ Sathvik","Surjodeep Sarkar","Chandni Saxena","Sunghwan Sohn","Muskan Garg"],"pdf_url":"https://arxiv.org/pdf/2311.12404v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2311.12389v1","updated":"2023-11-21T07:01:05Z","published":"2023-11-21T07:01:05Z","title":"Linear-time online visibility graph transformation algorithm: for both\n natural and horizontal visibility criteria","summary":" Visibility graph (VG) transformation is a technique used to convert a time\nseries into a graph based on specific visibility criteria. It has attracted\nincreasing interest in the fields of time series analysis, forecasting, and\nclassification. Optimizing the VG transformation algorithm to accelerate the\nprocess is a critical aspect of VG-related research, as it enhances the\napplicability of VG transformation in latency-sensitive areas and conserves\ncomputational resources. In the real world, many time series are presented in\nthe form of data streams. Despite the proposal of the concept of VG's online\nfunctionality, previous studies have not thoroughly explored the acceleration\nof VG transformation by leveraging the characteristics of data streams. 
In this\npaper, we propose that an efficient online VG algorithm should adhere to two\ncriteria and develop a linear-time method, termed the LOT framework, for both\nnatural and horizontal visibility graph transformations in data stream\nscenarios. Experiments are conducted on two datasets, comparing our approach\nwith five existing methods as baselines. The results demonstrate the validity\nand promising computational efficiency of our framework.\n","authors":["Yusheng Huang","Yong Deng"],"pdf_url":"https://arxiv.org/pdf/2311.12389v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14884v2","updated":"2023-11-21T05:28:30Z","published":"2023-10-23T12:53:22Z","title":"Budgeted Embedding Table For Recommender Systems","summary":" At the heart of contemporary recommender systems (RSs) are latent factor\nmodels that provide quality recommendation experience to users. These models\nuse embedding vectors, which are typically of a uniform and fixed size, to\nrepresent users and items. As the number of users and items continues to grow,\nthis design becomes inefficient and hard to scale. Recent lightweight embedding\nmethods have enabled different users and items to have diverse embedding sizes,\nbut are commonly subject to two major drawbacks. Firstly, they limit the\nembedding size search to optimizing a heuristic balancing the recommendation\nquality and the memory complexity, where the trade-off coefficient needs to be\nmanually tuned for every memory budget requested. The implicitly enforced\nmemory complexity term can even fail to cap the parameter usage, making the\nresultant embedding table fail to meet the memory budget strictly. Secondly,\nmost solutions, especially reinforcement learning based ones derive and\noptimize the embedding size for each each user/item on an instance-by-instance\nbasis, which impedes the search efficiency. In this paper, we propose Budgeted\nEmbedding Table (BET), a novel method that generates table-level actions (i.e.,\nembedding sizes for all users and items) that is guaranteed to meet\npre-specified memory budgets. Furthermore, by leveraging a set-based action\nformulation and engaging set representation learning, we present an innovative\naction search strategy powered by an action fitness predictor that efficiently\nevaluates each table-level action. Experiments have shown state-of-the-art\nperformance on two real-world datasets when BET is paired with three popular\nrecommender models under different memory budgets.\n","authors":["Yunke Qu","Tong Chen","Quoc Viet Hung Nguyen","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2310.14884v2.pdf","comment":"Accepted by WSDM 2024"},{"id":"http://arxiv.org/abs/2311.12355v1","updated":"2023-11-21T05:15:56Z","published":"2023-11-21T05:15:56Z","title":"Utilizing Language Models for Tour Itinerary Recommendation","summary":" Tour itinerary recommendation involves planning a sequence of relevant\nPoint-of-Interest (POIs), which combines challenges from the fields of both\nOperations Research (OR) and Recommendation Systems (RS). As an OR problem,\nthere is the need to maximize a certain utility (e.g., popularity of POIs in\nthe tour) while adhering to some constraints (e.g., maximum time for the tour).\nAs a RS problem, it is heavily related to problem or filtering or ranking a\nsubset of POIs that are relevant to a user and recommending it as part of an\nitinerary. In this paper, we explore the use of language models for the task of\ntour itinerary recommendation and planning. 
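The visibility-graph summary above concerns accelerating natural and horizontal visibility graph construction on data streams. For reference, a minimal offline O(n^2) construction that shows both visibility criteria (this is the baseline notion only, not the linear-time online LOT framework the abstract proposes):

```python
def visibility_edges(series, horizontal=False):
    """Return edges (i, j), i < j, of the natural or horizontal visibility graph.

    Natural visibility: every intermediate sample lies strictly below the
    straight line joining samples i and j. Horizontal visibility: every
    intermediate sample lies strictly below both endpoints.
    """
    edges = []
    n = len(series)
    for i in range(n):
        for j in range(i + 1, n):
            visible = True
            for c in range(i + 1, j):
                if horizontal:
                    blocked = series[c] >= min(series[i], series[j])
                else:
                    line = series[j] + (series[i] - series[j]) * (j - c) / (j - i)
                    blocked = series[c] >= line
                if blocked:
                    visible = False
                    break
            if visible:
                edges.append((i, j))
    return edges

print(visibility_edges([3.0, 1.0, 2.0, 0.5, 4.0]))
```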
This task has the unique\nrequirement of recommending personalized POIs relevant to users and planning\nthese POIs as an itinerary that satisfies various constraints. We discuss some\napproaches in this area, such as using word embedding techniques like Word2Vec\nand GloVe for learning POI embeddings and transformer-based techniques like\nBERT for generating\n itineraries.\n","authors":["Ngai Lam Ho","Kwan Hui Lim"],"pdf_url":"https://arxiv.org/pdf/2311.12355v1.pdf","comment":"PMAI23 @IJCAI 2023 2nd International Workshop on Process Management\n in the AI era"},{"id":"http://arxiv.org/abs/2304.07763v5","updated":"2023-11-21T05:03:06Z","published":"2023-04-16T12:30:33Z","title":"Meta-optimized Contrastive Learning for Sequential Recommendation","summary":" Contrastive Learning (CL) performances as a rising approach to address the\nchallenge of sparse and noisy recommendation data. Although having achieved\npromising results, most existing CL methods only perform either hand-crafted\ndata or model augmentation for generating contrastive pairs to find a proper\naugmentation operation for different datasets, which makes the model hard to\ngeneralize. Additionally, since insufficient input data may lead the encoder to\nlearn collapsed embeddings, these CL methods expect a relatively large number\nof training data (e.g., large batch size or memory bank) to contrast. However,\nnot all contrastive pairs are always informative and discriminative enough for\nthe training processing. Therefore, a more general CL-based recommendation\nmodel called Meta-optimized Contrastive Learning for sequential Recommendation\n(MCLRec) is proposed in this work. By applying both data augmentation and\nlearnable model augmentation operations, this work innovates the standard CL\nframework by contrasting data and model augmented views for adaptively\ncapturing the informative features hidden in stochastic data augmentation.\nMoreover, MCLRec utilizes a meta-learning manner to guide the updating of the\nmodel augmenters, which helps to improve the quality of contrastive pairs\nwithout enlarging the amount of input data. Finally, a contrastive\nregularization term is considered to encourage the augmentation model to\ngenerate more informative augmented views and avoid too similar contrastive\npairs within the meta updating. The experimental results on commonly used\ndatasets validate the effectiveness of MCLRec.\n","authors":["Xiuyuan Qin","Huanhuan Yuan","Pengpeng Zhao","Junhua Fang","Fuzhen Zhuang","Guanfeng Liu","Victor Sheng"],"pdf_url":"https://arxiv.org/pdf/2304.07763v5.pdf","comment":"11 Pages,8 figures,SIGIR2023"},{"id":"http://arxiv.org/abs/2311.12338v1","updated":"2023-11-21T04:14:09Z","published":"2023-11-21T04:14:09Z","title":"A Survey on Large Language Models for Personalized and Explainable\n Recommendations","summary":" In recent years, Recommender Systems(RS) have witnessed a transformative\nshift with the advent of Large Language Models(LLMs) in the field of Natural\nLanguage Processing(NLP). These models such as OpenAI's GPT-3.5/4, Llama from\nMeta, have demonstrated unprecedented capabilities in understanding and\ngenerating human-like text. This has led to a paradigm shift in the realm of\npersonalized and explainable recommendations, as LLMs offer a versatile toolset\nfor processing vast amounts of textual data to enhance user experiences. 
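The tour-itinerary summary above mentions learning POI embeddings with Word2Vec-style techniques. A minimal sketch using gensim on check-in sequences (the POI identifiers, visit sequences, and hyperparameters below are made up for illustration):

```python
from gensim.models import Word2Vec

# Each "sentence" is one user's visit sequence of POI identifiers (toy data).
visit_sequences = [
    ["museum_1", "cafe_3", "park_2", "gallery_5"],
    ["park_2", "cafe_3", "viewpoint_7"],
    ["museum_1", "gallery_5", "viewpoint_7", "cafe_3"],
]

# Skip-gram model; vector_size/window/min_count are illustrative choices.
model = Word2Vec(sentences=visit_sequences, vector_size=32,
                 window=2, min_count=1, sg=1, epochs=50, seed=0)

# POIs that tend to co-occur in itineraries end up close in embedding space.
print(model.wv.most_similar("cafe_3", topn=3))
```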
To\nprovide a comprehensive understanding of the existing LLM-based recommendation\nsystems, this survey aims to analyze how RS can benefit from LLM-based\nmethodologies. Furthermore, we describe major challenges in Personalized\nExplanation Generating(PEG) tasks, which are cold-start problems, unfairness\nand bias problems in RS.\n","authors":["Junyi Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12338v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12329v1","updated":"2023-11-21T03:42:15Z","published":"2023-11-21T03:42:15Z","title":"Graph Neural Ordinary Differential Equations-based method for\n Collaborative Filtering","summary":" Graph Convolution Networks (GCNs) are widely considered state-of-the-art for\ncollaborative filtering. Although several GCN-based methods have been proposed\nand achieved state-of-the-art performance in various tasks, they can be\ncomputationally expensive and time-consuming to train if too many layers are\ncreated. However, since the linear GCN model can be interpreted as a\ndifferential equation, it is possible to transfer it to an ODE problem. This\ninspired us to address the computational limitations of GCN-based models by\ndesigning a simple and efficient NODE-based model that can skip some GCN layers\nto reach the final state, thus avoiding the need to create many layers. In this\nwork, we propose a Graph Neural Ordinary Differential Equation-based method for\nCollaborative Filtering (GODE-CF). This method estimates the final embedding by\nutilizing the information captured by one or two GCN layers. To validate our\napproach, we conducted experiments on multiple datasets. The results\ndemonstrate that our model outperforms competitive baselines, including\nGCN-based models and other state-of-the-art CF methods. Notably, our proposed\nGODE-CF model has several advantages over traditional GCN-based models. It is\nsimple, efficient, and has a fast training time, making it a practical choice\nfor real-world situations.\n","authors":["Ke Xu","Yuanjie Zhu","Weizhi Zhang","Philip S. Yu"],"pdf_url":"https://arxiv.org/pdf/2311.12329v1.pdf","comment":"Accepted by ICDM 2023"},{"id":"http://arxiv.org/abs/2311.03488v3","updated":"2023-11-21T03:08:37Z","published":"2023-11-06T19:52:55Z","title":"Multi-Resolution Diffusion for Privacy-Sensitive Recommender Systems","summary":" While recommender systems have become an integral component of the Web\nexperience, their heavy reliance on user data raises privacy and security\nconcerns. Substituting user data with synthetic data can address these\nconcerns, but accurately replicating these real-world datasets has been a\nnotoriously challenging problem. Recent advancements in generative AI have\ndemonstrated the impressive capabilities of diffusion models in generating\nrealistic data across various domains. In this work we introduce a Score-based\nDiffusion Recommendation Module (SDRM), which captures the intricate patterns\nof real-world datasets required for training highly accurate recommender\nsystems. SDRM allows for the generation of synthetic data that can replace\nexisting datasets to preserve user privacy, or augment existing datasets to\naddress excessive data sparsity. 
Our method outperforms competing baselines\nsuch as generative adversarial networks, variational autoencoders, and recently\nproposed diffusion models in synthesizing various datasets to replace or\naugment the original data by an average improvement of 4.30% in Recall@$k$ and\n4.65% in NDCG@$k$.\n","authors":["Derek Lilienthal","Paul Mello","Magdalini Eirinaki","Stas Tiomkin"],"pdf_url":"https://arxiv.org/pdf/2311.03488v3.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.12287v1","updated":"2023-11-21T02:01:01Z","published":"2023-11-21T02:01:01Z","title":"Adapting LLMs for Efficient, Personalized Information Retrieval: Methods\n and Implications","summary":" The advent of Large Language Models (LLMs) heralds a pivotal shift in online\nuser interactions with information. Traditional Information Retrieval (IR)\nsystems primarily relied on query-document matching, whereas LLMs excel in\ncomprehending and generating human-like text, thereby enriching the IR\nexperience significantly. While LLMs are often associated with chatbot\nfunctionalities, this paper extends the discussion to their explicit\napplication in information retrieval. We explore methodologies to optimize the\nretrieval process, select optimal models, and effectively scale and orchestrate\nLLMs, aiming for cost-efficiency and enhanced result accuracy. A notable\nchallenge, model hallucination-where the model yields inaccurate or\nmisinterpreted data-is addressed alongside other model-specific hurdles. Our\ndiscourse extends to crucial considerations including user privacy, data\noptimization, and the necessity for system clarity and interpretability.\nThrough a comprehensive examination, we unveil not only innovative strategies\nfor integrating Language Models (LLMs) with Information Retrieval (IR) systems,\nbut also the consequential considerations that underline the need for a\nbalanced approach aligned with user-centric principles.\n","authors":["Samira Ghodratnama","Mehrdad Zakershahrak"],"pdf_url":"https://arxiv.org/pdf/2311.12287v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12257v1","updated":"2023-11-21T00:37:47Z","published":"2023-11-21T00:37:47Z","title":"Equipping Pretrained Unconditional Music Transformers with Instrument\n and Genre Controls","summary":" The ''pretraining-and-finetuning'' paradigm has become a norm for training\ndomain-specific models in natural language processing and computer vision. In\nthis work, we aim to examine this paradigm for symbolic music generation\nthrough leveraging the largest ever symbolic music dataset sourced from the\nMuseScore forum. We first pretrain a large unconditional transformer model\nusing 1.5 million songs. We then propose a simple technique to equip this\npretrained unconditional music transformer model with instrument and genre\ncontrols by finetuning the model with additional control tokens. Our proposed\nrepresentation offers improved high-level controllability and expressiveness\nagainst two existing representations. The experimental results show that the\nproposed model can successfully generate music with user-specified instruments\nand genre. 
In a subjective listening test, the proposed model outperforms the\npretrained baseline model in terms of coherence, harmony, arrangement and\noverall quality.\n","authors":["Weihan Xu","Julian McAuley","Shlomo Dubnov","Hao-Wen Dong"],"pdf_url":"https://arxiv.org/pdf/2311.12257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08350v2","updated":"2023-11-21T22:09:27Z","published":"2023-11-14T17:48:27Z","title":"ChoralSynth: Synthetic Dataset of Choral Singing","summary":" Choral singing, a widely practiced form of ensemble singing, lacks\ncomprehensive datasets in the realm of Music Information Retrieval (MIR)\nresearch, due to challenges arising from the requirement to curate multitrack\nrecordings. To address this, we devised a novel methodology, leveraging\nstate-of-the-art synthesizers to create and curate quality renditions. The\nscores were sourced from Choral Public Domain Library(CPDL). This work is done\nin collaboration with a diverse team of musicians, software engineers and\nresearchers. The resulting dataset, complete with its associated metadata, and\nmethodology is released as part of this work, opening up new avenues for\nexploration and advancement in the field of singing voice research.\n","authors":["Jyoti Narang","Viviana De La Vega","Xavier Lizarraga","Oscar Mayor","Hector Parra","Jordi Janer","Xavier Serra"],"pdf_url":"https://arxiv.org/pdf/2311.08350v2.pdf","comment":"Dataset Link: https://doi.org/10.5281/zenodo.10137883"},{"id":"http://arxiv.org/abs/2311.12955v1","updated":"2023-11-21T19:41:46Z","published":"2023-11-21T19:41:46Z","title":"Don't forget private retrieval: distributed private similarity search\n for large language models","summary":" While the flexible capabilities of large language models (LLMs) allow them to\nanswer a range of queries based on existing learned knowledge, information\nretrieval to augment generation is an important tool to allow LLMs to answer\nquestions on information not included in pre-training data. Such private\ninformation is increasingly being generated in a wide array of distributed\ncontexts by organizations and individuals. Performing such information\nretrieval using neural embeddings of queries and documents always leaked\ninformation about queries and database content unless both were stored locally.\nWe present Private Retrieval Augmented Generation (PRAG), an approach that uses\nmulti-party computation (MPC) to securely transmit queries to a distributed set\nof servers containing a privately constructed database to return top-k and\napproximate top-k documents. This is a first-of-its-kind approach to dense\ninformation retrieval that ensures no server observes a client's query or can\nsee the database content. The approach introduces a novel MPC friendly protocol\nfor inverted file approximate search (IVF) that allows for fast document search\nover distributed and private data in sublinear communication complexity. 
This\nwork presents new avenues through which data for use in LLMs can be accessed\nand used without needing to centralize or forgo privacy.\n","authors":["Guy Zyskind","Tobin South","Alex Pentland"],"pdf_url":"https://arxiv.org/pdf/2311.12955v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12894v1","updated":"2023-11-21T08:20:38Z","published":"2023-11-21T08:20:38Z","title":"Attribute-Aware Deep Hashing with Self-Consistency for Large-Scale\n Fine-Grained Image Retrieval","summary":" Our work focuses on tackling large-scale fine-grained image retrieval as\nranking the images depicting the concept of interests (i.e., the same\nsub-category labels) highest based on the fine-grained details in the query. It\nis desirable to alleviate the challenges of both fine-grained nature of small\ninter-class variations with large intra-class variations and explosive growth\nof fine-grained data for such a practical task. In this paper, we propose\nattribute-aware hashing networks with self-consistency for generating\nattribute-aware hash codes to not only make the retrieval process efficient,\nbut also establish explicit correspondences between hash codes and visual\nattributes. Specifically, based on the captured visual representations by\nattention, we develop an encoder-decoder structure network of a reconstruction\ntask to unsupervisedly distill high-level attribute-specific vectors from the\nappearance-specific visual representations without attribute annotations. Our\nmodels are also equipped with a feature decorrelation constraint upon these\nattribute vectors to strengthen their representative abilities. Then, driven by\npreserving original entities' similarity, the required hash codes can be\ngenerated from these attribute-specific vectors and thus become\nattribute-aware. Furthermore, to combat simplicity bias in deep hashing, we\nconsider the model design from the perspective of the self-consistency\nprinciple and propose to further enhance models' self-consistency by equipping\nan additional image reconstruction path. Comprehensive quantitative experiments\nunder diverse empirical settings on six fine-grained retrieval datasets and two\ngeneric retrieval datasets show the superiority of our models over competing\nmethods.\n","authors":["Xiu-Shen Wei","Yang Shen","Xuhao Sun","Peng Wang","Yuxin Peng"],"pdf_url":"https://arxiv.org/pdf/2311.12894v1.pdf","comment":"Accepted by IEEE TPAMI"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2311.12796v1","updated":"2023-11-21T18:59:58Z","published":"2023-11-21T18:59:58Z","title":"Physics-guided Shape-from-Template: Monocular Video Perception through\n Neural Surrogate Models","summary":" 3D reconstruction of dynamic scenes is a long-standing problem in computer\ngraphics and increasingly difficult the less information is available.\nShape-from-Template (SfT) methods aim to reconstruct a template-based geometry\nfrom RGB images or video sequences, often leveraging just a single monocular\ncamera without depth information, such as regular smartphone recordings.\nUnfortunately, existing reconstruction methods are either unphysical and noisy\nor slow in optimization. To solve this problem, we propose a novel SfT\nreconstruction algorithm for cloth using a pre-trained neural surrogate model\nthat is fast to evaluate, stable, and produces smooth reconstructions due to a\nregularizing physics simulation. 
Differentiable rendering of the simulated mesh\nenables pixel-wise comparisons between the reconstruction and a target video\nsequence that can be used for a gradient-based optimization procedure to\nextract not only shape information but also physical parameters such as\nstretching, shearing, or bending stiffness of the cloth. This allows to retain\na precise, stable, and smooth reconstructed geometry while reducing the runtime\nby a factor of 400-500 compared to $\\phi$-SfT, a state-of-the-art physics-based\nSfT approach.\n","authors":["David Stotko","Nils Wandel","Reinhard Klein"],"pdf_url":"https://arxiv.org/pdf/2311.12796v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.10793v5","updated":"2023-11-21T18:51:56Z","published":"2022-02-22T10:25:59Z","title":"PyTorch Geometric Signed Directed: A Software Package on Graph Neural\n Networks for Signed and Directed Graphs","summary":" Networks are ubiquitous in many real-world applications (e.g., social\nnetworks encoding trust/distrust relationships, correlation networks arising\nfrom time series data). While many networks are signed or directed, or both,\nthere is a lack of unified software packages on graph neural networks (GNNs)\nspecially designed for signed and directed networks. In this paper, we present\nPyTorch Geometric Signed Directed (PyGSD), a software package which fills this\ngap. Along the way, we evaluate the implemented methods with experiments with a\nview to providing insights into which method to choose for a given task. The\ndeep learning framework consists of easy-to-use GNN models, synthetic and\nreal-world data, as well as task-specific evaluation metrics and loss functions\nfor signed and directed networks. As an extension library for PyG, our proposed\nsoftware is maintained with open-source releases, detailed documentation,\ncontinuous integration, unit tests and code coverage checks. The GitHub\nrepository of the library is\nhttps://github.com/SherylHYX/pytorch_geometric_signed_directed.\n","authors":["Yixuan He","Xitong Zhang","Junjie Huang","Benedek Rozemberczki","Mihai Cucuringu","Gesine Reinert"],"pdf_url":"https://arxiv.org/pdf/2202.10793v5.pdf","comment":"Accepted by LoG 2023. 27 pages in total"},{"id":"http://arxiv.org/abs/2311.12786v1","updated":"2023-11-21T18:51:04Z","published":"2023-11-21T18:51:04Z","title":"Mechanistically analyzing the effects of fine-tuning on procedurally\n defined tasks","summary":" Fine-tuning large pre-trained models has become the de facto strategy for\ndeveloping both task-specific and general-purpose machine learning systems,\nincluding developing models that are safe to deploy. Despite its clear\nimportance, there has been minimal work that explains how fine-tuning alters\nthe underlying capabilities learned by a model during pretraining: does\nfine-tuning yield entirely novel capabilities or does it just modulate existing\nones? We address this question empirically in synthetic, controlled settings\nwhere we can use mechanistic interpretability tools (e.g., network pruning and\nprobing) to understand how the model's underlying capabilities are changing. 
We\nperform an extensive analysis of the effects of fine-tuning in these settings,\nand show that: (i) fine-tuning rarely alters the underlying model capabilities;\n(ii) a minimal transformation, which we call a 'wrapper', is typically learned\non top of the underlying model capabilities, creating the illusion that they\nhave been modified; and (iii) further fine-tuning on a task where such hidden\ncapabilities are relevant leads to sample-efficient 'revival' of the\ncapability, i.e., the model begins reusing these capability after only a few\ngradient steps. This indicates that practitioners can unintentionally remove a\nmodel's safety wrapper merely by fine-tuning it on a, e.g., superficially\nunrelated, downstream task. We additionally perform analysis on language models\ntrained on the TinyStories dataset to support our claims in a more realistic\nsetup.\n","authors":["Samyak Jain","Robert Kirk","Ekdeep Singh Lubana","Robert P. Dick","Hidenori Tanaka","Edward Grefenstette","Tim Rocktäschel","David Scott Krueger"],"pdf_url":"https://arxiv.org/pdf/2311.12786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01331v2","updated":"2023-11-21T18:50:49Z","published":"2023-11-02T15:41:57Z","title":"Offline Imitation from Observation via Primal Wasserstein State\n Occupancy Matching","summary":" In real-world scenarios, arbitrary interactions with the environment can\noften be costly, and actions of expert demonstrations are not always available.\nTo reduce the need for both, Offline Learning from Observations (LfO) is\nextensively studied, where the agent learns to solve a task with only expert\nstates and \\textit{task-agnostic} non-expert state-action pairs. The\nstate-of-the-art DIstribution Correction Estimation (DICE) methods minimize the\nstate occupancy divergence between the learner and expert policies. However,\nthey are limited to either $f$-divergences (KL and $\\chi^2$) or Wasserstein\ndistance with Rubinstein duality, the latter of which constrains the underlying\ndistance metric crucial to the performance of Wasserstein-based solutions. To\naddress this problem, we propose Primal Wasserstein DICE (PW-DICE), which\nminimizes the primal Wasserstein distance between the expert and learner state\noccupancies with a pessimistic regularizer and leverages a contrastively\nlearned distance as the underlying metric for the Wasserstein distance.\nTheoretically, we prove that our framework is a generalization of the\nstate-of-the-art, SMODICE, and unifies $f$-divergence and Wasserstein\nminimization. Empirically, we find that PW-DICE improves upon several\nstate-of-the-art methods on multiple testbeds.\n","authors":["Kai Yan","Alexander G. Schwing","Yu-xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2311.01331v2.pdf","comment":"23 pages. Accepted to the Optimal Transport and Machine Learning\n Workshop at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.12784v1","updated":"2023-11-21T18:50:38Z","published":"2023-11-21T18:50:38Z","title":"Optimality in Mean Estimation: Beyond Worst-Case, Beyond Sub-Gaussian,\n and Beyond $1+α$ Moments","summary":" There is growing interest in improving our algorithmic understanding of\nfundamental statistical problems such as mean estimation, driven by the goal of\nunderstanding the limits of what we can extract from valuable data. 
The state\nof the art results for mean estimation in $\\mathbb{R}$ are 1) the optimal\nsub-Gaussian mean estimator by [LV22], with the tight sub-Gaussian constant for\nall distributions with finite but unknown variance, and 2) the analysis of the\nmedian-of-means algorithm by [BCL13] and a lower bound by [DLLO16],\ncharacterizing the big-O optimal errors for distributions for which only a\n$1+\\alpha$ moment exists for $\\alpha \\in (0,1)$. Both results, however, are\noptimal only in the worst case. We initiate the fine-grained study of the mean\nestimation problem: Can algorithms leverage useful features of the input\ndistribution to beat the sub-Gaussian rate, without explicit knowledge of such\nfeatures?\n We resolve this question with an unexpectedly nuanced answer: \"Yes in limited\nregimes, but in general no\". For any distribution $p$ with a finite mean, we\nconstruct a distribution $q$ whose mean is well-separated from $p$'s, yet $p$\nand $q$ are not distinguishable with high probability, and $q$ further\npreserves $p$'s moments up to constants. The main consequence is that no\nreasonable estimator can asymptotically achieve better than the sub-Gaussian\nerror rate for any distribution, matching the worst-case result of [LV22]. More\ngenerally, we introduce a new definitional framework to analyze the\nfine-grained optimality of algorithms, which we call \"neighborhood optimality\",\ninterpolating between the unattainably strong \"instance optimality\" and the\ntrivially weak \"admissibility\" definitions. Applying the new framework, we show\nthat median-of-means is neighborhood optimal, up to constant factors. It is\nopen to find a neighborhood-optimal estimator without constant factor\nslackness.\n","authors":["Trung Dang","Jasper C. H. Lee","Maoyuan Song","Paul Valiant"],"pdf_url":"https://arxiv.org/pdf/2311.12784v1.pdf","comment":"27 pages, to appear in NeurIPS 2023. Abstract shortened to fit arXiv\n limit"},{"id":"http://arxiv.org/abs/2011.04923v5","updated":"2023-11-21T18:49:33Z","published":"2020-11-10T06:06:02Z","title":"Topological properties of basins of attraction and expressiveness of\n width bounded neural networks","summary":" In Radhakrishnan et al. [2020], the authors empirically show that\nautoencoders trained with usual SGD methods shape out basins of attraction\naround their training data. We consider network functions of width not\nexceeding the input dimension and prove that in this situation basins of\nattraction are bounded and their complement cannot have bounded components. Our\nconditions in these results are met in several experiments of the latter work\nand we thus address a question posed therein. We also show that under some more\nrestrictive conditions the basins of attraction are path-connected. The\ntightness of the conditions in our results is demonstrated by means of several\nexamples. Finally, the arguments used to prove the above results allow us to\nderive a root cause why scalar-valued neural network functions that fulfill our\nbounded width condition are not dense in spaces of continuous functions.\n","authors":["Hans-Peter Beise","Steve Dias Da Cruz"],"pdf_url":"https://arxiv.org/pdf/2011.04923v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12781v1","updated":"2023-11-21T18:45:52Z","published":"2023-11-21T18:45:52Z","title":"Quantifying Impairment and Disease Severity Using AI Models Trained on\n Healthy Subjects","summary":" Automatic assessment of impairment and disease severity is a key challenge in\ndata-driven medicine. 
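For reference, the median-of-means estimator analyzed in the mean-estimation entry above has a very short classical form: split the sample into blocks, average each block, and report the median of the block means. The sketch below is that textbook construction only, not the sub-Gaussian estimator of [LV22] nor the paper's neighborhood-optimality analysis.

```python
import numpy as np

def median_of_means(samples, num_blocks=20, rng=None):
    # Shuffle, split into num_blocks groups, average each group,
    # and return the median of the group means.
    rng = np.random.default_rng(rng)
    samples = np.asarray(samples, dtype=float)
    blocks = np.array_split(rng.permutation(samples), num_blocks)
    return float(np.median([b.mean() for b in blocks]))

# toy usage on a heavy-tailed sample (finite mean, infinite variance)
data = np.random.default_rng(0).standard_t(df=1.5, size=10_000)
print(median_of_means(data, num_blocks=30), data.mean())
```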
We propose a novel framework to address this challenge,\nwhich leverages AI models trained exclusively on healthy individuals. The\nCOnfidence-Based chaRacterization of Anomalies (COBRA) score exploits the\ndecrease in confidence of these models when presented with impaired or diseased\npatients to quantify their deviation from the healthy population. We applied\nthe COBRA score to address a key limitation of current clinical evaluation of\nupper-body impairment in stroke patients. The gold-standard Fugl-Meyer\nAssessment (FMA) requires in-person administration by a trained assessor for\n30-45 minutes, which restricts monitoring frequency and precludes physicians\nfrom adapting rehabilitation protocols to the progress of each patient. The\nCOBRA score, computed automatically in under one minute, is shown to be\nstrongly correlated with the FMA on an independent test cohort for two\ndifferent data modalities: wearable sensors ($\\rho = 0.845$, 95% CI\n[0.743,0.908]) and video ($\\rho = 0.746$, 95% C.I [0.594, 0.847]). To\ndemonstrate the generalizability of the approach to other conditions, the COBRA\nscore was also applied to quantify severity of knee osteoarthritis from\nmagnetic-resonance imaging scans, again achieving significant correlation with\nan independent clinical assessment ($\\rho = 0.644$, 95% C.I [0.585,0.696]).\n","authors":["Boyang Yu","Aakash Kaku","Kangning Liu","Avinash Parnandi","Emily Fokas","Anita Venkatesan","Natasha Pandit","Rajesh Ranganath","Heidi Schambra","Carlos Fernandez-Granda"],"pdf_url":"https://arxiv.org/pdf/2311.12781v1.pdf","comment":"32 pages, 10 figures"},{"id":"http://arxiv.org/abs/2305.12827v3","updated":"2023-11-21T18:43:43Z","published":"2023-05-22T08:39:25Z","title":"Task Arithmetic in the Tangent Space: Improved Editing of Pre-Trained\n Models","summary":" Task arithmetic has recently emerged as a cost-effective and scalable\napproach to edit pre-trained models directly in weight space: By adding the\nfine-tuned weights of different tasks, the model's performance can be improved\non these tasks, while negating them leads to task forgetting. Yet, our\nunderstanding of the effectiveness of task arithmetic and its underlying\nprinciples remains limited. We present a comprehensive study of task arithmetic\nin vision-language models and show that weight disentanglement is the crucial\nfactor that makes it effective. This property arises during pre-training and\nmanifests when distinct directions in weight space govern separate, localized\nregions in function space associated with the tasks. Notably, we show that\nfine-tuning models in their tangent space by linearizing them amplifies weight\ndisentanglement. This leads to substantial performance improvements across\nmultiple task arithmetic benchmarks and diverse models. Building on these\nfindings, we provide theoretical and empirical analyses of the neural tangent\nkernel (NTK) of these models and establish a compelling link between task\narithmetic and the spatial localization of the NTK eigenfunctions. 
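As a plain weight-space illustration of the task arithmetic discussed in the entry above (adding fine-tuned weight differences to improve on tasks, negating them to forget), here is a minimal sketch over PyTorch state dicts; it does not include the paper's NTK linearization, and the state-dict contents are placeholders.

```python
import torch

def task_vector(pretrained, finetuned):
    # A "task vector" is the difference between fine-tuned and pre-trained weights.
    return {k: finetuned[k] - pretrained[k] for k in pretrained}

def edit_model(pretrained, vectors_and_coeffs):
    # Add (coeff > 0) or negate (coeff < 0) task vectors directly in weight space.
    edited = {k: v.clone().float() for k, v in pretrained.items()}
    for vec, coeff in vectors_and_coeffs:
        for k in edited:
            edited[k] += coeff * vec[k]
    return edited

# usage sketch with placeholder state dicts of an identical architecture
sd_pre = {"w": torch.zeros(4)}
sd_task_a = {"w": torch.ones(4)}
learned_a = edit_model(sd_pre, [(task_vector(sd_pre, sd_task_a), 1.0)])   # add task A
forgot_a = edit_model(sd_pre, [(task_vector(sd_pre, sd_task_a), -1.0)])   # negate task A
```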
Overall, our\nwork uncovers novel insights into the fundamental mechanisms of task arithmetic\nand offers a more reliable and effective approach to edit pre-trained models\nthrough the NTK linearization.\n","authors":["Guillermo Ortiz-Jimenez","Alessandro Favero","Pascal Frossard"],"pdf_url":"https://arxiv.org/pdf/2305.12827v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09387v2","updated":"2023-11-21T18:31:57Z","published":"2023-11-15T21:30:26Z","title":"Banach-Tarski Embeddings and Transformers","summary":" We introduce a new construction of embeddings of arbitrary recursive data\nstructures into high dimensional vectors. These embeddings provide an\ninterpretable model for the latent state vectors of transformers. We\ndemonstrate that these embeddings can be decoded to the original data structure\nwhen the embedding dimension is sufficiently large. This decoding algorithm has\na natural implementation as a transformer. We also show that these embedding\nvectors can be manipulated directly to perform computations on the underlying\ndata without decoding. As an example we present an algorithm that constructs\nthe embedded parse tree of an embedded token sequence using only vector\noperations in embedding space.\n","authors":["Joshua Maher"],"pdf_url":"https://arxiv.org/pdf/2311.09387v2.pdf","comment":"22 pages, 7 figures. v2: Fixed order of matrix multiplication in\n section 2.4"},{"id":"http://arxiv.org/abs/2206.01255v5","updated":"2023-11-21T18:31:13Z","published":"2022-06-02T19:11:27Z","title":"Compressive Fourier collocation methods for high-dimensional diffusion\n equations with periodic boundary conditions","summary":" High-dimensional Partial Differential Equations (PDEs) are a popular\nmathematical modelling tool, with applications ranging from finance to\ncomputational chemistry. However, standard numerical techniques for solving\nthese PDEs are typically affected by the curse of dimensionality. In this work,\nwe tackle this challenge while focusing on stationary diffusion equations\ndefined over a high-dimensional domain with periodic boundary conditions.\nInspired by recent progress in sparse function approximation in high\ndimensions, we propose a new method called compressive Fourier collocation.\nCombining ideas from compressive sensing and spectral collocation, our method\nreplaces the use of structured collocation grids with Monte Carlo sampling and\nemploys sparse recovery techniques, such as orthogonal matching pursuit and\n$\\ell^1$ minimization, to approximate the Fourier coefficients of the PDE\nsolution. We conduct a rigorous theoretical analysis showing that the\napproximation error of the proposed method is comparable with the best $s$-term\napproximation (with respect to the Fourier basis) to the solution. Using the\nrecently introduced framework of random sampling in bounded Riesz systems, our\nanalysis shows that the compressive Fourier collocation method mitigates the\ncurse of dimensionality with respect to the number of collocation points under\nsufficient conditions on the regularity of the diffusion coefficient. 
We also\npresent numerical experiments that illustrate the accuracy and stability of the\nmethod for the approximation of sparse and compressible solutions.\n","authors":["Weiqi Wang","Simone Brugiapaglia"],"pdf_url":"https://arxiv.org/pdf/2206.01255v5.pdf","comment":"34 pages, 10 figures"},{"id":"http://arxiv.org/abs/2310.02168v2","updated":"2023-11-21T18:18:49Z","published":"2023-10-03T16:02:36Z","title":"Editing Personality for LLMs","summary":" This paper introduces an innovative task focused on editing the personality\ntraits of Large Language Models (LLMs). This task seeks to adjust the models'\nresponses to opinion-related questions on specified topics since an\nindividual's personality often manifests in the form of their expressed\nopinions, thereby showcasing different personality traits. Specifically, we\nconstruct a new benchmark dataset PersonalityEdit to address this task. Drawing\non the theory in Social Psychology, we isolate three representative traits,\nnamely Neuroticism, Extraversion, and Agreeableness, as the foundation for our\nbenchmark. We then gather data using GPT-4, generating responses that not only\nalign with a specified topic but also embody the targeted personality trait. We\nconduct comprehensive experiments involving various baselines and discuss the\nrepresentation of personality behavior in LLMs. Our intriguing findings uncover\npotential challenges of the proposed task, illustrating several remaining\nissues. We anticipate that our work can provide the NLP community with\ninsights. Code and datasets will be released at\nhttps://github.com/zjunlp/EasyEdit.\n","authors":["Shengyu Mao","Ningyu Zhang","Xiaohan Wang","Mengru Wang","Yunzhi Yao","Yong Jiang","Pengjun Xie","Fei Huang","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02168v2.pdf","comment":"Work in progress, add more experiments"},{"id":"http://arxiv.org/abs/2311.12760v1","updated":"2023-11-21T18:11:26Z","published":"2023-11-21T18:11:26Z","title":"High-resolution Image-based Malware Classification using Multiple\n Instance Learning","summary":" This paper proposes a novel method of classifying malware into families using\nhigh-resolution greyscale images and multiple instance learning to overcome\nadversarial binary enlargement. Current methods of visualisation-based malware\nclassification largely rely on lossy transformations of inputs such as resizing\nto handle the large, variable-sized images. Through empirical analysis and\nexperimentation, it is shown that these approaches cause crucial information\nloss that can be exploited. The proposed solution divides the images into\npatches and uses embedding-based multiple instance learning with a\nconvolutional neural network and an attention aggregation function for\nclassification. The implementation is evaluated on the Microsoft Malware\nClassification dataset and achieves accuracies of up to $96.6\\%$ on\nadversarially enlarged samples compared to the baseline of $22.8\\%$. 
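The malware-classification entry above divides high-resolution binaries-as-images into patches and aggregates patch embeddings with an attention function under embedding-based multiple instance learning. The sketch below shows one standard form of such attention pooling over precomputed patch embeddings (a simplification: the paper embeds patches with a convolutional network); dimensions and class count are illustrative, and this is not the authors' released code.

```python
import torch
import torch.nn as nn

class AttentionMILHead(nn.Module):
    # Pool a bag of patch embeddings with learned attention, then classify the bag.
    def __init__(self, in_dim=256, hidden=128, num_classes=9):
        super().__init__()
        self.embed = nn.Sequential(nn.Linear(in_dim, hidden), nn.ReLU())
        self.attention = nn.Sequential(nn.Linear(hidden, 64), nn.Tanh(), nn.Linear(64, 1))
        self.classifier = nn.Linear(hidden, num_classes)

    def forward(self, patches):                      # patches: (num_patches, in_dim)
        h = self.embed(patches)                      # (num_patches, hidden)
        a = torch.softmax(self.attention(h), dim=0)  # attention weight per patch
        bag = (a * h).sum(dim=0)                     # weighted bag-level embedding
        return self.classifier(bag)

# one "bag" = all patch embeddings extracted from a single malware image
logits = AttentionMILHead()(torch.randn(500, 256))
```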
The Python\ncode is available online at https://github.com/timppeters/MIL-Malware-Images .\n","authors":["Tim Peters","Hikmat Farhat"],"pdf_url":"https://arxiv.org/pdf/2311.12760v1.pdf","comment":"14 pages, 13 figures, 2 tables"},{"id":"http://arxiv.org/abs/2311.12754v1","updated":"2023-11-21T17:59:14Z","published":"2023-11-21T17:59:14Z","title":"SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction","summary":" 3D occupancy prediction is an important task for the robustness of\nvision-centric autonomous driving, which aims to predict whether each point is\noccupied in the surrounding 3D space. Existing methods usually require 3D\noccupancy labels to produce meaningful results. However, it is very laborious\nto annotate the occupancy status of each voxel. In this paper, we propose\nSelfOcc to explore a self-supervised way to learn 3D occupancy using only video\nsequences. We first transform the images into the 3D space (e.g., bird's eye\nview) to obtain 3D representation of the scene. We directly impose constraints\non the 3D representations by treating them as signed distance fields. We can\nthen render 2D images of previous and future frames as self-supervision signals\nto learn the 3D representations. We propose an MVS-embedded strategy to\ndirectly optimize the SDF-induced weights with multiple depth proposals. Our\nSelfOcc outperforms the previous best method SceneRF by 58.7% using a single\nframe as input on SemanticKITTI and is the first self-supervised work that\nproduces reasonable 3D occupancy for surround cameras on Occ3D. SelfOcc\nproduces high-quality depth and achieves state-of-the-art results on novel\ndepth synthesis, monocular depth estimation, and surround-view depth estimation\non the SemanticKITTI, KITTI-2015, and nuScenes, respectively. Code:\nhttps://github.com/huang-yh/SelfOcc.\n","authors":["Yuanhui Huang","Wenzhao Zheng","Borui Zhang","Jie Zhou","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2311.12754v1.pdf","comment":"Code is available at: https://github.com/huang-yh/SelfOcc"},{"id":"http://arxiv.org/abs/2310.02129v2","updated":"2023-11-21T17:59:04Z","published":"2023-10-03T15:10:46Z","title":"Unveiling the Pitfalls of Knowledge Editing for Large Language Models","summary":" As the cost associated with fine-tuning Large Language Models (LLMs)\ncontinues to rise, recent research efforts have pivoted towards developing\nmethodologies to edit implicit knowledge embedded within LLMs. Yet, there's\nstill a dark cloud lingering overhead -- will knowledge editing trigger\nbutterfly effect? since it is still unclear whether knowledge editing might\nintroduce side effects that pose potential risks or not. This paper pioneers\nthe investigation into the potential pitfalls associated with knowledge editing\nfor LLMs. To achieve this, we introduce new benchmark datasets and propose\ninnovative evaluation metrics. Our results underline two pivotal concerns: (1)\nKnowledge Conflict: Editing groups of facts that logically clash can magnify\nthe inherent inconsistencies in LLMs-a facet neglected by previous methods. (2)\nKnowledge Distortion: Altering parameters with the aim of editing factual\nknowledge can irrevocably warp the innate knowledge structure of LLMs.\nExperimental results vividly demonstrate that knowledge editing might\ninadvertently cast a shadow of unintended consequences on LLMs, which warrant\nattention and efforts for future works. 
Code is available at\nhttps://github.com/zjunlp/PitfallsKnowledgeEditing.\n","authors":["Zhoubo Li","Ningyu Zhang","Yunzhi Yao","Mengru Wang","Xi Chen","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2310.02129v2.pdf","comment":"Work in progress, add more experiments"},{"id":"http://arxiv.org/abs/2311.12750v1","updated":"2023-11-21T17:51:30Z","published":"2023-11-21T17:51:30Z","title":"Learning to Optimise Wind Farms with Graph Transformers","summary":" This work proposes a novel data-driven model capable of providing accurate\npredictions for the power generation of all wind turbines in wind farms of\narbitrary layout, yaw angle configurations and wind conditions. The proposed\nmodel functions by encoding a wind farm into a fully-connected graph and\nprocessing the graph representation through a graph transformer. The graph\ntransformer surrogate is shown to generalise well and is able to uncover latent\nstructural patterns within the graph representation of wind farms. It is\ndemonstrated how the resulting surrogate model can be used to optimise yaw\nangle configurations using genetic algorithms, achieving similar levels of\naccuracy to industrially-standard wind farm simulation tools while only taking\na fraction of the computational cost.\n","authors":["Siyi Li","Arnaud Robert","A. Aldo Faisal","Matthew D. Piggott"],"pdf_url":"https://arxiv.org/pdf/2311.12750v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06185v2","updated":"2023-11-21T17:42:42Z","published":"2023-11-10T17:06:28Z","title":"An Automated Pipeline for Tumour-Infiltrating Lymphocyte Scoring in\n Breast Cancer","summary":" Tumour-infiltrating lymphocytes (TILs) are considered as a valuable\nprognostic markers in both triple-negative and human epidermal growth factor\nreceptor 2 (HER2) positive breast cancer. In this study, we introduce an\ninnovative deep learning pipeline based on the Efficient-UNet architecture to\npredict the TILs score for breast cancer whole-slide images (WSIs). We first\nsegment tumour and stromal regions in order to compute a tumour bulk mask. We\nthen detect TILs within the tumour-associated stroma, generating a TILs score\nby closely mirroring the pathologist's workflow. Our method exhibits\nstate-of-the-art performance in segmenting tumour/stroma areas and TILs\ndetection, as demonstrated by internal cross-validation on the TiGER Challenge\ntraining dataset and evaluation on the final leaderboards. Additionally, our\nTILs score proves competitive in predicting survival outcomes within the same\nchallenge, underscoring the clinical relevance and potential of our automated\nTILs scoring pipeline as a breast cancer prognostic tool.\n","authors":["Adam J Shephard","Mostafa Jahanifar","Ruoyu Wang","Muhammad Dawood","Simon Graham","Kastytis Sidlauskas","Syed Ali Khurram","Nasir M Rajpoot","Shan E Ahmed Raza"],"pdf_url":"https://arxiv.org/pdf/2311.06185v2.pdf","comment":"5 pages, 1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2307.16189v6","updated":"2023-11-21T17:35:03Z","published":"2023-07-30T10:03:36Z","title":"Stable Adam Optimization for 16-bit Neural Networks Training","summary":" In this research, we address critical concerns related to the numerical\ninstability observed in 16-bit computations of machine learning models. Such\ninstability, particularly when employing popular optimization algorithms like\nAdam, often leads to unstable training of deep neural networks. 
This not only\ndisrupts the learning process but also poses significant challenges in\ndeploying dependable models in real-world applications. Our investigation\nidentifies the epsilon hyperparameter as the primary source of this\ninstability. A nuanced exploration reveals that subtle adjustments to epsilon\nwithin 16-bit computations can enhance the numerical stability of Adam,\nenabling more stable training of 16-bit neural networks. We propose a novel,\ndependable approach that leverages updates from the Adam optimizer to bolster\nthe stability of the learning process. Our contributions provide deeper\ninsights into optimization challenges in low-precision computations and offer\nsolutions to ensure the stability of deep neural network training, paving the\nway for their dependable use in various applications.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2307.16189v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12742v1","updated":"2023-11-21T17:31:10Z","published":"2023-11-21T17:31:10Z","title":"Image Transformation for IoT Time-Series Data: A Review","summary":" In the era of the Internet of Things (IoT), where smartphones, built-in\nsystems, wireless sensors, and nearly every smart device connect through local\nnetworks or the internet, billions of smart things communicate with each other\nand generate vast amounts of time-series data. As IoT time-series data is\nhigh-dimensional and high-frequency, time-series classification or regression\nhas been a challenging issue in IoT. Recently, deep learning algorithms have\ndemonstrated superior performance results in time-series data classification in\nmany smart and intelligent IoT applications. However, it is hard to explore the\nhidden dynamic patterns and trends in time-series. Recent studies show that\ntransforming IoT data into images improves the performance of the learning\nmodel. In this paper, we present a review of these studies which use image\ntransformation/encoding techniques in IoT domain. We examine the studies\naccording to their encoding techniques, data types, and application areas.\nLastly, we emphasize the challenges and future dimensions of image\ntransformation.\n","authors":["Duygu Altunkaya","Feyza Yildirim Okay","Suat Ozdemir"],"pdf_url":"https://arxiv.org/pdf/2311.12742v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2311.12741v1","updated":"2023-11-21T17:30:57Z","published":"2023-11-21T17:30:57Z","title":"Content Augmented Graph Neural Networks","summary":" In recent years, graph neural networks (GNNs) have become a popular tool for\nsolving various problems over graphs. In these models, the link structure of\nthe graph is typically exploited and nodes' embeddings are iteratively updated\nbased on adjacent nodes. Nodes' contents are used solely in the form of feature\nvectors, served as nodes' first-layer embeddings. However, the filters or\nconvolutions, applied during iterations/layers to these initial embeddings lead\nto their impact diminish and contribute insignificantly to the final\nembeddings. In order to address this issue, in this paper we propose augmenting\nnodes' embeddings by embeddings generating from their content, at higher GNN\nlayers. More precisely, we propose models wherein a structural embedding using\na GNN and a content embedding are computed for each node. 
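To make the epsilon issue from the 16-bit Adam entry above concrete: float16 cannot represent values below roughly 6e-8, so the common default eps=1e-8 (and tiny squared gradients) round to zero and the Adam update can blow up. The snippet below only demonstrates that underflow; the larger value shown is illustrative, not a recommendation from the paper.

```python
import torch

eps = torch.tensor(1e-8, dtype=torch.float16)       # underflows to 0.0 in float16
grad = torch.tensor(1e-4, dtype=torch.float16)

m = 0.1 * grad                                       # first-moment term (beta1 = 0.9)
v = 0.001 * grad * grad                              # second-moment term; grad**2 underflows to 0
update = m / (v.sqrt() + eps)
print(eps.item(), v.item(), update.item())           # 0.0, 0.0, inf -> unstable step

eps_safe = torch.tensor(1e-4, dtype=torch.float16)   # representable in float16 (illustrative value)
print((m / (v.sqrt() + eps_safe)).item())            # finite update
```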
These two are\ncombined using a combination layer to form the embedding of a node at a given\nlayer. We suggest methods such as using an auto-encoder or building a content\ngraph, to generate content embeddings. In the end, by conducting experiments\nover several real-world datasets, we demonstrate the high accuracy and\nperformance of our models.\n","authors":["Fatemeh Gholamzadeh Nasrabadi","AmirHossein Kashani","Pegah Zahedi","Mostafa Haghir Chehreghani"],"pdf_url":"https://arxiv.org/pdf/2311.12741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11995v2","updated":"2023-11-21T17:29:59Z","published":"2023-11-20T18:26:01Z","title":"BrainWash: A Poisoning Attack to Forget in Continual Learning","summary":" Continual learning has gained substantial attention within the deep learning\ncommunity, offering promising solutions to the challenging problem of\nsequential learning. Yet, a largely unexplored facet of this paradigm is its\nsusceptibility to adversarial attacks, especially with the aim of inducing\nforgetting. In this paper, we introduce \"BrainWash,\" a novel data poisoning\nmethod tailored to impose forgetting on a continual learner. By adding the\nBrainWash noise to a variety of baselines, we demonstrate how a trained\ncontinual learner can be induced to forget its previously learned tasks\ncatastrophically, even when using these continual learning baselines. An\nimportant feature of our approach is that the attacker requires no access to\nprevious tasks' data and is armed merely with the model's current parameters\nand the data belonging to the most recent task. Our extensive experiments\nhighlight the efficacy of BrainWash, showcasing degradation in performance\nacross various regularization-based continual learning methods.\n","authors":["Ali Abbasi","Parsa Nooralinejad","Hamed Pirsiavash","Soheil Kolouri"],"pdf_url":"https://arxiv.org/pdf/2311.11995v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03489v2","updated":"2023-11-21T17:28:17Z","published":"2023-11-06T19:58:26Z","title":"Leveraging High-Level Synthesis and Large Language Models to Generate,\n Simulate, and Deploy a Uniform Random Number Generator Hardware Design","summary":" We present a new high-level synthesis methodology for using large language\nmodel tools to generate hardware designs. The methodology uses exclusively\nopen-source tools excluding the large language model. As a case study, we use\nour methodology to generate a permuted congruential random number generator\ndesign with a wishbone interface. We verify the functionality and quality of\nthe random number generator design using large language model-generated\nsimulations and the Dieharder randomness test suite. We document all the large\nlanguage model chat logs, Python scripts, Verilog scripts, and simulation\nresults used in the case study. We believe that our method of hardware design\ngeneration coupled with the open source silicon 130 nm design tools will\nrevolutionize application-specific integrated circuit design. Our methodology\nsignificantly lowers the bar to entry when building domain-specific computing\naccelerators for the Internet of Things and proof of concept prototypes for\nlater fabrication in more modern process nodes.\n","authors":["James T. 
Meech"],"pdf_url":"https://arxiv.org/pdf/2311.03489v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12737v1","updated":"2023-11-21T17:23:05Z","published":"2023-11-21T17:23:05Z","title":"Exploring Graph Classification Techniques Under Low Data Constraints: A\n Comprehensive Study","summary":" This survey paper presents a brief overview of recent research on graph data\naugmentation and few-shot learning. It covers various techniques for graph data\naugmentation, including node and edge perturbation, graph coarsening, and graph\ngeneration, as well as the latest developments in few-shot learning, such as\nmeta-learning and model-agnostic meta-learning. The paper explores these areas\nin depth and delves into further sub classifications. Rule based approaches and\nlearning based approaches are surveyed under graph augmentation techniques.\nFew-Shot Learning on graphs is also studied in terms of metric learning\ntechniques and optimization-based techniques. In all, this paper provides an\nextensive array of techniques that can be employed in solving graph processing\nproblems faced in low-data scenarios.\n","authors":["Kush Kothari","Bhavya Mehta","Reshmika Nambiar","Seema Shrawne"],"pdf_url":"https://arxiv.org/pdf/2311.12737v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11774v2","updated":"2023-11-21T17:16:02Z","published":"2023-05-19T16:01:35Z","title":"Multi-Objective Optimization Using the R2 Utility","summary":" The goal of multi-objective optimization is to identify a collection of\npoints which describe the best possible trade-offs between the multiple\nobjectives. In order to solve this vector-valued optimization problem,\npractitioners often appeal to the use of scalarization functions in order to\ntransform the multi-objective problem into a collection of single-objective\nproblems. This set of scalarized problems can then be solved using traditional\nsingle-objective optimization techniques. In this work, we formalise this\nconvention into a general mathematical framework. We show how this strategy\neffectively recasts the original multi-objective optimization problem into a\nsingle-objective optimization problem defined over sets. An appropriate class\nof objective functions for this new problem is the R2 utility function, which\nis defined as a weighted integral over the scalarized optimization problems. We\nshow that this utility function is a monotone and submodular set function,\nwhich can be optimised effectively using greedy optimization algorithms. We\nanalyse the performance of these greedy algorithms both theoretically and\nempirically. Our analysis largely focusses on Bayesian optimization, which is a\npopular probabilistic framework for black-box optimization.\n","authors":["Ben Tu","Nikolas Kantas","Robert M. Lee","Behrang Shafei"],"pdf_url":"https://arxiv.org/pdf/2305.11774v2.pdf","comment":"The code is available at: https://github.com/benmltu/scalarize"},{"id":"http://arxiv.org/abs/2311.10801v2","updated":"2023-11-21T17:11:55Z","published":"2023-11-17T09:16:59Z","title":"Reinforcement Learning with Maskable Stock Representation for Portfolio\n Management in Customizable Stock Pools","summary":" Portfolio management (PM) is a fundamental financial trading task, which\nexplores the optimal periodical reallocation of capitals into different stocks\nto pursue long-term profits. Reinforcement learning (RL) has recently shown its\npotential to train profitable agents for PM through interacting with financial\nmarkets. 
However, existing work mostly focuses on fixed stock pools, which is\ninconsistent with investors' practical demand. Specifically, the target stock\npool of different investors varies dramatically due to their discrepancy on\nmarket states and individual investors may temporally adjust stocks they desire\nto trade (e.g., adding one popular stocks), which lead to customizable stock\npools (CSPs). Existing RL methods require to retrain RL agents even with a tiny\nchange of the stock pool, which leads to high computational cost and unstable\nperformance. To tackle this challenge, we propose EarnMore, a rEinforcement\nleARNing framework with Maskable stOck REpresentation to handle PM with CSPs\nthrough one-shot training in a global stock pool (GSP). Specifically, we first\nintroduce a mechanism to mask out the representation of the stocks outside the\ntarget pool. Second, we learn meaningful stock representations through a\nself-supervised masking and reconstruction process. Third, a re-weighting\nmechanism is designed to make the portfolio concentrate on favorable stocks and\nneglect the stocks outside the target pool. Through extensive experiments on 8\nsubset stock pools of the US stock market, we demonstrate that EarnMore\nsignificantly outperforms 14 state-of-the-art baselines in terms of 6 popular\nfinancial metrics with over 40% improvement on profit.\n","authors":["Wentao Zhang","Yilei Zhao","Shuo Sun","Jie Ying","Yonggang Xie","Zitao Song","Xinrun Wang","Bo An"],"pdf_url":"https://arxiv.org/pdf/2311.10801v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17844v2","updated":"2023-11-21T17:08:34Z","published":"2023-06-30T17:59:13Z","title":"The Clock and the Pizza: Two Stories in Mechanistic Explanation of\n Neural Networks","summary":" Do neural networks, trained on well-understood algorithmic tasks, reliably\nrediscover known algorithms for solving those tasks? Several recent studies, on\ntasks ranging from group arithmetic to in-context linear regression, have\nsuggested that the answer is yes. Using modular addition as a prototypical\nproblem, we show that algorithm discovery in neural networks is sometimes more\ncomplex. Small changes to model hyperparameters and initializations can induce\nthe discovery of qualitatively different algorithms from a fixed training set,\nand even parallel implementations of multiple such algorithms. Some networks\ntrained to perform modular addition implement a familiar Clock algorithm;\nothers implement a previously undescribed, less intuitive, but comprehensible\nprocedure which we term the Pizza algorithm, or a variety of even more complex\nprocedures. Our results show that even simple learning problems can admit a\nsurprising diversity of solutions, motivating the development of new tools for\ncharacterizing the behavior of neural networks across their algorithmic phase\nspace.\n","authors":["Ziqian Zhong","Ziming Liu","Max Tegmark","Jacob Andreas"],"pdf_url":"https://arxiv.org/pdf/2306.17844v2.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.12727v1","updated":"2023-11-21T17:03:21Z","published":"2023-11-21T17:03:21Z","title":"Soft Random Sampling: A Theoretical and Empirical Analysis","summary":" Soft random sampling (SRS) is a simple yet effective approach for efficient\ntraining of large-scale deep neural networks when dealing with massive data.\nSRS selects a subset uniformly at random with replacement from the full data\nset in each epoch. 
In this paper, we conduct a theoretical and empirical\nanalysis of SRS. First, we analyze its sampling dynamics including data\ncoverage and occupancy. Next, we investigate its convergence with non-convex\nobjective functions and give the convergence rate. Finally, we provide its\ngeneralization performance. We empirically evaluate SRS for image recognition\non CIFAR10 and automatic speech recognition on Librispeech and an in-house\npayload dataset to demonstrate its effectiveness. Compared to existing\ncoreset-based data selection methods, SRS offers a better accuracy-efficiency\ntrade-off. Especially on real-world industrial scale data sets, it is shown to\nbe a powerful training strategy with significant speedup and competitive\nperformance with almost no additional computing cost.\n","authors":["Xiaodong Cui","Ashish Mittal","Songtao Lu","Wei Zhang","George Saon","Brian Kingsbury"],"pdf_url":"https://arxiv.org/pdf/2311.12727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12722v1","updated":"2023-11-21T16:51:33Z","published":"2023-11-21T16:51:33Z","title":"Attacking Motion Planners Using Adversarial Perception Errors","summary":" Autonomous driving (AD) systems are often built and tested in a modular\nfashion, where the performance of different modules is measured using\ntask-specific metrics. These metrics should be chosen so as to capture the\ndownstream impact of each module and the performance of the system as a whole.\nFor example, high perception quality should enable prediction and planning to\nbe performed safely. Even though this is true in general, we show here that it\nis possible to construct planner inputs that score very highly on various\nperception quality metrics but still lead to planning failures. In an analogy\nto adversarial attacks on image classifiers, we call such inputs\n\\textbf{adversarial perception errors} and show they can be systematically\nconstructed using a simple boundary-attack algorithm. We demonstrate the\neffectiveness of this algorithm by finding attacks for two different black-box\nplanners in several urban and highway driving scenarios using the CARLA\nsimulator. Finally, we analyse the properties of these attacks and show that\nthey are isolated in the input space of the planner, and discuss their\nimplications for AD system deployment and testing.\n","authors":["Jonathan Sadeghi","Nicholas A. Lord","John Redford","Romain Mueller"],"pdf_url":"https://arxiv.org/pdf/2311.12722v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12716v1","updated":"2023-11-21T16:43:13Z","published":"2023-11-21T16:43:13Z","title":"minimax: Efficient Baselines for Autocurricula in JAX","summary":" Unsupervised environment design (UED) is a form of automatic curriculum\nlearning for training robust decision-making agents to zero-shot transfer into\nunseen environments. Such autocurricula have received much interest from the RL\ncommunity. However, UED experiments, based on CPU rollouts and GPU model\nupdates, have often required several weeks of training. This compute\nrequirement is a major obstacle to rapid innovation for the field. This work\nintroduces the minimax library for UED training on accelerated hardware. Using\nJAX to implement fully-tensorized environments and autocurriculum algorithms,\nminimax allows the entire training loop to be compiled for hardware\nacceleration. 
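A minimal sketch of the per-epoch sampling step described in the Soft Random Sampling entry above: each epoch draws a fraction of the indices uniformly at random with replacement and trains only on that subset. The dataset, fraction, and empty training step are placeholders.

```python
import torch
from torch.utils.data import DataLoader, Subset, TensorDataset

def srs_epoch_loader(dataset, fraction=0.3, batch_size=64):
    # Draw round(fraction * N) indices uniformly *with replacement* for this epoch.
    n = len(dataset)
    idx = torch.randint(0, n, (max(1, int(fraction * n)),)).tolist()
    return DataLoader(Subset(dataset, idx), batch_size=batch_size, shuffle=True)

# placeholder dataset and training loop
dataset = TensorDataset(torch.randn(1000, 16), torch.randint(0, 2, (1000,)))
for epoch in range(3):
    for x, y in srs_epoch_loader(dataset, fraction=0.3):
        pass   # forward / backward / optimizer step would go here
```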
To provide a petri dish for rapid experimentation, minimax\nincludes a tensorized grid-world based on MiniGrid, in addition to reusable\nabstractions for conducting autocurricula in procedurally-generated\nenvironments. With these components, minimax provides strong UED baselines,\nincluding new parallelized variants, which achieve over 120$\\times$ speedups in\nwall time compared to previous implementations when training with equal batch\nsizes. The minimax library is available under the Apache 2.0 license at\nhttps://github.com/facebookresearch/minimax.\n","authors":["Minqi Jiang","Michael Dennis","Edward Grefenstette","Tim Rocktäschel"],"pdf_url":"https://arxiv.org/pdf/2311.12716v1.pdf","comment":"Presented at ALOE 2023"},{"id":"http://arxiv.org/abs/2311.12715v1","updated":"2023-11-21T16:42:03Z","published":"2023-11-21T16:42:03Z","title":"Attacks of fairness in Federated Learning","summary":" Federated Learning is an important emerging distributed training paradigm\nthat keeps data private on clients. It is now well understood that by\ncontrolling only a small subset of FL clients, it is possible to introduce a\nbackdoor to a federated learning model, in the presence of certain attributes.\nIn this paper, we present a new type of attack that compromises the fairness of\nthe trained model. Fairness is understood to be the attribute-level performance\ndistribution of a trained model. It is particularly salient in domains where,\nfor example, skewed accuracy discrimination between subpopulations could have\ndisastrous consequences. We find that by employing a threat model similar to\nthat of a backdoor attack, an attacker is able to influence the aggregated\nmodel to have an unfair performance distribution between any given set of\nattributes. Furthermore, we find that this attack is possible by controlling\nonly a single client. While combating naturally induced unfairness in FL has\npreviously been discussed in depth, its artificially induced kind has been\nneglected. We show that defending against attacks on fairness should be a\ncritical consideration in any situation where unfairness in a trained model\ncould benefit a user who participated in its training.\n","authors":["Joseph Rance","Filip Svoboda"],"pdf_url":"https://arxiv.org/pdf/2311.12715v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.10852v6","updated":"2023-11-21T16:36:43Z","published":"2022-05-22T15:30:18Z","title":"Relphormer: Relational Graph Transformer for Knowledge Graph\n Representations","summary":" Transformers have achieved remarkable performance in widespread fields,\nincluding natural language processing, computer vision and graph mining.\nHowever, vanilla Transformer architectures have not yielded promising\nimprovements in the Knowledge Graph (KG) representations, where the\ntranslational distance paradigm dominates this area. Note that vanilla\nTransformer architectures struggle to capture the intrinsically heterogeneous\nstructural and semantic information of knowledge graphs. To this end, we\npropose a new variant of Transformer for knowledge graph representations dubbed\nRelphormer. Specifically, we introduce Triple2Seq which can dynamically sample\ncontextualized sub-graph sequences as the input to alleviate the heterogeneity\nissue. We propose a novel structure-enhanced self-attention mechanism to encode\nthe relational information and keep the semantic information within entities\nand relations. 
Moreover, we utilize masked knowledge modeling for general\nknowledge graph representation learning, which can be applied to various\nKG-based tasks including knowledge graph completion, question answering, and\nrecommendation. Experimental results on six datasets show that Relphormer can\nobtain better performance compared with baselines. Code is available in\nhttps://github.com/zjunlp/Relphormer.\n","authors":["Zhen Bi","Siyuan Cheng","Jing Chen","Xiaozhuan Liang","Feiyu Xiong","Ningyu Zhang"],"pdf_url":"https://arxiv.org/pdf/2205.10852v6.pdf","comment":"Neurocomputing 2023"},{"id":"http://arxiv.org/abs/2311.12711v1","updated":"2023-11-21T16:31:27Z","published":"2023-11-21T16:31:27Z","title":"Regression-Based Analysis of Multimodal Single-Cell Data Integration\n Strategies","summary":" Multimodal single-cell technologies enable the simultaneous collection of\ndiverse data types from individual cells, enhancing our understanding of\ncellular states. However, the integration of these datatypes and modeling the\ninterrelationships between modalities presents substantial computational and\nanalytical challenges in disease biomarker detection and drug discovery.\nEstablished practices rely on isolated methodologies to investigate individual\nmolecular aspects separately, often resulting in inaccurate analyses. To\naddress these obstacles, distinct Machine Learning Techniques are leveraged,\neach of its own kind to model the co-variation of DNA to RNA, and finally to\nsurface proteins in single cells during hematopoietic stem cell development,\nwhich simplifies understanding of underlying cellular mechanisms and immune\nresponses. Experiments conducted on a curated subset of a 300,000-cell time\ncourse dataset, highlights the exceptional performance of Echo State Networks,\nboasting a remarkable state-of-the-art correlation score of 0.94 and 0.895 on\nMulti-omic and CiteSeq datasets. Beyond the confines of this study, these\nfindings hold promise for advancing comprehension of cellular differentiation\nand function, leveraging the potential of Machine Learning.\n","authors":["Bhavya Mehta","Nirmit Deliwala","Madhav Chandane"],"pdf_url":"https://arxiv.org/pdf/2311.12711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07251v2","updated":"2023-11-21T16:26:36Z","published":"2023-05-12T04:53:59Z","title":"Machine-learning-accelerated simulations to enable automatic surface\n reconstruction","summary":" Understanding material surfaces and interfaces is vital in applications like\ncatalysis or electronics. By combining energies from electronic structure with\nstatistical mechanics, ab initio simulations can in principle predict the\nstructure of material surfaces as a function of thermodynamic variables.\nHowever, accurate energy simulations are prohibitive when coupled to the vast\nphase space that must be statistically sampled. Here, we present a bi-faceted\ncomputational loop to predict surface phase diagrams of multi-component\nmaterials that accelerates both the energy scoring and statistical sampling\nmethods. Fast, scalable, and data-efficient machine learning interatomic\npotentials are trained on high-throughput density-functional theory\ncalculations through closed-loop active learning. Markov-chain Monte Carlo\nsampling in the semi-grand canonical ensemble is enabled by using virtual\nsurface sites. 
The predicted surfaces for GaN(0001), Si(111), and SrTiO3(001)\nare in agreement with past work and suggest that the proposed strategy can\nmodel complex material surfaces and discover previously unreported surface\nterminations.\n","authors":["Xiaochen Du","James K. Damewood","Jaclyn R. Lunger","Reisel Millan","Bilge Yildiz","Lin Li","Rafael Gómez-Bombarelli"],"pdf_url":"https://arxiv.org/pdf/2305.07251v2.pdf","comment":"30 pages main, 15 figures/tables, 5 pages supplementary"},{"id":"http://arxiv.org/abs/2211.08942v2","updated":"2023-11-21T16:14:54Z","published":"2022-11-16T14:44:27Z","title":"Differentially Private Optimizers Can Learn Adversarially Robust Models","summary":" Machine learning models have shone in a variety of domains and attracted\nincreasing attention from both the security and the privacy communities. One\nimportant yet worrying question is: Will training models under the differential\nprivacy (DP) constraint have an unfavorable impact on their adversarial\nrobustness? While previous works have postulated that privacy comes at the cost\nof worse robustness, we give the first theoretical analysis to show that DP\nmodels can indeed be robust and accurate, even sometimes more robust than their\nnaturally-trained non-private counterparts. We observe three key factors that\ninfluence the privacy-robustness-accuracy tradeoff: (1) hyper-parameters for DP\noptimizers are critical; (2) pre-training on public data significantly\nmitigates the accuracy and robustness drop; (3) choice of DP optimizers makes a\ndifference. With these factors set properly, we achieve 90\\% natural accuracy,\n72\\% robust accuracy ($+9\\%$ than the non-private model) under $l_2(0.5)$\nattack, and 69\\% robust accuracy ($+16\\%$ than the non-private model) with\npre-trained SimCLRv2 model under $l_\\infty(4/255)$ attack on CIFAR10 with\n$\\epsilon=2$. In fact, we show both theoretically and empirically that DP\nmodels are Pareto optimal on the accuracy-robustness tradeoff. Empirically, the\nrobustness of DP models is consistently observed across various datasets and\nmodels. We believe our encouraging results are a significant step towards\ntraining models that are private as well as robust.\n","authors":["Yuan Zhang","Zhiqi Bu"],"pdf_url":"https://arxiv.org/pdf/2211.08942v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.02956v3","updated":"2023-11-21T15:57:17Z","published":"2022-05-05T22:56:19Z","title":"Low Dimensional Invariant Embeddings for Universal Geometric Learning","summary":" This paper studies separating invariants: mappings on $D$ dimensional domains\nwhich are invariant to an appropriate group action, and which separate orbits.\nThe motivation for this study comes from the usefulness of separating\ninvariants in proving universality of equivariant neural network architectures.\n We observe that in several cases the cardinality of separating invariants\nproposed in the machine learning literature is much larger than the dimension\n$D$. As a result, the theoretical universal constructions based on these\nseparating invariants is unrealistically large. Our goal in this paper is to\nresolve this issue.\n We show that when a continuous family of semi-algebraic separating invariants\nis available, separation can be obtained by randomly selecting $2D+1 $ of these\ninvariants. We apply this methodology to obtain an efficient scheme for\ncomputing separating invariants for several classical group actions which have\nbeen studied in the invariant learning literature. 
Examples include matrix\nmultiplication actions on point clouds by permutations, rotations, and various\nother linear groups.\n Often the requirement of invariant separation is relaxed and only generic\nseparation is required. In this case, we show that only $D+1$ invariants are\nrequired. More importantly, generic invariants are often significantly easier\nto compute, as we illustrate by discussing generic and full separation for\nweighted graphs. Finally we outline an approach for proving that separating\ninvariants can be constructed also when the random parameters have finite\nprecision.\n","authors":["Nadav Dym","Steven J. Gortler"],"pdf_url":"https://arxiv.org/pdf/2205.02956v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02143v2","updated":"2023-11-21T15:54:28Z","published":"2023-11-03T17:12:29Z","title":"Pairing-based graph neural network for simulating quantum materials","summary":" We develop a pairing-based graph neural network for simulating quantum\nmany-body systems. Our architecture augments a BCS-type geminal wavefunction\nwith a generalized pair amplitude parameterized by a graph neural network.\nVariational Monte Carlo with our neural network simultaneously provides an\naccurate, flexible, and scalable method for simulating many-electron systems.\nWe apply this method to two-dimensional semiconductor electron-hole bilayers\nand obtain accurate results on a variety of interaction-induced phases,\nincluding the exciton Bose-Einstein condensate, electron-hole superconductor,\nand bilayer Wigner crystal. Our study demonstrates the potential of\nphysically-motivated neural network wavefunctions for quantum materials\nsimulations.\n","authors":["Di Luo","David D. Dai","Liang Fu"],"pdf_url":"https://arxiv.org/pdf/2311.02143v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12689v1","updated":"2023-11-21T15:51:06Z","published":"2023-11-21T15:51:06Z","title":"Fair Text Classification with Wasserstein Independence","summary":" Group fairness is a central research topic in text classification, where\nreaching fair treatment between sensitive groups (e.g. women vs. men) remains\nan open challenge. This paper presents a novel method for mitigating biases in\nneural text classification, agnostic to the model architecture. Considering the\ndifficulty to distinguish fair from unfair information in a text encoder, we\ntake inspiration from adversarial training to induce Wasserstein independence\nbetween representations learned to predict our target label and the ones\nlearned to predict some sensitive attribute. Our approach provides two\nsignificant advantages. Firstly, it does not require annotations of sensitive\nattributes in both testing and training data. This is more suitable for\nreal-life scenarios compared to existing methods that require annotations of\nsensitive attributes at train time. Second, our approach exhibits a comparable\nor better fairness-accuracy trade-off compared to existing methods.\n","authors":["Thibaud Leteno","Antoine Gourru","Charlotte Laclau","Rémi Emonet","Christophe Gravier"],"pdf_url":"https://arxiv.org/pdf/2311.12689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12688v1","updated":"2023-11-21T15:50:37Z","published":"2023-11-21T15:50:37Z","title":"On the Out-of-Distribution Coverage of Combining Split Conformal\n Prediction and Bayesian Deep Learning","summary":" Bayesian deep learning and conformal prediction are two methods that have\nbeen used to convey uncertainty and increase safety in machine learning\nsystems. 
We focus on combining Bayesian deep learning with split conformal\nprediction and how this combination effects out-of-distribution coverage;\nparticularly in the case of multiclass image classification. We suggest that if\nthe model is generally underconfident on the calibration set, then the\nresultant conformal sets may exhibit worse out-of-distribution coverage\ncompared to simple predictive credible sets. Conversely, if the model is\noverconfident on the calibration set, the use of conformal prediction may\nimprove out-of-distribution coverage. We evaluate prediction sets as a result\nof combining split conformal methods and neural networks trained with (i)\nstochastic gradient descent, (ii) deep ensembles, and (iii) mean-field\nvariational inference. Our results suggest that combining Bayesian deep\nlearning models with split conformal prediction can, in some cases, cause\nunintended consequences such as reducing out-of-distribution coverage.\n","authors":["Paul Scemama","Ariel Kapusta"],"pdf_url":"https://arxiv.org/pdf/2311.12688v1.pdf","comment":"26 pages, 18 figures"},{"id":"http://arxiv.org/abs/2308.16113v2","updated":"2023-11-21T15:50:05Z","published":"2023-08-30T16:14:20Z","title":"survex: an R package for explaining machine learning survival models","summary":" Due to their flexibility and superior performance, machine learning models\nfrequently complement and outperform traditional statistical survival models.\nHowever, their widespread adoption is hindered by a lack of user-friendly tools\nto explain their internal operations and prediction rationales. To tackle this\nissue, we introduce the survex R package, which provides a cohesive framework\nfor explaining any survival model by applying explainable artificial\nintelligence techniques. The capabilities of the proposed software encompass\nunderstanding and diagnosing survival models, which can lead to their\nimprovement. By revealing insights into the decision-making process, such as\nvariable effects and importances, survex enables the assessment of model\nreliability and the detection of biases. Thus, transparency and responsibility\nmay be promoted in sensitive areas, such as biomedical research and healthcare\napplications.\n","authors":["Mikołaj Spytek","Mateusz Krzyziński","Sophie Hanna Langbein","Hubert Baniecki","Marvin N. Wright","Przemysław Biecek"],"pdf_url":"https://arxiv.org/pdf/2308.16113v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12686v1","updated":"2023-11-21T15:47:06Z","published":"2023-11-21T15:47:06Z","title":"Managing ML-Based Application Non-Functional Behavior: A Multi-Model\n Approach","summary":" Modern applications are increasingly driven by Machine Learning (ML) models\nwhose non-deterministic behavior is affecting the entire application life cycle\nfrom design to operation. The pervasive adoption of ML is urgently calling for\napproaches that guarantee a stable non-functional behavior of ML-based\napplications over time and across model changes. To this aim, non-functional\nproperties of ML models, such as privacy, confidentiality, fairness, and\nexplainability, must be monitored, verified, and maintained. This need is even\nmore pressing when modern applications operate in the edge-cloud continuum,\nincreasing their complexity and dynamicity. Existing approaches mostly focus on\ni) implementing classifier selection solutions according to the functional\nbehavior of ML models, ii) finding new algorithmic solutions to this need, such\nas continuous re-training. 
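For the split conformal part of the Bayesian-plus-conformal entry above, the calibration step has a standard short form: score held-out examples by one minus the probability assigned to the true class, take a finite-sample-corrected quantile, and include in each prediction set every class whose score falls below it. The sketch below is that generic recipe (the probabilities could come from any of the models discussed, e.g. a posterior-predictive mean); it is not the paper's experimental code.

```python
import numpy as np

def split_conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
    # cal_probs: (n, K) probabilities on the calibration set, cal_labels: (n,) true labels.
    n = len(cal_labels)
    scores = 1.0 - cal_probs[np.arange(n), cal_labels]            # nonconformity scores
    level = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)          # finite-sample correction
    qhat = np.quantile(scores, level, method="higher")
    return (1.0 - test_probs) <= qhat                             # boolean prediction sets, ~(1 - alpha) coverage

# toy usage with random "probabilities"
rng = np.random.default_rng(0)
cal_p, test_p = rng.dirichlet(np.ones(10), 500), rng.dirichlet(np.ones(10), 5)
sets = split_conformal_sets(cal_p, rng.integers(0, 10, 500), test_p)
```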
In this paper, we propose a multi-model approach\nbuilt on dynamic classifier selection, where multiple ML models showing similar\nnon-functional properties are made available to the application and one model\nis selected over time according to (dynamic and unpredictable) contextual\nchanges. Our solution goes beyond the state of the art by providing an\narchitectural and methodological approach that continuously guarantees a stable\nnon-functional behavior of ML-based applications, is applicable to different ML\nmodels, and is driven by non-functional properties assessed on the models\nthemselves. It consists of a two-step process working during application\noperation, where model assessment verifies non-functional properties of ML\nmodels trained and selected at development time, and model substitution\nguarantees a continuous and stable support of non-functional properties. We\nexperimentally evaluate our solution in a real-world scenario focusing on\nnon-functional property fairness.\n","authors":["Marco Anisetti","Claudio A. Ardagna","Nicola Bena","Ernesto Damiani","Paolo G. Panero"],"pdf_url":"https://arxiv.org/pdf/2311.12686v1.pdf","comment":"13 pages, 12 figures"},{"id":"http://arxiv.org/abs/2311.12684v1","updated":"2023-11-21T15:46:11Z","published":"2023-11-21T15:46:11Z","title":"Adversarial Reweighting Guided by Wasserstein Distance for Bias\n Mitigation","summary":" The unequal representation of different groups in a sample population can\nlead to discrimination of minority groups when machine learning models make\nautomated decisions. To address these issues, fairness-aware machine learning\njointly optimizes two (or more) metrics aiming at predictive effectiveness and\nlow unfairness. However, the inherent under-representation of minorities in the\ndata makes the disparate treatment of subpopulations less noticeable and\ndifficult to deal with during learning. In this paper, we propose a novel\nadversarial reweighting method to address such \\emph{representation bias}. To\nbalance the data distribution between the majority and the minority groups, our\napproach deemphasizes samples from the majority group. To minimize empirical\nrisk, our method prefers samples from the majority group that are close to the\nminority group as evaluated by the Wasserstein distance. Our theoretical\nanalysis shows the effectiveness of our adversarial reweighting approach.\nExperiments demonstrate that our approach mitigates bias without sacrificing\nclassification accuracy, outperforming related state-of-the-art methods on\nimage and tabular benchmark datasets.\n","authors":["Xuan Zhao","Simone Fabbrizzi","Paula Reyero Lobo","Siamak Ghodsi","Klaus Broelemann","Steffen Staab","Gjergji Kasneci"],"pdf_url":"https://arxiv.org/pdf/2311.12684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2012.12311v3","updated":"2023-11-21T15:40:54Z","published":"2020-12-22T19:32:52Z","title":"Influencer Videos: Unboxing the Mystique","summary":" Influencer marketing has become a very popular tool to reach customers.\nDespite the rapid growth in influencer videos, there has been little research\non the effectiveness of their constituent features in explaining video\nengagement. We study YouTube influencers and analyze their unstructured video\ndata across text, audio and images using an \"interpretable deep learning\"\nframework that accomplishes both goals of prediction and interpretation. 
Our\nprediction-based approach analyzes unstructured data and finds that \"what is\nsaid\" in words (text) is more influential than \"how it is said\" in imagery\n(images) or acoustics (audio). Our novel interpretation-based approach is\nimplemented after completion of model prediction by analyzing the same source\nof unstructured data to measure importance attributed to the video features. We\neliminate several spurious relationships in two steps, identifying a subset of\nrelationships which are confirmed using theory. We uncover novel findings that\nestablish distinct associations for measures of shallow and deep engagement\nbased on the dual-system framework of human thinking. Our approach is validated\nusing simulated data, and we discuss the learnings from our findings for\ninfluencers and brands.\n","authors":["Prashant Rajaram","Puneet Manchanda"],"pdf_url":"https://arxiv.org/pdf/2012.12311v3.pdf","comment":"45 pages, Online Appendix"},{"id":"http://arxiv.org/abs/2311.12679v1","updated":"2023-11-21T15:37:19Z","published":"2023-11-21T15:37:19Z","title":"BundleMoCap: Efficient, Robust and Smooth Motion Capture from Sparse\n Multiview Videos","summary":" Capturing smooth motions from videos using markerless techniques typically\ninvolves complex processes such as temporal constraints, multiple stages with\ndata-driven regression and optimization, and bundle solving over temporal\nwindows. These processes can be inefficient and require tuning multiple\nobjectives across stages. In contrast, BundleMoCap introduces a novel and\nefficient approach to this problem. It solves the motion capture task in a\nsingle stage, eliminating the need for temporal smoothness objectives while\nstill delivering smooth motions. BundleMoCap outperforms the state-of-the-art\nwithout increasing complexity. The key concept behind BundleMoCap is manifold\ninterpolation between latent keyframes. By relying on a local manifold\nsmoothness assumption, we can efficiently solve a bundle of frames using a\nsingle code. Additionally, the method can be implemented as a sliding window\noptimization and requires only the first frame to be properly initialized,\nreducing the overall computational burden. BundleMoCap's strength lies in its\nability to achieve high-quality motion capture results with simplicity and\nefficiency. More details can be found at https://moverseai.github.io/bundle/.\n","authors":["Georgios Albanis","Nikolaos Zioulis","Kostas Kolomvatsos"],"pdf_url":"https://arxiv.org/pdf/2311.12679v1.pdf","comment":"Published in European Conference on Visual Media Production (CVMP\n '23)"},{"id":"http://arxiv.org/abs/2311.12678v1","updated":"2023-11-21T15:36:20Z","published":"2023-11-21T15:36:20Z","title":"Interpretation of the Transformer and Improvement of the Extractor","summary":" It has been over six years since the Transformer architecture was put\nforward. Surprisingly, the vanilla Transformer architecture is still widely\nused today. One reason is that the lack of deep understanding and comprehensive\ninterpretation of the Transformer architecture makes it more challenging to\nimprove the Transformer architecture. In this paper, we first interpret the\nTransformer architecture comprehensively in plain words based on our\nunderstanding and experiences. The interpretations are further proved and\nverified. 
These interpretations also cover the Extractor, a family of drop-in\nreplacements for the multi-head self-attention in the Transformer architecture.\nThen, we propose an improvement on a type of the Extractor that outperforms the\nself-attention, without introducing additional trainable parameters.\nExperimental results demonstrate that the improved Extractor performs even\nbetter, showing a way to improve the Transformer architecture.\n","authors":["Zhe Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12674v1","updated":"2023-11-21T15:31:16Z","published":"2023-11-21T15:31:16Z","title":"Contrastive Left-Right Wearable Sensors (IMUs) Consistency Matching for\n HAR","summary":" Machine learning algorithms are improving rapidly, but annotating training\ndata remains a bottleneck for many applications. In this paper, we show how\nreal data can be used for self-supervised learning without any transformations\nby taking advantage of the symmetry present in the activities. Our approach\ninvolves contrastive matching of two different sensors (left and right wrist or\nleg-worn IMUs) to make representations of co-occurring sensor data more similar\nand those of non-co-occurring sensor data more different. We test our approach\non the Opportunity and MM-Fit datasets. In MM-Fit we show significant\nimprovement over the baseline supervised and self-supervised method SimCLR,\nwhile for Opportunity there is significant improvement over the supervised\nbaseline and slight improvement when compared to SimCLR. Moreover, our method\nimproves supervised baselines even when using only a small amount of the data\nfor training. Future work should explore under which conditions our method is\nbeneficial for human activity recognition systems and other related\napplications.\n","authors":["Dominique Nshimyimana","Vitor Fortes Rey","Paul Lukowic"],"pdf_url":"https://arxiv.org/pdf/2311.12674v1.pdf","comment":"Accepted at ABC 2023. The 5th International Conference on Activity\n and Behavior Computing September 7th - 9th, 2023 in Kaiserslautern, Germany\n (Hybrid)"},{"id":"http://arxiv.org/abs/2311.12670v1","updated":"2023-11-21T15:28:44Z","published":"2023-11-21T15:28:44Z","title":"Towards a more inductive world for drug repurposing approaches","summary":" Drug-target interaction (DTI) prediction is a challenging, albeit essential\ntask in drug repurposing. Learning on graph models have drawn special attention\nas they can significantly reduce drug repurposing costs and time commitment.\nHowever, many current approaches require high-demanding additional information\nbesides DTIs that complicates their evaluation process and usability.\nAdditionally, structural differences in the learning architecture of current\nmodels hinder their fair benchmarking. In this work, we first perform an\nin-depth evaluation of current DTI datasets and prediction models through a\nrobust benchmarking process, and show that DTI prediction methods based on\ntransductive models lack generalization and lead to inflated performance when\nevaluated as previously done in the literature, hence not being suited for drug\nrepurposing approaches. We then propose a novel biologically-driven strategy\nfor negative edge subsampling and show through in vitro validation that newly\ndiscovered interactions are indeed true. We envision this work as the\nunderpinning for future fair benchmarking and robust model design. 
All\ngenerated resources and tools are publicly available as a python package.\n","authors":["Jesus de la Fuente","Guillermo Serrano","Uxía Veleiro","Mikel Casals","Laura Vera","Marija Pizurica","Antonio Pineda-Lucena","Idoia Ochoa","Silve Vicent","Olivier Gevaert","Mikel Hernaez"],"pdf_url":"https://arxiv.org/pdf/2311.12670v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2004.07780v5","updated":"2023-11-21T15:22:43Z","published":"2020-04-16T17:18:49Z","title":"Shortcut Learning in Deep Neural Networks","summary":" Deep learning has triggered the current rise of artificial intelligence and\nis the workhorse of today's machine intelligence. Numerous success stories have\nrapidly spread all over science, industry and society, but its limitations have\nonly recently come into focus. In this perspective we seek to distill how many\nof deep learning's problems can be seen as different symptoms of the same\nunderlying problem: shortcut learning. Shortcuts are decision rules that\nperform well on standard benchmarks but fail to transfer to more challenging\ntesting conditions, such as real-world scenarios. Related issues are known in\nComparative Psychology, Education and Linguistics, suggesting that shortcut\nlearning may be a common characteristic of learning systems, biological and\nartificial alike. Based on these observations, we develop a set of\nrecommendations for model interpretation and benchmarking, highlighting recent\nadvances in machine learning to improve robustness and transferability from the\nlab to real-world applications.\n","authors":["Robert Geirhos","Jörn-Henrik Jacobsen","Claudio Michaelis","Richard Zemel","Wieland Brendel","Matthias Bethge","Felix A. Wichmann"],"pdf_url":"https://arxiv.org/pdf/2004.07780v5.pdf","comment":"perspective article published at Nature Machine Intelligence\n (https://doi.org/10.1038/s42256-020-00257-z)"},{"id":"http://arxiv.org/abs/2311.12666v1","updated":"2023-11-21T15:18:29Z","published":"2023-11-21T15:18:29Z","title":"SSVEP-DAN: A Data Alignment Network for SSVEP-based Brain Computer\n Interfaces","summary":" Steady-state visual-evoked potential (SSVEP)-based brain-computer interfaces\n(BCIs) offer a non-invasive means of communication through high-speed speller\nsystems. However, their efficiency heavily relies on individual training data\nobtained during time-consuming calibration sessions. To address the challenge\nof data insufficiency in SSVEP-based BCIs, we present SSVEP-DAN, the first\ndedicated neural network model designed for aligning SSVEP data across\ndifferent domains, which can encompass various sessions, subjects, or devices.\nOur experimental results across multiple cross-domain scenarios demonstrate\nSSVEP-DAN's capability to transform existing source SSVEP data into\nsupplementary calibration data, significantly enhancing SSVEP decoding accuracy\nin scenarios with limited calibration data. We envision SSVEP-DAN as a catalyst\nfor practical SSVEP-based BCI applications with minimal calibration. 
The source\ncodes in this work are available at: https://github.com/CECNL/SSVEP-DAN.\n","authors":["Sung-Yu Chen","Chi-Min Chang","Kuan-Jung Chiang","Chun-Shu Wei"],"pdf_url":"https://arxiv.org/pdf/2311.12666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11908v2","updated":"2023-11-21T15:17:00Z","published":"2023-11-20T16:40:29Z","title":"Continual Learning: Applications and the Road Forward","summary":" Continual learning is a sub-field of machine learning, which aims to allow\nmachine learning models to continuously learn on new data, by accumulating\nknowledge without forgetting what was learned in the past. In this work, we\ntake a step back, and ask: \"Why should one care about continual learning in the\nfirst place?\". We set the stage by surveying recent continual learning papers\npublished at three major machine learning conferences, and show that\nmemory-constrained settings dominate the field. Then, we discuss five open\nproblems in machine learning, and even though they seem unrelated to continual\nlearning at first sight, we show that continual learning will inevitably be\npart of their solution. These problems are model-editing, personalization,\non-device learning, faster (re-)training and reinforcement learning. Finally,\nby comparing the desiderata from these unsolved problems and the current\nassumptions in continual learning, we highlight and discuss four future\ndirections for continual learning research. We hope that this work offers an\ninteresting perspective on the future of continual learning, while displaying\nits potential value and the paths we have to pursue in order to make it\nsuccessful. This work is the result of the many discussions the authors had at\nthe Dagstuhl seminar on Deep Continual Learning, in March 2023.\n","authors":["Eli Verwimp","Rahaf Aljundi","Shai Ben-David","Matthias Bethge","Andrea Cossu","Alexander Gepperth","Tyler L. Hayes","Eyke Hüllermeier","Christopher Kanan","Dhireesha Kudithipudi","Christoph H. Lampert","Martin Mundt","Razvan Pascanu","Adrian Popescu","Andreas S. Tolias","Joost van de Weijer","Bing Liu","Vincenzo Lomonaco","Tinne Tuytelaars","Gido M. van de Ven"],"pdf_url":"https://arxiv.org/pdf/2311.11908v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12657v1","updated":"2023-11-21T15:01:14Z","published":"2023-11-21T15:01:14Z","title":"Carbohydrate NMR chemical shift predictions using E(3) equivariant graph\n neural networks","summary":" Carbohydrates, vital components of biological systems, are well-known for\ntheir structural diversity. Nuclear Magnetic Resonance (NMR) spectroscopy plays\na crucial role in understanding their intricate molecular arrangements and is\nessential in assessing and verifying the molecular structure of organic\nmolecules. An important part of this process is to predict the NMR chemical\nshift from the molecular structure. This work introduces a novel approach that\nleverages E(3) equivariant graph neural networks to predict carbohydrate NMR\nspectra. Notably, our model achieves a substantial reduction in mean absolute\nerror, up to threefold, compared to traditional models that rely solely on\ntwo-dimensional molecular structure. Even with limited data, the model excels,\nhighlighting its robustness and generalization capabilities. The implications\nare far-reaching and go beyond an advanced understanding of carbohydrate\nstructures and spectral interpretation. 
For example, it could accelerate\nresearch in pharmaceutical applications, biochemistry, and structural biology,\noffering a faster and more reliable analysis of molecular structures.\nFurthermore, our approach is a key step towards a new data-driven era in\nspectroscopy, potentially influencing spectroscopic techniques beyond NMR.\n","authors":["Maria Bånkestad","Keven M. Dorst","Göran Widmalm","Jerk Rönnols"],"pdf_url":"https://arxiv.org/pdf/2311.12657v1.pdf","comment":"13 pages, 9 figures, 2 tables"},{"id":"http://arxiv.org/abs/2304.12023v2","updated":"2023-11-21T14:59:37Z","published":"2023-04-24T11:44:00Z","title":"Multi-channel Speech Separation Using Spatially Selective Deep\n Non-linear Filters","summary":" In a multi-channel separation task with multiple speakers, we aim to recover\nall individual speech signals from the mixture. In contrast to single-channel\napproaches, which rely on the different spectro-temporal characteristics of the\nspeech signals, multi-channel approaches should additionally utilize the\ndifferent spatial locations of the sources for a more powerful separation\nespecially when the number of sources increases. To enhance the spatial\nprocessing in a multi-channel source separation scenario, in this work, we\npropose a deep neural network (DNN) based spatially selective filter (SSF) that\ncan be spatially steered to extract the speaker of interest by initializing a\nrecurrent neural network layer with the target direction. We compare the\nproposed SSF with a common end-to-end direct separation (DS) approach trained\nusing utterance-wise permutation invariant training (PIT), which only\nimplicitly learns to perform spatial filtering. We show that the SSF has a\nclear advantage over a DS approach with the same underlying network\narchitecture when there are more than two speakers in the mixture, which can be\nattributed to a better use of the spatial information. Furthermore, we find\nthat the SSF generalizes much better to additional noise sources that were not\nseen during training and to scenarios with speakers positioned at a similar\nangle.\n","authors":["Kristina Tesch","Timo Gerkmann"],"pdf_url":"https://arxiv.org/pdf/2304.12023v2.pdf","comment":"Accepted version"},{"id":"http://arxiv.org/abs/2302.04181v2","updated":"2023-11-21T14:56:21Z","published":"2023-02-08T16:40:11Z","title":"Attending to Graph Transformers","summary":" Recently, transformer architectures for graphs emerged as an alternative to\nestablished techniques for machine learning with graphs, such as\n(message-passing) graph neural networks. So far, they have shown promising\nempirical results, e.g., on molecular prediction datasets, often attributed to\ntheir ability to circumvent graph neural networks' shortcomings, such as\nover-smoothing and over-squashing. Here, we derive a taxonomy of graph\ntransformer architectures, bringing some order to this emerging field. We\noverview their theoretical properties, survey structural and positional\nencodings, and discuss extensions for important graph classes, e.g., 3D\nmolecular graphs. Empirically, we probe how well graph transformers can recover\nvarious graph properties, how well they can deal with heterophilic graphs, and\nto what extent they prevent over-squashing. Further, we outline open challenges\nand research direction to stimulate future work. 
Our code is available at\nhttps://github.com/luis-mueller/probing-graph-transformers.\n","authors":["Luis Müller","Mikhail Galkin","Christopher Morris","Ladislav Rampášek"],"pdf_url":"https://arxiv.org/pdf/2302.04181v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04158v2","updated":"2023-11-21T14:55:52Z","published":"2023-11-07T17:34:56Z","title":"Computing Approximate $\\ell_p$ Sensitivities","summary":" Recent works in dimensionality reduction for regression tasks have introduced\nthe notion of sensitivity, an estimate of the importance of a specific\ndatapoint in a dataset, offering provable guarantees on the quality of the\napproximation after removing low-sensitivity datapoints via subsampling.\nHowever, fast algorithms for approximating $\\ell_p$ sensitivities, which we\nshow is equivalent to approximate $\\ell_p$ regression, are known for only the\n$\\ell_2$ setting, in which they are termed leverage scores.\n In this work, we provide efficient algorithms for approximating $\\ell_p$\nsensitivities and related summary statistics of a given matrix. In particular,\nfor a given $n \\times d$ matrix, we compute $\\alpha$-approximation to its\n$\\ell_1$ sensitivities at the cost of $O(n/\\alpha)$ sensitivity computations.\nFor estimating the total $\\ell_p$ sensitivity (i.e. the sum of $\\ell_p$\nsensitivities), we provide an algorithm based on importance sampling of\n$\\ell_p$ Lewis weights, which computes a constant factor approximation to the\ntotal sensitivity at the cost of roughly $O(\\sqrt{d})$ sensitivity\ncomputations. Furthermore, we estimate the maximum $\\ell_1$ sensitivity, up to\na $\\sqrt{d}$ factor, using $O(d)$ sensitivity computations. We generalize all\nthese results to $\\ell_p$ norms for $p > 1$. Lastly, we experimentally show\nthat for a wide class of matrices in real-world datasets, the total sensitivity\ncan be quickly approximated and is significantly smaller than the theoretical\nprediction, demonstrating that real-world datasets have low intrinsic effective\ndimensionality.\n","authors":["Swati Padmanabhan","David P. Woodruff","Qiuyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.04158v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12652v1","updated":"2023-11-21T14:53:39Z","published":"2023-11-21T14:53:39Z","title":"FedDRO: Federated Compositional Optimization for Distributionally Robust\n Learning","summary":" Recently, compositional optimization (CO) has gained popularity because of\nits applications in distributionally robust optimization (DRO) and many other\nmachine learning problems. Large-scale and distributed availability of data\ndemands the development of efficient federated learning (FL) algorithms for\nsolving CO problems. Developing FL algorithms for CO is particularly\nchallenging because of the compositional nature of the objective. Moreover,\ncurrent state-of-the-art methods to solve such problems rely on large batch\ngradients (depending on the solution accuracy) not feasible for most practical\nsettings. To address these challenges, in this work, we propose efficient\nFedAvg-type algorithms for solving non-convex CO in the FL setting. We first\nestablish that vanilla FedAvg is not suitable to solve distributed CO problems\nbecause of the data heterogeneity in the compositional objective at each client\nwhich leads to the amplification of bias in the local compositional gradient\nestimates. 
To this end, we propose a novel FL framework FedDRO that utilizes\nthe DRO problem structure to design a communication strategy that allows FedAvg\nto control the bias in the estimation of the compositional gradient. A key\nnovelty of our work is to develop solution accuracy-independent algorithms that\ndo not require large batch gradients (and function evaluations) for solving\nfederated CO problems. We establish $\\mathcal{O}(\\epsilon^{-2})$ sample and\n$\\mathcal{O}(\\epsilon^{-3/2})$ communication complexity in the FL setting while\nachieving linear speedup with the number of clients. We corroborate our\ntheoretical findings with empirical studies on large-scale DRO problems.\n","authors":["Prashant Khanduri","Chengyin Li","Rafi Ibn Sultan","Yao Qiang","Joerg Kliewer","Dongxiao Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.12652v1.pdf","comment":"38 Pages, 6 Figures"},{"id":"http://arxiv.org/abs/2310.19805v3","updated":"2023-11-21T14:50:12Z","published":"2023-10-07T00:02:05Z","title":"Sample Efficient Reward Augmentation in offline-to-online Reinforcement\n Learning","summary":" Offline-to-online RL can make full use of pre-collected offline datasets to\ninitialize policies, resulting in higher sample efficiency and better\nperformance compared to only using online algorithms alone for policy training.\nHowever, direct fine-tuning of the pre-trained policy tends to result in\nsub-optimal performance. A primary reason is that conservative offline RL\nmethods diminish the agent's capability of exploration, thereby impacting\nonline fine-tuning performance. To encourage agent's exploration during online\nfine-tuning and enhance the overall online fine-tuning performance, we propose\na generalized reward augmentation method called Sample Efficient Reward\nAugmentation (SERA). Specifically, SERA encourages agent to explore by\ncomputing Q conditioned entropy as intrinsic reward. The advantage of SERA is\nthat it can extensively utilize offline pre-trained Q to encourage agent\nuniformly coverage of state space while considering the imbalance between the\ndistributions of high-value and low-value states. Additionally, SERA can be\neffortlessly plugged into various RL algorithms to improve online fine-tuning\nand ensure sustained asymptotic improvement. Moreover, extensive experimental\nresults demonstrate that when conducting offline-to-online problems, SERA\nconsistently and effectively enhances the performance of various offline\nalgorithms.\n","authors":["Ziqi Zhang","Xiao Xiong","Zifeng Zhuang","Jinxin Liu","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2310.19805v3.pdf","comment":"23 pages, 11 Figures, and 6 Tables"},{"id":"http://arxiv.org/abs/2311.12644v1","updated":"2023-11-21T14:44:51Z","published":"2023-11-21T14:44:51Z","title":"Careful Selection and Thoughtful Discarding: Graph Explicit Pooling\n Utilizing Discarded Nodes","summary":" Graph pooling has been increasingly recognized as crucial for Graph Neural\nNetworks (GNNs) to facilitate hierarchical graph representation learning.\nExisting graph pooling methods commonly consist of two stages: selecting\ntop-ranked nodes and discarding the remaining to construct coarsened graph\nrepresentations. However, this paper highlights two key issues with these\nmethods: 1) The process of selecting nodes to discard frequently employs\nadditional Graph Convolutional Networks or Multilayer Perceptrons, lacking a\nthorough evaluation of each node's impact on the final graph representation and\nsubsequent prediction tasks. 
2) Current graph pooling methods tend to directly\ndiscard the noise segment (dropped) of the graph without accounting for the\nlatent information contained within these elements. To address the first issue,\nwe introduce a novel Graph Explicit Pooling (GrePool) method, which selects\nnodes by explicitly leveraging the relationships between the nodes and final\nrepresentation vectors crucial for classification. The second issue is\naddressed using an extended version of GrePool (i.e., GrePool+), which applies\na uniform loss on the discarded nodes. This addition is designed to augment the\ntraining process and improve classification accuracy. Furthermore, we conduct\ncomprehensive experiments across 12 widely used datasets to validate our\nproposed method's effectiveness, including the Open Graph Benchmark datasets.\nOur experimental results uniformly demonstrate that GrePool outperforms 14\nbaseline methods for most datasets. Likewise, implementing GrePool+ enhances\nGrePool's performance without incurring additional computational costs.\n","authors":["Chuang Liu","Wenhang Yu","Kuang Gao","Xueqi Ma","Yibing Zhan","Jia Wu","Bo Du","Wenbin Hu"],"pdf_url":"https://arxiv.org/pdf/2311.12644v1.pdf","comment":"14 pages, 7 figures, 4 tables. Submitting to Science China\n Information Sciences"},{"id":"http://arxiv.org/abs/2311.12630v1","updated":"2023-11-21T14:24:21Z","published":"2023-11-21T14:24:21Z","title":"Hierarchical Joint Graph Learning and Multivariate Time Series\n Forecasting","summary":" Multivariate time series is prevalent in many scientific and industrial\ndomains. Modeling multivariate signals is challenging due to their long-range\ntemporal dependencies and intricate interactions--both direct and indirect. To\nconfront these complexities, we introduce a method of representing multivariate\nsignals as nodes in a graph with edges indicating interdependency between them.\nSpecifically, we leverage graph neural networks (GNN) and attention mechanisms\nto efficiently learn the underlying relationships within the time series data.\nMoreover, we suggest employing hierarchical signal decompositions running over\nthe graphs to capture multiple spatial dependencies. The effectiveness of our\nproposed model is evaluated across various real-world benchmark datasets\ndesigned for long-term forecasting tasks. The results consistently showcase the\nsuperiority of our model, achieving an average 23\\% reduction in mean squared\nerror (MSE) compared to existing models.\n","authors":["Juhyeon Kim","Hyungeun Lee","Seungwon Yu","Ung Hwang","Wooyul Jung","Miseon Park","Kijung Yoon"],"pdf_url":"https://arxiv.org/pdf/2311.12630v1.pdf","comment":"Temporal Graph Learning Workshop @ NeurIPS 2023, New Orleans, United\n States"},{"id":"http://arxiv.org/abs/2311.12624v1","updated":"2023-11-21T14:18:28Z","published":"2023-11-21T14:18:28Z","title":"Bridging Algorithmic Information Theory and Machine Learning: A New\n Approach to Kernel Learning","summary":" Machine Learning (ML) and Algorithmic Information Theory (AIT) look at\nComplexity from different points of view. We explore the interface between AIT\nand Kernel Methods (that are prevalent in ML) by adopting an AIT perspective on\nthe problem of learning kernels from data, in kernel ridge regression, through\nthe method of Sparse Kernel Flows. 
In particular, by looking at the differences\nand commonalities between Minimal Description Length (MDL) and Regularization\nin Machine Learning (RML), we prove that the method of Sparse Kernel Flows is\nthe natural approach to adopt to learn kernels from data. This paper shows that\nit is not necessary to use the statistical route to derive Sparse Kernel Flows\nand that one can directly work with code-lengths and complexities that are\nconcepts that show up in AIT.\n","authors":["Boumediene Hamzi","Marcus Hutter","Houman Owhadi"],"pdf_url":"https://arxiv.org/pdf/2311.12624v1.pdf","comment":"An earlier version of this paper appeared at\n https://www.researchgate.net/publication/371875631_A_note_on_learning_kernels_from_data_from_an_Algorithmic_Information_Theoretic_point_of_view.\n arXiv admin note: text overlap with arXiv:2111.13037, arXiv:2007.05074"},{"id":"http://arxiv.org/abs/2306.03163v2","updated":"2023-11-21T14:10:03Z","published":"2023-06-05T18:17:37Z","title":"How Can We Train Deep Learning Models Across Clouds and Continents? An\n Experimental Study","summary":" Training deep learning models in the cloud or on dedicated hardware is\nexpensive. A more cost-efficient option are hyperscale clouds offering spot\ninstances, a cheap but ephemeral alternative to on-demand resources. As spot\ninstance availability can change depending on the time of day, continent, and\ncloud provider, it could be more cost-efficient to distribute resources over\nthe world. Still, it has not been investigated whether geo-distributed,\ndata-parallel spot deep learning training could be a more cost-efficient\nalternative to centralized training.\n This paper aims to answer the question: Can deep learning models be\ncost-efficiently trained on a global market of spot VMs spanning different data\ncenters and cloud providers? To provide guidance, we extensively evaluate the\ncost and throughput implications of training in different zones, continents,\nand clouds for representative CV, NLP and ASR models. To expand the current\ntraining options further, we compare the scalability potential for hybrid-cloud\nscenarios by adding cloud resources to on-premise hardware to improve training\nthroughput. Finally, we show how leveraging spot instance pricing enables a new\ncost-efficient way to train models with multiple cheap VMs, trumping both more\ncentralized and powerful hardware and even on-demand cloud offerings at\ncompetitive prices.\n","authors":["Alexander Isenko","Ruben Mayer","Hans-Arno Jacobsen"],"pdf_url":"https://arxiv.org/pdf/2306.03163v2.pdf","comment":"Currently in review. Artifacts and Code:\n https://github.com/cirquit/hivemind-multi-cloud"},{"id":"http://arxiv.org/abs/2211.07931v3","updated":"2023-11-21T13:59:55Z","published":"2022-11-15T06:30:57Z","title":"Personalized Federated Learning with Multi-branch Architecture","summary":" Federated learning (FL) is a decentralized machine learning technique that\nenables multiple clients to collaboratively train models without requiring\nclients to reveal their raw data to each other. Although traditional FL trains\na single global model with average performance among clients, statistical data\nheterogeneity across clients has resulted in the development of personalized FL\n(PFL), which trains personalized models with good performance on each client's\ndata. 
A key challenge with PFL is how to facilitate clients with similar data\nto collaborate more in a situation where each client has data from complex\ndistribution and cannot determine one another's distribution. In this paper, we\npropose a new PFL method (pFedMB) using multi-branch architecture, which\nachieves personalization by splitting each layer of a neural network into\nmultiple branches and assigning client-specific weights to each branch. We also\ndesign an aggregation method to improve the communication efficiency and the\nmodel performance, with which each branch is globally updated with weighted\naveraging by client-specific weights assigned to the branch. pFedMB is simple\nbut effective in facilitating each client to share knowledge with similar\nclients by adjusting the weights assigned to each branch. We experimentally\nshow that pFedMB performs better than the state-of-the-art PFL methods using\nthe CIFAR10 and CIFAR100 datasets.\n","authors":["Junki Mori","Tomoyuki Yoshiyama","Furukawa Ryo","Isamu Teranishi"],"pdf_url":"https://arxiv.org/pdf/2211.07931v3.pdf","comment":"Published at IJCNN 2023"},{"id":"http://arxiv.org/abs/2311.12615v1","updated":"2023-11-21T13:59:00Z","published":"2023-11-21T13:59:00Z","title":"Koopman Learning with Episodic Memory","summary":" Koopman operator theory, a data-driven dynamical systems framework, has found\nsignificant success in learning models from complex, real-world data sets,\nenabling state-of-the-art prediction and control. The greater interpretability\nand lower computational costs of these models, compared to traditional machine\nlearning methodologies, make Koopman learning an especially appealing approach.\nDespite this, little work has been performed on endowing Koopman learning with\nthe ability to learn from its own mistakes. To address this, we equip Koopman\nmethods - developed for predicting non-stationary time-series - with an\nepisodic memory mechanism, enabling global recall of (or attention to) periods\nin time where similar dynamics previously occurred. We find that a basic\nimplementation of Koopman learning with episodic memory leads to significant\nimprovements in prediction on synthetic and real-world data. Our framework has\nconsiderable potential for expansion, allowing for future advances, and opens\nexciting new directions for Koopman learning.\n","authors":["William T. Redman","Dean Huang","Maria Fonoberova","Igor Mezić"],"pdf_url":"https://arxiv.org/pdf/2311.12615v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.10837v3","updated":"2023-11-21T13:58:00Z","published":"2023-10-16T21:23:16Z","title":"Approximating Two-Layer Feedforward Networks for Efficient Transformers","summary":" How to reduce compute and memory requirements of neural networks (NNs)\nwithout sacrificing performance? Many recent works use sparse Mixtures of\nExperts (MoEs) to build resource-efficient large language models (LMs). Here we\nintroduce several novel perspectives on MoEs, presenting a general framework\nthat unifies various methods to approximate two-layer NNs (e.g., feedforward\nblocks of Transformers), including product-key memories (PKMs). Leveraging\ninsights from this framework, we propose methods to improve both MoEs and PKMs.\nUnlike prior work that compares MoEs with dense baselines under the\ncompute-equal condition, our evaluation condition is parameter-equal, which is\ncrucial to properly evaluate LMs. 
We show that our MoEs are competitive with\nthe dense Transformer-XL on both the WikiText-103 and enwiki8 datasets at two\ndifferent scales, while being much more resource efficient. This demonstrates\nthat MoEs are relevant not only to extremely large LMs but also to any-scale\nresource-efficient LMs. Our code is public.\n","authors":["Róbert Csordás","Kazuki Irie","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2310.10837v3.pdf","comment":"Accepted to EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.12613v1","updated":"2023-11-21T13:56:44Z","published":"2023-11-21T13:56:44Z","title":"Decentralised Q-Learning for Multi-Agent Markov Decision Processes with\n a Satisfiability Criterion","summary":" In this paper, we propose a reinforcement learning algorithm to solve a\nmulti-agent Markov decision process (MMDP). The goal, inspired by Blackwell's\nApproachability Theorem, is to lower the time average cost of each agent to\nbelow a pre-specified agent-specific bound. For the MMDP, we assume the state\ndynamics to be controlled by the joint actions of agents, but the per-stage\ncosts to only depend on the individual agent's actions. We combine the\nQ-learning algorithm for a weighted combination of the costs of each agent,\nobtained by a gossip algorithm with the Metropolis-Hastings or Multiplicative\nWeights formalisms to modulate the averaging matrix of the gossip. We use\nmultiple timescales in our algorithm and prove that under mild conditions, it\napproximately achieves the desired bounds for each of the agents. We also\ndemonstrate the empirical performance of this algorithm in the more general\nsetting of MMDPs having jointly controlled per-stage costs.\n","authors":["Keshav P. Keval","Vivek S. Borkar"],"pdf_url":"https://arxiv.org/pdf/2311.12613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05858v2","updated":"2023-11-21T13:55:33Z","published":"2023-11-10T03:54:40Z","title":"Layer-wise Auto-Weighting for Non-Stationary Test-Time Adaptation","summary":" Given the inevitability of domain shifts during inference in real-world\napplications, test-time adaptation (TTA) is essential for model adaptation\nafter deployment. However, the real-world scenario of continuously changing\ntarget distributions presents challenges including catastrophic forgetting and\nerror accumulation. Existing TTA methods for non-stationary domain shifts,\nwhile effective, incur excessive computational load, making them impractical\nfor on-device settings. In this paper, we introduce a layer-wise auto-weighting\nalgorithm for continual and gradual TTA that autonomously identifies layers for\npreservation or concentrated adaptation. By leveraging the Fisher Information\nMatrix (FIM), we first design the learning weight to selectively focus on\nlayers associated with log-likelihood changes while preserving unrelated ones.\nThen, we further propose an exponential min-max scaler to make certain layers\nnearly frozen while mitigating outliers. This minimizes forgetting and error\naccumulation, leading to efficient adaptation to non-stationary target\ndistribution. 
Experiments on CIFAR-10C, CIFAR-100C, and ImageNet-C show our\nmethod outperforms conventional continual and gradual TTA approaches while\nsignificantly reducing computational load, highlighting the importance of\nFIM-based learning weight in adapting to continuously or gradually shifting\ntarget domains.\n","authors":["Junyoung Park","Jin Kim","Hyeongjun Kwon","Ilhoon Yoon","Kwanghoon Sohn"],"pdf_url":"https://arxiv.org/pdf/2311.05858v2.pdf","comment":"Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2308.07121v2","updated":"2023-11-21T13:55:04Z","published":"2023-08-14T13:06:10Z","title":"Active Bird2Vec: Towards End-to-End Bird Sound Monitoring with\n Transformers","summary":" We propose a shift towards end-to-end learning in bird sound monitoring by\ncombining self-supervised (SSL) and deep active learning (DAL). Leveraging\ntransformer models, we aim to bypass traditional spectrogram conversions,\nenabling direct raw audio processing. ActiveBird2Vec is set to generate\nhigh-quality bird sound representations through SSL, potentially accelerating\nthe assessment of environmental changes and decision-making processes for wind\nfarms. Additionally, we seek to utilize the wide variety of bird vocalizations\nthrough DAL, reducing the reliance on extensively labeled datasets by human\nexperts. We plan to curate a comprehensive set of tasks through Huggingface\nDatasets, enhancing future comparability and reproducibility of bioacoustic\nresearch. A comparative analysis between various transformer models will be\nconducted to evaluate their proficiency in bird sound recognition tasks. We aim\nto accelerate the progression of avian bioacoustic research and contribute to\nmore effective conservation strategies.\n","authors":["Lukas Rauch","Raphael Schwinger","Moritz Wirth","Bernhard Sick","Sven Tomforde","Christoph Scholz"],"pdf_url":"https://arxiv.org/pdf/2308.07121v2.pdf","comment":"Accepted @AI4S ECAI2023. This is the author's version of the work"},{"id":"http://arxiv.org/abs/2311.12612v1","updated":"2023-11-21T13:54:08Z","published":"2023-11-21T13:54:08Z","title":"A New Type Of Upper And Lower Bounds On Right-Tail Probabilities Of\n Continuous Random Variables","summary":" In this paper, I present a completely new type of upper and lower bounds on\nthe right-tail probabilities of continuous random variables with unbounded\nsupport and with semi-bounded support from the left. The presented upper and\nlower right-tail bounds depend only on the probability density function (PDF),\nits first derivative, and two parameters that are used for tightening the\nbounds. These tail bounds hold under certain conditions that depend on the PDF,\nits first and second derivatives, and the two parameters. The new tail bounds\nare shown to be tight for a wide range of continuous random variables via\nnumerical examples.\n","authors":["Nikola Zlatanov"],"pdf_url":"https://arxiv.org/pdf/2311.12612v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07955v2","updated":"2023-11-21T13:53:36Z","published":"2023-04-17T02:50:18Z","title":"Heterogeneous Domain Adaptation with Positive and Unlabeled Data","summary":" Heterogeneous unsupervised domain adaptation (HUDA) is the most challenging\ndomain adaptation setting where the feature spaces of source and target domains\nare heterogeneous, and the target domain has only unlabeled data. Existing HUDA\nmethods assume that both positive and negative examples are available in the\nsource domain, which may not be satisfied in some real applications. 
This paper\naddresses a new challenging setting called positive and unlabeled heterogeneous\nunsupervised domain adaptation (PU-HUDA), a HUDA setting where the source\ndomain only has positives. PU-HUDA can also be viewed as an extension of PU\nlearning where the positive and unlabeled examples are sampled from different\ndomains. A naive combination of existing HUDA and PU learning methods is\nineffective in PU-HUDA due to the gap in label distribution between the source\nand target domains. To overcome this issue, we propose a novel method,\npredictive adversarial domain adaptation (PADA), which can predict likely\npositive examples from the unlabeled target data and simultaneously align the\nfeature spaces to reduce the distribution divergence between the whole source\ndata and the likely positive target data. PADA achieves this by a unified\nadversarial training framework for learning a classifier to predict positive\nexamples and a feature transformer to transform the target feature space to\nthat of the source. Specifically, they are both trained to fool a common\ndiscriminator that determines whether the likely positive examples are from the\ntarget or source domain. We experimentally show that PADA outperforms several\nbaseline methods, such as the naive combination of HUDA and PU learning.\n","authors":["Junki Mori","Ryo Furukawa","Isamu Teranishi","Jun Sakuma"],"pdf_url":"https://arxiv.org/pdf/2304.07955v2.pdf","comment":"Accepted by IEEE Big Data 2023 as a regular paper"},{"id":"http://arxiv.org/abs/2311.12602v1","updated":"2023-11-21T13:43:06Z","published":"2023-11-21T13:43:06Z","title":"TouchSDF: A DeepSDF Approach for 3D Shape Reconstruction using\n Vision-Based Tactile Sensing","summary":" Humans rely on their visual and tactile senses to develop a comprehensive 3D\nunderstanding of their physical environment. Recently, there has been a growing\ninterest in exploring and manipulating objects using data-driven approaches\nthat utilise high-resolution vision-based tactile sensors. However, 3D shape\nreconstruction using tactile sensing has lagged behind visual shape\nreconstruction because of limitations in existing techniques, including the\ninability to generalise over unseen shapes, the absence of real-world testing,\nand limited expressive capacity imposed by discrete representations. To address\nthese challenges, we propose TouchSDF, a Deep Learning approach for tactile 3D\nshape reconstruction that leverages the rich information provided by a\nvision-based tactile sensor and the expressivity of the implicit neural\nrepresentation DeepSDF. Our technique consists of two components: (1) a\nConvolutional Neural Network that maps tactile images into local meshes\nrepresenting the surface at the touch location, and (2) an implicit neural\nfunction that predicts a signed distance function to extract the desired 3D\nshape. This combination allows TouchSDF to reconstruct smooth and continuous 3D\nshapes from tactile inputs in simulation and real-world settings, opening up\nresearch avenues for robust 3D-aware representations and improved multimodal\nperception in robotics. Code and supplementary material are available at:\nhttps://touchsdf.github.io/\n","authors":["Mauro Comi","Yijiong Lin","Alex Church","Alessio Tonioni","Laurence Aitchison","Nathan F. 
Lepora"],"pdf_url":"https://arxiv.org/pdf/2311.12602v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.12601v1","updated":"2023-11-21T13:42:40Z","published":"2023-11-21T13:42:40Z","title":"Deep learning-based detection of morphological features associated with\n hypoxia in H&E breast cancer whole slide images","summary":" Hypoxia occurs when tumour cells outgrow their blood supply, leading to\nregions of low oxygen levels within the tumour. Calculating hypoxia levels can\nbe an important step in understanding the biology of tumours, their clinical\nprogression and response to treatment. This study demonstrates a novel\napplication of deep learning to evaluate hypoxia in the context of breast\ncancer histomorphology. More precisely, we show that Weakly Supervised Deep\nLearning (WSDL) models can accurately detect hypoxia associated features in\nroutine Hematoxylin and Eosin (H&E) whole slide images (WSI). We trained and\nevaluated a deep Multiple Instance Learning model on tiles from WSI H&E tissue\nfrom breast cancer primary sites (n=240) obtaining on average an AUC of 0.87 on\na left-out test set. We also showed significant differences between features of\nhypoxic and normoxic tissue regions as distinguished by the WSDL models. Such\nDL hypoxia H&E WSI detection models could potentially be extended to other\ntumour types and easily integrated into the pathology workflow without\nrequiring additional costly assays.\n","authors":["Petru Manescu","Joseph Geradts","Delmiro Fernandez-Reyes"],"pdf_url":"https://arxiv.org/pdf/2311.12601v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2310.17355v2","updated":"2023-11-21T13:27:01Z","published":"2023-10-26T12:44:33Z","title":"Exploring the Trie of Rules: a fast data structure for the\n representation of association rules","summary":" Association rule mining techniques can generate a large volume of sequential\ndata when implemented on transactional databases. Extracting insights from a\nlarge set of association rules has been found to be a challenging process. When\nexamining a ruleset, the fundamental question is how to summarise and represent\nmeaningful mined knowledge efficiently. Many algorithms and strategies have\nbeen developed to address issue of knowledge extraction; however, the\neffectiveness of this process can be limited by the data structures. A better\ndata structure can sufficiently affect the speed of the knowledge extraction\nprocess. This paper proposes a novel data structure, called the Trie of rules,\nfor storing a ruleset that is generated by association rule mining. The\nresulting data structure is a prefix-tree graph structure made of pre-mined\nrules. This graph stores the rules as paths within the prefix-tree in a way\nthat similar rules overlay each other. Each node in the tree represents a rule\nwhere a consequent is this node, and an antecedent is a path from this node to\nthe root of the tree. The evaluation showed that the proposed representation\ntechnique is promising. 
It compresses a ruleset with almost no data loss and\nbenefits in terms of time for basic operations such as searching for a specific\nrule and sorting, which is the base for many knowledge discovery methods.\nMoreover, our method demonstrated a significant improvement in traversing time,\nachieving an 8-fold increase compared to traditional data structures.\n","authors":["Mikhail Kudriavtsev","Marija Bezbradica","Andrew McCarren"],"pdf_url":"https://arxiv.org/pdf/2310.17355v2.pdf","comment":"12 pages, 13 figures, preprint of journal article"},{"id":"http://arxiv.org/abs/2311.12590v1","updated":"2023-11-21T13:26:33Z","published":"2023-11-21T13:26:33Z","title":"ChronoPscychosis: Temporal Segmentation and Its Impact on Schizophrenia\n Classification Using Motor Activity Data","summary":" Schizophrenia is a complicated mental illness characterized by a broad\nspectrum of symptoms affecting cognition, behavior, and emotion. The task of\nidentifying reliable biomarkers to classify Schizophrenia accurately continues\nto be a challenge in the field of psychiatry. We investigate the temporal\npatterns within the motor activity data as a potential key to enhancing the\ncategorization of individuals with Schizophrenia, using the dataset having\nmotor activity recordings of 22 Schizophrenia patients and 32 control subjects.\nThe dataset contains per-minute motor activity measurements collected for an\naverage of 12.7 days in a row for each participant. We dissect each day into\nsegments (Twelve, Eight, six, four, three, and two parts) and evaluate their\nimpact on classification. We employ sixteen statistical features within these\ntemporal segments and train them on Seven machine learning models to get deeper\ninsights. LightGBM model outperforms the other six models. Our results indicate\nthat the temporal segmentation significantly improves the classification, with\nAUC-ROC = 0.93, F1 score = 0.84( LightGBM- without any segmentation) and\nAUC-ROC = 0.98, F1 score = 0.93( LightGBM- with segmentation). Distinguishing\nbetween diurnal and nocturnal segments amplifies the differences between\nSchizophrenia patients and controls. However, further subdivisions into smaller\ntime segments do not affect the AUC- ROC significantly. Morning, afternoon,\nevening, and night partitioning gives similar classification performance to\nday-night partitioning. These findings are valuable as they indicate that\nextensive temporal classification beyond distinguishing between day and night\ndoes not yield substantial results, offering an efficient approach for further\nclassification, early diagnosis, and monitoring of Schizophrenia.\n","authors":["Pradnya Rajendra Jadhav","Raviprasad Aduri"],"pdf_url":"https://arxiv.org/pdf/2311.12590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12589v1","updated":"2023-11-21T13:26:13Z","published":"2023-11-21T13:26:13Z","title":"Improving Source-Free Target Adaptation with Vision Transformers\n Leveraging Domain Representation Images","summary":" Unsupervised Domain Adaptation (UDA) methods facilitate knowledge transfer\nfrom a labeled source domain to an unlabeled target domain, navigating the\nobstacle of domain shift. While Convolutional Neural Networks (CNNs) are a\nstaple in UDA, the rise of Vision Transformers (ViTs) provides new avenues for\ndomain generalization. This paper presents an innovative method to bolster ViT\nperformance in source-free target adaptation, beginning with an evaluation of\nhow key, query, and value elements affect ViT outcomes. 
Experiments indicate\nthat altering the key component has negligible effects on Transformer\nperformance. Leveraging this discovery, we introduce Domain Representation\nImages (DRIs), feeding embeddings through the key element. DRIs act as\ndomain-specific markers, effortlessly merging with the training regimen. To\nassess our method, we perform target adaptation tests on the Cross Instance DRI\nsource-only (SO) control. We measure the efficacy of target adaptation with and\nwithout DRIs, against existing benchmarks like SHOT-B* and adaptations via\nCDTrans. Findings demonstrate that excluding DRIs offers limited gains over\nSHOT-B*, while their inclusion in the key segment boosts average precision\npromoting superior domain generalization. This research underscores the vital\nrole of DRIs in enhancing ViT efficiency in UDA scenarios, setting a precedent\nfor further domain adaptation explorations.\n","authors":["Gauransh Sawhney","Daksh Dave","Adeel Ahmed","Jiechao Gao","Khalid Saleem"],"pdf_url":"https://arxiv.org/pdf/2311.12589v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.09355v5","updated":"2023-11-21T13:12:21Z","published":"2023-04-19T00:33:59Z","title":"To Compress or Not to Compress- Self-Supervised Learning and Information\n Theory: A Review","summary":" Deep neural networks excel in supervised learning tasks but are constrained\nby the need for extensive labeled data. Self-supervised learning emerges as a\npromising alternative, allowing models to learn without explicit labels.\nInformation theory, and notably the information bottleneck principle, has been\npivotal in shaping deep neural networks. This principle focuses on optimizing\nthe trade-off between compression and preserving relevant information,\nproviding a foundation for efficient network design in supervised contexts.\nHowever, its precise role and adaptation in self-supervised learning remain\nunclear. In this work, we scrutinize various self-supervised learning\napproaches from an information-theoretic perspective, introducing a unified\nframework that encapsulates the \\textit{self-supervised information-theoretic\nlearning problem}. We weave together existing research into a cohesive\nnarrative, delve into contemporary self-supervised methodologies, and spotlight\npotential research avenues and inherent challenges. Additionally, we discuss\nthe empirical evaluation of information-theoretic quantities and their\nestimation methods. Overall, this paper furnishes an exhaustive review of the\nintersection of information theory, self-supervised learning, and deep neural\nnetworks.\n","authors":["Ravid Shwartz-Ziv","Yann LeCun"],"pdf_url":"https://arxiv.org/pdf/2304.09355v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11494v2","updated":"2023-11-21T13:11:36Z","published":"2023-07-21T10:56:36Z","title":"Predict, Refine, Synthesize: Self-Guiding Diffusion Models for\n Probabilistic Time Series Forecasting","summary":" Diffusion models have achieved state-of-the-art performance in generative\nmodeling tasks across various domains. Prior works on time series diffusion\nmodels have primarily focused on developing conditional models tailored to\nspecific forecasting or imputation tasks. In this work, we explore the\npotential of task-agnostic, unconditional diffusion models for several time\nseries applications. We propose TSDiff, an unconditionally-trained diffusion\nmodel for time series. 
Our proposed self-guidance mechanism enables\nconditioning TSDiff for downstream tasks during inference, without requiring\nauxiliary networks or altering the training procedure. We demonstrate the\neffectiveness of our method on three different time series tasks: forecasting,\nrefinement, and synthetic data generation. First, we show that TSDiff is\ncompetitive with several task-specific conditional forecasting methods\n(predict). Second, we leverage the learned implicit probability density of\nTSDiff to iteratively refine the predictions of base forecasters with reduced\ncomputational overhead over reverse diffusion (refine). Notably, the generative\nperformance of the model remains intact -- downstream forecasters trained on\nsynthetic samples from TSDiff outperform forecasters that are trained on\nsamples from other state-of-the-art generative time series models, occasionally\neven outperforming models trained on real data (synthesize).\n","authors":["Marcel Kollovieh","Abdul Fatir Ansari","Michael Bohlke-Schneider","Jasper Zschiegner","Hao Wang","Yuyang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.11494v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12579v1","updated":"2023-11-21T12:50:24Z","published":"2023-11-21T12:50:24Z","title":"Machine-Guided Discovery of a Real-World Rogue Wave Model","summary":" Big data and large-scale machine learning have had a profound impact on\nscience and engineering, particularly in fields focused on forecasting and\nprediction. Yet, it is still not clear how we can use the superior pattern\nmatching abilities of machine learning models for scientific discovery. This is\nbecause the goals of machine learning and science are generally not aligned. In\naddition to being accurate, scientific theories must also be causally\nconsistent with the underlying physical process and allow for human analysis,\nreasoning, and manipulation to advance the field. In this paper, we present a\ncase study on discovering a new symbolic model for oceanic rogue waves from\ndata using causal analysis, deep learning, parsimony-guided model selection,\nand symbolic regression. We train an artificial neural network on causal\nfeatures from an extensive dataset of observations from wave buoys, while\nselecting for predictive performance and causal invariance. We apply symbolic\nregression to distill this black-box model into a mathematical equation that\nretains the neural network's predictive capabilities, while allowing for\ninterpretation in the context of existing wave theory. The resulting model\nreproduces known behavior, generates well-calibrated probabilities, and\nachieves better predictive scores on unseen data than current theory. This\nshowcases how machine learning can facilitate inductive scientific discovery,\nand paves the way for more accurate rogue wave forecasting.\n","authors":["Dion Häfner","Johannes Gemmrich","Markus Jochum"],"pdf_url":"https://arxiv.org/pdf/2311.12579v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02081v2","updated":"2023-11-21T12:43:30Z","published":"2022-12-05T07:52:08Z","title":"YolOOD: Utilizing Object Detection Concepts for Multi-Label\n Out-of-Distribution Detection","summary":" Out-of-distribution (OOD) detection has attracted a large amount of attention\nfrom the machine learning research community in recent years due to its\nimportance in deployed systems. Most of the previous studies focused on the\ndetection of OOD samples in the multi-class classification task. 
However, OOD\ndetection in the multi-label classification task, a more common real-world use\ncase, remains an underexplored domain. In this research, we propose YolOOD - a\nmethod that utilizes concepts from the object detection domain to perform OOD\ndetection in the multi-label classification task. Object detection models have\nan inherent ability to distinguish between objects of interest\n(in-distribution) and irrelevant objects (e.g., OOD objects) in images that\ncontain multiple objects belonging to different class categories. These\nabilities allow us to convert a regular object detection model into an image\nclassifier with inherent OOD detection capabilities with just minor changes. We\ncompare our approach to state-of-the-art OOD detection methods and demonstrate\nYolOOD's ability to outperform these methods on a comprehensive suite of\nin-distribution and OOD benchmark datasets.\n","authors":["Alon Zolfi","Guy Amit","Amit Baras","Satoru Koda","Ikuya Morikawa","Yuval Elovici","Asaf Shabtai"],"pdf_url":"https://arxiv.org/pdf/2212.02081v2.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.12573v1","updated":"2023-11-21T12:38:05Z","published":"2023-11-21T12:38:05Z","title":"Moderating Model Marketplaces: Platform Governance Puzzles for AI\n Intermediaries","summary":" The AI development community is increasingly making use of hosting\nintermediaries such as Hugging Face provide easy access to user-uploaded models\nand training data. These model marketplaces lower technical deployment barriers\nfor hundreds of thousands of users, yet can be used in numerous potentially\nharmful and illegal ways. In this article, we explain ways in which AI systems,\nwhich can both `contain' content and be open-ended tools, present one of the\ntrickiest platform governance challenges seen to date. We provide case studies\nof several incidents across three illustrative platforms -- Hugging Face,\nGitHub and Civitai -- to examine how model marketplaces moderate models.\nBuilding on this analysis, we outline important (and yet nevertheless limited)\npractices that industry has been developing to respond to moderation demands:\nlicensing, access and use restrictions, automated content moderation, and open\npolicy development. While the policy challenge at hand is a considerable one,\nwe conclude with some ideas as to how platforms could better mobilize resources\nto act as a careful, fair, and proportionate regulatory access point.\n","authors":["Robert Gorwa","Michael Veale"],"pdf_url":"https://arxiv.org/pdf/2311.12573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08051v3","updated":"2023-11-21T12:36:49Z","published":"2023-10-12T05:52:54Z","title":"LGL-BCI: A Lightweight Geometric Learning Framework for Motor\n Imagery-Based Brain-Computer Interfaces","summary":" Brain-Computer Interfaces (BCIs) are a groundbreaking technology for\ninteracting with external devices using brain signals. Despite advancements,\nelectroencephalogram (EEG)-based Motor Imagery (MI) tasks face challenges like\namplitude and phase variability, and complex spatial correlations, with a need\nfor smaller model size and faster inference. This study introduces the LGL-BCI\nframework, employing a Geometric Deep Learning Framework for EEG processing in\nnon-Euclidean metric spaces, particularly the Symmetric Positive Definite (SPD)\nManifold space. LGL-BCI offers robust EEG data representation and captures\nspatial correlations. 
We propose an EEG channel selection solution via a\nfeature decomposition algorithm to reduce SPD matrix dimensionality, with a\nlossless transformation boosting inference speed. Extensive experiments show\nLGL-BCI's superior accuracy and efficiency compared to current solutions,\nhighlighting geometric deep learning's potential in MI-BCI applications. The\nefficiency, assessed on two public EEG datasets and two real-world EEG devices,\nsignificantly outperforms the state-of-the-art solution in accuracy ($82.54\\%$\nversus $62.22\\%$) with fewer parameters (64.9M compared to 183.7M).\n","authors":["Jianchao Lu","Yuzhe Tian","Yang Zhang","Jiaqi Ge","Quan Z. Sheng","Xi Zheng"],"pdf_url":"https://arxiv.org/pdf/2310.08051v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12570v1","updated":"2023-11-21T12:34:00Z","published":"2023-11-21T12:34:00Z","title":"BEND: Benchmarking DNA Language Models on biologically meaningful tasks","summary":" The genome sequence contains the blueprint for governing cellular processes.\nWhile the availability of genomes has vastly increased over the last decades,\nexperimental annotation of the various functional, non-coding and regulatory\nelements encoded in the DNA sequence remains both expensive and challenging.\nThis has sparked interest in unsupervised language modeling of genomic DNA, a\nparadigm that has seen great success for protein sequence data. Although\nvarious DNA language models have been proposed, evaluation tasks often differ\nbetween individual works, and might not fully recapitulate the fundamental\nchallenges of genome annotation, including the length, scale and sparsity of\nthe data. In this study, we introduce BEND, a Benchmark for DNA language\nmodels, featuring a collection of realistic and biologically meaningful\ndownstream tasks defined on the human genome. We find that embeddings from\ncurrent DNA LMs can approach performance of expert methods on some tasks, but\nonly capture limited information about long-range features. BEND is available\nat https://github.com/frederikkemarin/BEND.\n","authors":["Frederikke Isa Marin","Felix Teufel","Marc Horrender","Dennis Madsen","Dennis Pultz","Ole Winther","Wouter Boomsma"],"pdf_url":"https://arxiv.org/pdf/2311.12570v1.pdf","comment":"10 pages, 1 figure, 3 tables, code available at\n https://github.com/frederikkemarin/BEND"},{"id":"http://arxiv.org/abs/2311.12569v1","updated":"2023-11-21T12:32:38Z","published":"2023-11-21T12:32:38Z","title":"Differentiable Sampling of Categorical Distributions Using the\n CatLog-Derivative Trick","summary":" Categorical random variables can faithfully represent the discrete and\nuncertain aspects of data as part of a discrete latent variable model. Learning\nin such models necessitates taking gradients with respect to the parameters of\nthe categorical probability distributions, which is often intractable due to\ntheir combinatorial nature. A popular technique to estimate these otherwise\nintractable gradients is the Log-Derivative trick. This trick forms the basis\nof the well-known REINFORCE gradient estimator and its many extensions. While\nthe Log-Derivative trick allows us to differentiate through samples drawn from\ncategorical distributions, it does not take into account the discrete nature of\nthe distribution itself. Our first contribution addresses this shortcoming by\nintroducing the CatLog-Derivative trick - a variation of the Log-Derivative\ntrick tailored towards categorical distributions. 
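For readers unfamiliar with the Log-Derivative trick that the CatLog-Derivative abstract above builds on, here is a minimal PyTorch sketch of the plain score-function (REINFORCE) estimator for a categorical distribution. The cost function `f` and the 5-way categorical are arbitrary placeholders; the paper's CatLog-Derivative and IndeCateR estimators are not reproduced here.

```python
import torch
from torch.distributions import Categorical

logits = torch.zeros(5, requires_grad=True)    # parameters of a 5-way categorical

def f(k):                                      # placeholder black-box cost
    return (k.float() - 3.0) ** 2

dist = Categorical(logits=logits)
samples = dist.sample((4096,))                 # Monte Carlo samples k ~ p_theta
# Log-Derivative (REINFORCE) surrogate: grad E[f(k)] = E[f(k) * grad log p_theta(k)]
surrogate = (f(samples) * dist.log_prob(samples)).mean()
surrogate.backward()                           # logits.grad now holds the gradient estimate
print(logits.grad)
```

The abstract's point is that this estimator, while unbiased, ignores the discreteness of the distribution; IndeCateR is presented as a provably lower-variance alternative for products of independent categoricals.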
Secondly, we use the\nCatLog-Derivative trick to introduce IndeCateR, a novel and unbiased gradient\nestimator for the important case of products of independent categorical\ndistributions with provably lower variance than REINFORCE. Thirdly, we\nempirically show that IndeCateR can be efficiently implemented and that its\ngradient estimates have significantly lower bias and variance for the same\nnumber of samples compared to the state of the art.\n","authors":["Lennert De Smet","Emanuele Sansone","Pedro Zuidberg Dos Martires"],"pdf_url":"https://arxiv.org/pdf/2311.12569v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12566v1","updated":"2023-11-21T12:26:14Z","published":"2023-11-21T12:26:14Z","title":"Variational Elliptical Processes","summary":" We present elliptical processes, a family of non-parametric probabilistic\nmodels that subsume Gaussian processes and Student's t processes. This\ngeneralization includes a range of new heavy-tailed behaviors while retaining\ncomputational tractability. Elliptical processes are based on a representation\nof elliptical distributions as a continuous mixture of Gaussian distributions.\nWe parameterize this mixture distribution as a spline normalizing flow, which\nwe train using variational inference. The proposed form of the variational\nposterior enables a sparse variational elliptical process applicable to\nlarge-scale problems. We highlight advantages compared to Gaussian processes\nthrough regression and classification experiments. Elliptical processes can\nsupersede Gaussian processes in several settings, including cases where the\nlikelihood is non-Gaussian or when accurate tail modeling is essential.\n","authors":["Maria Bånkestad","Jens Sjölund","Jalil Taghia","Thomas B. Schöon"],"pdf_url":"https://arxiv.org/pdf/2311.12566v1.pdf","comment":"14 pages, 15 figures, appendix 9 pages"},{"id":"http://arxiv.org/abs/2311.12564v1","updated":"2023-11-21T12:23:58Z","published":"2023-11-21T12:23:58Z","title":"Summary of the DISPLACE Challenge 2023 -- DIarization of SPeaker and\n LAnguage in Conversational Environments","summary":" In multi-lingual societies, where multiple languages are spoken in a small\ngeographic vicinity, informal conversations often involve mix of languages.\nExisting speech technologies may be inefficient in extracting information from\nsuch conversations, where the speech data is rich in diversity with multiple\nlanguages and speakers. The DISPLACE (DIarization of SPeaker and LAnguage in\nConversational Environments) challenge constitutes an open-call for evaluating\nand bench-marking the speaker and language diarization technologies on this\nchallenging condition. The challenge entailed two tracks: Track-1 focused on\nspeaker diarization (SD) in multilingual situations while, Track-2 addressed\nthe language diarization (LD) in a multi-speaker scenario. Both the tracks were\nevaluated using the same underlying audio data. To facilitate this evaluation,\na real-world dataset featuring multilingual, multi-speaker conversational\nfar-field speech was recorded and distributed. Furthermore, a baseline system\nwas made available for both SD and LD task which mimicked the state-of-art in\nthese tasks. The challenge garnered a total of $42$ world-wide registrations\nand received a total of $19$ combined submissions for Track-1 and Track-2. This\npaper describes the challenge, details of the datasets, tasks, and the baseline\nsystem. 
Additionally, the paper provides a concise overview of the submitted\nsystems in both tracks, with an emphasis given to the top performing systems.\nThe paper also presents insights and future perspectives for SD and LD tasks,\nfocusing on the key challenges that the systems need to overcome before\nwide-spread commercial deployment on such conversations.\n","authors":["Shikha Baghel","Shreyas Ramoji","Somil Jain","Pratik Roy Chowdhuri","Prachi Singh","Deepu Vijayasenan","Sriram Ganapathy"],"pdf_url":"https://arxiv.org/pdf/2311.12564v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2110.09076v2","updated":"2023-11-21T12:23:27Z","published":"2021-10-18T07:55:39Z","title":"An actor-critic algorithm with policy gradients to solve the job shop\n scheduling problem using deep double recurrent agents","summary":" There is a growing interest in integrating machine learning techniques and\noptimization to solve challenging optimization problems. In this work, we\npropose a deep reinforcement learning methodology for the job shop scheduling\nproblem (JSSP). The aim is to build up a greedy-like heuristic able to learn on\nsome distribution of JSSP instances, different in the number of jobs and\nmachines. The need for fast scheduling methods is well known, and it arises in\nmany areas, from transportation to healthcare. We model the JSSP as a Markov\nDecision Process and then we exploit the efficacy of reinforcement learning to\nsolve the problem. We adopt an actor-critic scheme, where the action taken by\nthe agent is influenced by policy considerations on the state-value function.\nThe procedures are adapted to take into account the challenging nature of JSSP,\nwhere the state and the action space change not only for every instance but\nalso after each decision. To tackle the variability in the number of jobs and\noperations in the input, we modeled the agent using two incident LSTM models, a\nspecial type of deep neural network. Experiments show the algorithm reaches\ngood solutions in a short time, proving that is possible to generate new greedy\nheuristics just from learning-based methodologies. Benchmarks have been\ngenerated in comparison with the commercial solver CPLEX. As expected, the\nmodel can generalize, to some extent, to larger problems or instances\noriginated by a different distribution from the one used in training.\n","authors":["Marta Monaci","Valerio Agasucci","Giorgio Grani"],"pdf_url":"https://arxiv.org/pdf/2110.09076v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13193v2","updated":"2023-11-21T12:17:40Z","published":"2022-10-24T13:10:06Z","title":"Langevin dynamics based algorithm e-TH$\\varepsilon$O POULA for\n stochastic optimization problems with discontinuous stochastic gradient","summary":" We introduce a new Langevin dynamics based algorithm, called\ne-TH$\\varepsilon$O POULA, to solve optimization problems with discontinuous\nstochastic gradients which naturally appear in real-world applications such as\nquantile estimation, vector quantization, CVaR minimization, and regularized\noptimization problems involving ReLU neural networks. We demonstrate both\ntheoretically and numerically the applicability of the e-TH$\\varepsilon$O POULA\nalgorithm. 
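As context for the e-TH$\varepsilon$O POULA abstract above: the vanilla stochastic gradient Langevin dynamics (SGLD) baseline it is compared against takes steps of the form sketched below. This is a generic SGLD update only, not the proposed tamed algorithm, and the step size and inverse temperature are illustrative.

```python
import torch

def sgld_step(theta: torch.Tensor, grad: torch.Tensor,
              step_size: float, inv_temp: float = 1.0) -> torch.Tensor:
    """One SGLD update: theta - lr * grad + sqrt(2 * lr / beta) * Gaussian noise."""
    noise = torch.randn_like(theta)
    return theta - step_size * grad + (2.0 * step_size / inv_temp) ** 0.5 * noise
```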
More precisely, under the conditions that the stochastic gradient is\nlocally Lipschitz in average and satisfies a certain convexity at infinity\ncondition, we establish non-asymptotic error bounds for e-TH$\\varepsilon$O\nPOULA in Wasserstein distances and provide a non-asymptotic estimate for the\nexpected excess risk, which can be controlled to be arbitrarily small. Three\nkey applications in finance and insurance are provided, namely, multi-period\nportfolio optimization, transfer learning in multi-period portfolio\noptimization, and insurance claim prediction, which involve neural networks\nwith (Leaky)-ReLU activation functions. Numerical experiments conducted using\nreal-world datasets illustrate the superior empirical performance of\ne-TH$\\varepsilon$O POULA compared to SGLD, TUSLA, ADAM, and AMSGrad in terms of\nmodel accuracy.\n","authors":["Dong-Young Lim","Ariel Neufeld","Sotirios Sabanis","Ying Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.13193v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12561v1","updated":"2023-11-21T12:15:28Z","published":"2023-11-21T12:15:28Z","title":"Convolutional Neural Networks for Neuroimaging in Parkinson's Disease:\n Is Preprocessing Needed?","summary":" Spatial and intensity normalization are nowadays a prerequisite for\nneuroimaging analysis. Influenced by voxel-wise and other univariate\ncomparisons, where these corrections are key, they are commonly applied to any\ntype of analysis and imaging modalities. Nuclear imaging modalities such as\nPET-FDG or FP-CIT SPECT, a common modality used in Parkinson's Disease\ndiagnosis, are especially dependent on intensity normalization. However, these\nsteps are computationally expensive and furthermore, they may introduce\ndeformations in the images, altering the information contained in them.\nConvolutional Neural Networks (CNNs), for their part, introduce position\ninvariance to pattern recognition, and have been proven to classify objects\nregardless of their orientation, size, angle, etc. Therefore, a question\narises: how well can CNNs account for spatial and intensity differences when\nanalysing nuclear brain imaging? Are spatial and intensity normalization still\nneeded? To answer this question, we have trained four different CNN models\nbased on well-established architectures, using or not different spatial and\nintensity normalization preprocessing. The results show that a sufficiently\ncomplex model such as our three-dimensional version of the ALEXNET can\neffectively account for spatial differences, achieving a diagnosis accuracy of\n94.1% with an area under the ROC curve of 0.984. The visualization of the\ndifferences via saliency maps shows that these models are correctly finding\npatterns that match those found in the literature, without the need of applying\nany complex spatial normalization procedure. However, the intensity\nnormalization -- and its type -- is revealed as very influential in the results\nand accuracy of the trained model, and therefore must be well accounted.\n","authors":["Francisco J. Martinez-Murcia","Juan M. 
Górriz","Javier Ramírez","Andrés Ortiz"],"pdf_url":"https://arxiv.org/pdf/2311.12561v1.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.12550v1","updated":"2023-11-21T11:59:16Z","published":"2023-11-21T11:59:16Z","title":"Explainable Anomaly Detection using Masked Latent Generative Modeling","summary":" We present a novel time series anomaly detection method that achieves\nexcellent detection accuracy while offering a superior level of explainability.\nOur proposed method, TimeVQVAE-AD, leverages masked generative modeling adapted\nfrom the cutting-edge time series generation method known as TimeVQVAE. The\nprior model is trained on the discrete latent space of a time-frequency domain.\nNotably, the dimensional semantics of the time-frequency domain are preserved\nin the latent space, enabling us to compute anomaly scores across different\nfrequency bands, which provides a better insight into the detected anomalies.\nAdditionally, the generative nature of the prior model allows for sampling\nlikely normal states for detected anomalies, enhancing the explainability of\nthe detected anomalies through counterfactuals. Our experimental evaluation on\nthe UCR Time Series Anomaly archive demonstrates that TimeVQVAE-AD\nsignificantly surpasses the existing methods in terms of detection accuracy and\nexplainability.\n","authors":["Daesoo Lee","Sara Malacarne","Erlend Aune"],"pdf_url":"https://arxiv.org/pdf/2311.12550v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12538v1","updated":"2023-11-21T11:33:03Z","published":"2023-11-21T11:33:03Z","title":"In-Context Learning Functions with Varying Number of Minima","summary":" Large Language Models (LLMs) have proven effective at In-Context Learning\n(ICL), an ability that allows them to create predictors from labeled examples.\nFew studies have explored the interplay between ICL and specific properties of\nfunctions it attempts to approximate. In our study, we use a formal framework\nto explore ICL and propose a new task of approximating functions with varying\nnumber of minima. We implement a method that allows for producing functions\nwith given inputs as minima. We find that increasing the number of minima\ndegrades ICL performance. At the same time, our evaluation shows that ICL\noutperforms 2-layer Neural Network (2NN) model. Furthermore, ICL learns faster\nthan 2NN in all settings. We validate the findings through a set of few-shot\nexperiments across various hyperparameter configurations.\n","authors":["David Oniani","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11899v3","updated":"2023-11-21T11:24:29Z","published":"2023-01-27T18:23:10Z","title":"Is TinyML Sustainable? Assessing the Environmental Impacts of Machine\n Learning on Microcontrollers","summary":" The sustained growth of carbon emissions and global waste elicits significant\nsustainability concerns for our environment's future. The growing Internet of\nThings (IoT) has the potential to exacerbate this issue. However, an emerging\narea known as Tiny Machine Learning (TinyML) has the opportunity to help\naddress these environmental challenges through sustainable computing practices.\nTinyML, the deployment of machine learning (ML) algorithms onto low-cost,\nlow-power microcontroller systems, enables on-device sensor analytics that\nunlocks numerous always-on ML applications. 
This article discusses both the\npotential of these TinyML applications to address critical sustainability\nchallenges, as well as the environmental footprint of this emerging technology.\nThrough a complete life cycle analysis (LCA), we find that TinyML systems\npresent opportunities to offset their carbon emissions by enabling applications\nthat reduce the emissions of other sectors. Nevertheless, when globally scaled,\nthe carbon footprint of TinyML systems is not negligible, necessitating that\ndesigners factor in environmental impact when formulating new devices. Finally,\nwe outline research directions to enable further sustainable contributions of\nTinyML.\n","authors":["Shvetank Prakash","Matthew Stewart","Colby Banbury","Mark Mazumder","Pete Warden","Brian Plancher","Vijay Janapa Reddi"],"pdf_url":"https://arxiv.org/pdf/2301.11899v3.pdf","comment":"Communications of the ACM (CACM) November 2023 Issue"},{"id":"http://arxiv.org/abs/2311.12530v1","updated":"2023-11-21T11:21:53Z","published":"2023-11-21T11:21:53Z","title":"An efficient likelihood-free Bayesian inference method based on\n sequential neural posterior estimation","summary":" Sequential neural posterior estimation (SNPE) techniques have been recently\nproposed for dealing with simulation-based models with intractable likelihoods.\nUnlike approximate Bayesian computation, SNPE techniques learn the posterior\nfrom sequential simulation using neural network-based conditional density\nestimators. This paper reclaims SNPE-B proposed by Lueckmann et al. (2017),\nwhich suffers from inefficiency and slow inference due to inefficient\nutilization of simulated data and high variance of parameter updates. To\naddress these issues, we firstly introduce a concentrated loss function based\non an adaptive calibration kernel that reweights the simulated data\nappropriately to improve the data efficiency. Moreover, we provide a\ntheoretical analysis of the variance of associated Monte Carlo estimators.\nBased on this analysis, we then propose several variance reduction techniques\nto further accelerate the process of learning. Numerical experiments\ndemonstrate that our method outperforms the original method together with other\nexisting competitors on certain tasks.\n","authors":["Yifei Xiong","Xiliang Yang","Sanguo Zhang","Zhijian He"],"pdf_url":"https://arxiv.org/pdf/2311.12530v1.pdf","comment":"29 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.12528v1","updated":"2023-11-21T11:15:14Z","published":"2023-11-21T11:15:14Z","title":"Inverse Problems with Learned Forward Operators","summary":" Solving inverse problems requires knowledge of the forward operator, but\naccurate models can be computationally expensive and hence cheaper variants are\ndesired that do not compromise reconstruction quality. This chapter reviews\nreconstruction methods in inverse problems with learned forward operators that\nfollow two different paradigms. The first one is completely agnostic to the\nforward operator and learns its restriction to the subspace spanned by the\ntraining data. The framework of regularisation by projection is then used to\nfind a reconstruction. The second one uses a simplified model of the physics of\nthe measurement process and only relies on the training data to learn a model\ncorrection. We present the theory of these two approaches and compare them\nnumerically. 
A common theme emerges: both methods require, or at least benefit\nfrom, training data not only for the forward operator, but also for its\nadjoint.\n","authors":["Simon Arridge","Andreas Hauptmann","Yury Korolev"],"pdf_url":"https://arxiv.org/pdf/2311.12528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12526v1","updated":"2023-11-21T11:12:03Z","published":"2023-11-21T11:12:03Z","title":"Neural Network Pruning by Gradient Descent","summary":" The rapid increase in the parameters of deep learning models has led to\nsignificant costs, challenging computational efficiency and model\ninterpretability. In this paper, we introduce a novel and straightforward\nneural network pruning framework that incorporates the Gumbel-Softmax\ntechnique. This framework enables the simultaneous optimization of a network's\nweights and topology in an end-to-end process using stochastic gradient\ndescent. Empirical results demonstrate its exceptional compression capability,\nmaintaining high accuracy on the MNIST dataset with only 0.15\\% of the original\nnetwork parameters. Moreover, our framework enhances neural network\ninterpretability, not only by allowing easy extraction of feature importance\ndirectly from the pruned network but also by enabling visualization of feature\nsymmetry and the pathways of information propagation from features to outcomes.\nAlthough the pruning strategy is learned through deep learning, it is\nsurprisingly intuitive and understandable, focusing on selecting key\nrepresentative features and exploiting data patterns to achieve extreme sparse\npruning. We believe our method opens a promising new avenue for deep learning\npruning and the creation of interpretable machine learning systems.\n","authors":["Zhang Zhang","Ruyi Tao","Jiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12526v1.pdf","comment":"21 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.06255v2","updated":"2023-11-21T11:11:57Z","published":"2023-09-12T14:16:34Z","title":"Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation","summary":" One primary topic of multi-modal learning is to jointly incorporate\nheterogeneous information from different modalities. However, most models often\nsuffer from unsatisfactory multi-modal cooperation, which could not jointly\nutilize all modalities well. Some methods are proposed to identify and enhance\nthe worse learnt modality, but are often hard to provide the fine-grained\nobservation of multi-modal cooperation at sample-level with theoretical\nsupport. Hence, it is essential to reasonably observe and improve the\nfine-grained cooperation between modalities, especially when facing realistic\nscenarios where the modality discrepancy could vary across different samples.\nTo this end, we introduce a fine-grained modality valuation metric to evaluate\nthe contribution of each modality at sample-level. Via modality valuation, we\nregretfully observe that the multi-modal model tends to rely on one specific\nmodality, resulting in other modalities being low-contributing. 
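To make the Gumbel-Softmax idea in the "Neural Network Pruning by Gradient Descent" abstract above concrete, here is a minimal sketch of a differentiable keep/drop mask over the weights of a single linear layer. The layer sizes, gate parameterization, and loss are illustrative assumptions, not the paper's actual framework.

```python
import torch
import torch.nn.functional as F

weight = torch.randn(64, 128, requires_grad=True)          # layer weights
gate_logits = torch.zeros(64, 128, 2, requires_grad=True)  # [drop, keep] logits per weight

def masked_weight(tau: float = 1.0) -> torch.Tensor:
    # Hard (straight-through) binary mask sampled with the Gumbel-Softmax trick.
    mask = F.gumbel_softmax(gate_logits, tau=tau, hard=True)[..., 1]
    return weight * mask

x = torch.randn(32, 128)
loss = (x @ masked_weight().t()).pow(2).mean()              # placeholder objective
loss.backward()                                             # gradients reach weights and gates
```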
We further\nanalyze this issue and improve cooperation between modalities by enhancing the\ndiscriminative ability of low-contributing modalities in a targeted manner.\nOverall, our methods reasonably observe the fine-grained uni-modal contribution\nat sample-level and achieve considerable improvement on different multi-modal\nmodels.\n","authors":["Yake Wei","Ruoxuan Feng","Zihe Wang","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2309.06255v2.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2311.12524v1","updated":"2023-11-21T11:09:57Z","published":"2023-11-21T11:09:57Z","title":"ALPHA: AnomaLous Physiological Health Assessment Using Large Language\n Models","summary":" This study concentrates on evaluating the efficacy of Large Language Models\n(LLMs) in healthcare, with a specific focus on their application in personal\nanomalous health monitoring. Our research primarily investigates the\ncapabilities of LLMs in interpreting and analyzing physiological data obtained\nfrom FDA-approved devices. We conducted an extensive analysis using anomalous\nphysiological data gathered in a simulated low-air-pressure plateau\nenvironment. This allowed us to assess the precision and reliability of LLMs in\nunderstanding and evaluating users' health status with notable specificity. Our\nfindings reveal that LLMs exhibit exceptional performance in determining\nmedical indicators, including a Mean Absolute Error (MAE) of less than 1 beat\nper minute for heart rate and less than 1% for oxygen saturation (SpO2).\nFurthermore, the Mean Absolute Percentage Error (MAPE) for these evaluations\nremained below 1%, with the overall accuracy of health assessments surpassing\n85%. In image analysis tasks, such as interpreting photoplethysmography (PPG)\ndata, our specially adapted GPT models demonstrated remarkable proficiency,\nachieving less than 1 bpm error in cycle count and 7.28 MAE for heart rate\nestimation. This study highlights LLMs' dual role as health data analysis tools\nand pivotal elements in advanced AI health assistants, offering personalized\nhealth insights and recommendations within the future health assistant\nframework.\n","authors":["Jiankai Tang","Kegang Wang","Hongming Hu","Xiyuxing Zhang","Peiyu Wang","Xin Liu","Yuntao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12524v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05587v2","updated":"2023-11-21T10:38:16Z","published":"2023-11-09T18:47:33Z","title":"Bayesian Methods for Media Mix Modelling with shape and funnel effects","summary":" In recent years, significant progress in generative AI has highlighted the\nimportant role of physics-inspired models that utilize advanced mathematical\nconcepts based on fundamental physics principles to enhance artificial\nintelligence capabilities. Among these models, those based on diffusion\nequations have greatly improved image quality. This study aims to explore the\npotential uses of Maxwell-Boltzmann equation, which forms the basis of the\nkinetic theory of gases, and the Michaelis-Menten model in Marketing Mix\nModelling (MMM) applications. We propose incorporating these equations into\nHierarchical Bayesian models to analyse consumer behaviour in the context of\nadvertising. 
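The Michaelis-Menten equation mentioned in the media mix modelling abstract above is a simple saturating response curve; a sketch of it applied to advertising spend follows, with `v_max` and `k_m` as purely illustrative parameters. How the paper embeds this curve in a hierarchical Bayesian model is not shown here.

```python
import numpy as np

def michaelis_menten(spend: np.ndarray, v_max: float, k_m: float) -> np.ndarray:
    # Response saturates at v_max; k_m is the spend level giving half-saturation.
    return v_max * spend / (k_m + spend)

print(michaelis_menten(np.array([0.0, 10.0, 50.0, 500.0]), v_max=10.0, k_m=20.0))
```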
These equation sets excel in accurately describing the random\ndynamics in complex systems like social interactions and consumer-advertising\ninteractions.\n","authors":["Javier Marin"],"pdf_url":"https://arxiv.org/pdf/2311.05587v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12501v1","updated":"2023-11-21T10:20:34Z","published":"2023-11-21T10:20:34Z","title":"Fair Polylog-Approximate Low-Cost Hierarchical Clustering","summary":" Research in fair machine learning, and particularly clustering, has been\ncrucial in recent years given the many ethical controversies that modern\nintelligent systems have posed. Ahmadian et al. [2020] established the study of\nfairness in \\textit{hierarchical} clustering, a stronger, more structured\nvariant of its well-known flat counterpart, though their proposed algorithm\nthat optimizes for Dasgupta's [2016] famous cost function was highly\ntheoretical. Knittel et al. [2023] then proposed the first practical fair\napproximation for cost, however they were unable to break the\npolynomial-approximate barrier they posed as a hurdle of interest. We break\nthis barrier, proposing the first truly polylogarithmic-approximate low-cost\nfair hierarchical clustering, thus greatly bridging the gap between the best\nfair and vanilla hierarchical clustering approximations.\n","authors":["Marina Knittel","Max Springer","John Dickerson","MohammadTaghi Hajiaghayi"],"pdf_url":"https://arxiv.org/pdf/2311.12501v1.pdf","comment":"Accepted to NeurIPS '23 (16 pages, 5 figures)"},{"id":"http://arxiv.org/abs/2311.12495v1","updated":"2023-11-21T10:11:19Z","published":"2023-11-21T10:11:19Z","title":"Multi-Objective Reinforcement Learning based on Decomposition: A\n taxonomy and framework","summary":" Multi-objective reinforcement learning (MORL) extends traditional RL by\nseeking policies making different compromises among conflicting objectives. The\nrecent surge of interest in MORL has led to diverse studies and solving\nmethods, often drawing from existing knowledge in multi-objective optimization\nbased on decomposition (MOO/D). Yet, a clear categorization based on both RL\nand MOO/D is lacking in the existing literature. Consequently, MORL researchers\nface difficulties when trying to classify contributions within a broader\ncontext due to the absence of a standardized taxonomy. To tackle such an issue,\nthis paper introduces Multi-Objective Reinforcement Learning based on\nDecomposition (MORL/D), a novel methodology bridging RL and MOO literature. A\ncomprehensive taxonomy for MORL/D is presented, providing a structured\nfoundation for categorizing existing and potential MORL works. The introduced\ntaxonomy is then used to scrutinize MORL research, enhancing clarity and\nconciseness through well-defined categorization. Moreover, a flexible framework\nderived from the taxonomy is introduced. This framework accommodates diverse\ninstantiations using tools from both RL and MOO/D. Implementation across\nvarious configurations demonstrates its versatility, assessed against benchmark\nproblems. Results indicate MORL/D instantiations achieve comparable performance\nwith significantly greater versatility than current state-of-the-art\napproaches. By presenting the taxonomy and framework, this paper offers a\ncomprehensive perspective and a unified vocabulary for MORL. 
This not only\nfacilitates the identification of algorithmic contributions but also lays the\ngroundwork for novel research avenues in MORL, contributing to the continued\nadvancement of this field.\n","authors":["Florian Felten","El-Ghazali Talbi","Grégoire Danoy"],"pdf_url":"https://arxiv.org/pdf/2311.12495v1.pdf","comment":"Under review at JAIR"},{"id":"http://arxiv.org/abs/2311.12491v1","updated":"2023-11-21T10:05:32Z","published":"2023-11-21T10:05:32Z","title":"Heuristics for Detecting CoinJoin Transactions on the Bitcoin Blockchain","summary":" This research delves into the intricacies of Bitcoin, a decentralized\npeer-to-peer network, and its associated blockchain, which records all\ntransactions since its inception. While this ensures integrity and\ntransparency, the transparent nature of Bitcoin potentially compromises users'\nprivacy rights. To address this concern, users have adopted CoinJoin, a method\nthat amalgamates multiple transaction intents into a single, larger transaction\nto bolster transactional privacy. This process complicates individual\ntransaction tracing and disrupts many established blockchain analysis\nheuristics. Despite its significance, limited research has been conducted on\nidentifying CoinJoin transactions. Particularly noteworthy are varied CoinJoin\nimplementations such as JoinMarket, Wasabi, and Whirlpool, each presenting\ndistinct challenges due to their unique transaction structures. This study\ndelves deeply into the open-source implementations of these protocols, aiming\nto develop refined heuristics for identifying their transactions on the\nblockchain. Our exhaustive analysis covers transactions up to block 760,000,\noffering a comprehensive insight into CoinJoin transactions and their\nimplications for Bitcoin blockchain analysis.\n","authors":["Hugo Schnoering","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2311.12491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.05879v2","updated":"2023-11-21T10:04:37Z","published":"2021-08-12T17:53:47Z","title":"Feature Engineering with Regularity Structures","summary":" We investigate the use of models from the theory of regularity structures as\nfeatures in machine learning tasks. A model is a polynomial function of a\nspace-time signal designed to well-approximate solutions to partial\ndifferential equations (PDEs), even in low regularity regimes. Models can be\nseen as natural multi-dimensional generalisations of signatures of paths; our\nwork therefore aims to extend the recent use of signatures in data science\nbeyond the context of time-ordered data. We provide a flexible definition of a\nmodel feature vector associated to a space-time signal, along with two\nalgorithms which illustrate ways in which these features can be combined with\nlinear regression. We apply these algorithms in several numerical experiments\ndesigned to learn solutions to PDEs with a given forcing and boundary data. Our\nexperiments include semi-linear parabolic and wave equations with forcing, and\nBurgers' equation with no forcing. We find an advantage in favour of our\nalgorithms when compared to several alternative methods. Additionally, in the\nexperiment with Burgers' equation, we find non-trivial predictive power when\nnoise is added to the observations.\n","authors":["Ilya Chevyrev","Andris Gerasimovics","Hendrik Weber"],"pdf_url":"https://arxiv.org/pdf/2108.05879v2.pdf","comment":"33 pages, 7 figures, 7 tables. Improved presentation of model feature\n vector (Section 2) and experiments (Section 3). 
Added new experiment in 2D\n spatial domain (Section 3.1.2). To appear in Journal of Scientific Computing"},{"id":"http://arxiv.org/abs/2311.12490v1","updated":"2023-11-21T10:01:08Z","published":"2023-11-21T10:01:08Z","title":"Hyb-NeRF: A Multiresolution Hybrid Encoding for Neural Radiance Fields","summary":" Recent advances in Neural radiance fields (NeRF) have enabled high-fidelity\nscene reconstruction for novel view synthesis. However, NeRF requires hundreds\nof network evaluations per pixel to approximate a volume rendering integral,\nmaking it slow to train. Caching NeRFs into explicit data structures can\neffectively enhance rendering speed but at the cost of higher memory usage. To\naddress these issues, we present Hyb-NeRF, a novel neural radiance field with a\nmulti-resolution hybrid encoding that achieves efficient neural modeling and\nfast rendering, which also allows for high-quality novel view synthesis. The\nkey idea of Hyb-NeRF is to represent the scene using different encoding\nstrategies from coarse-to-fine resolution levels. Hyb-NeRF exploits\nmemory-efficiency learnable positional features at coarse resolutions and the\nfast optimization speed and local details of hash-based feature grids at fine\nresolutions. In addition, to further boost performance, we embed cone\ntracing-based features in our learnable positional encoding that eliminates\nencoding ambiguity and reduces aliasing artifacts. Extensive experiments on\nboth synthetic and real-world datasets show that Hyb-NeRF achieves faster\nrendering speed with better rending quality and even a lower memory footprint\nin comparison to previous state-of-the-art methods.\n","authors":["Yifan Wang","Yi Gong","Yuan Zeng"],"pdf_url":"https://arxiv.org/pdf/2311.12490v1.pdf","comment":"WACV2024"},{"id":"http://arxiv.org/abs/2205.02645v3","updated":"2023-11-21T09:48:28Z","published":"2022-05-05T13:44:24Z","title":"PyDaddy: A Python package for discovering stochastic dynamical equations\n from timeseries data","summary":" Stochastic differential equations (SDEs) are an important framework to model\ndynamics with randomness, as is common in most biological systems. The inverse\nproblem of integrating these models with empirical data remains a major\nchallenge. Here, we present a software package, PyDaDDy (Python Library for\nData Driven Dynamics) that takes time series data as an input and outputs an\ninterpretable SDE. We achieve this by combining traditional approaches from\nstochastic calculus literature with state-of-the-art equation discovery\ntechniques. We validate our approach on synthetic datasets, and demonstrate the\ngenerality and applicability of the method on two real-world datasets of vastly\ndifferent spatiotemporal scales: (i) collective movement of fish school where\nstochasticity plays a crucial role, and (ii) confined migration of a single\ncell, primarily following a relaxed oscillation. We make the method available\nas an easy-to-use, open-source Python package, PyDaddy (Python Library for Data\nDriven Dynamics).\n","authors":["Arshed Nabeel","Ashwin Karichannavar","Shuaib Palathingal","Jitesh Jhawar","David B. 
Brückner","Danny Raj M.","Vishwesha Guttal"],"pdf_url":"https://arxiv.org/pdf/2205.02645v3.pdf","comment":"15 pages (+ 9 page appendix), 6 figures (+ 8 appendix figures)"},{"id":"http://arxiv.org/abs/2211.15513v2","updated":"2023-11-21T09:42:12Z","published":"2022-11-25T09:41:07Z","title":"Composite Score for Anomaly Detection in Imbalanced Real-World\n Industrial Dataset","summary":" In recent years, the industrial sector has evolved towards its fourth\nrevolution. The quality control domain is particularly interested in advanced\nmachine learning for computer vision anomaly detection. Nevertheless, several\nchallenges have to be faced, including imbalanced datasets, the image\ncomplexity, and the zero-false-negative (ZFN) constraint to guarantee the\nhigh-quality requirement. This paper illustrates a use case for an industrial\npartner, where Printed Circuit Board Assembly (PCBA) images are first\nreconstructed with a Vector Quantized Generative Adversarial Network (VQGAN)\ntrained on normal products. Then, several multi-level metrics are extracted on\na few normal and abnormal images, highlighting anomalies through reconstruction\ndifferences. Finally, a classifer is trained to build a composite anomaly score\nthanks to the metrics extracted. This three-step approach is performed on the\npublic MVTec-AD datasets and on the partner PCBA dataset, where it achieves a\nregular accuracy of 95.69% and 87.93% under the ZFN constraint.\n","authors":["Arnaud Bougaham","Mohammed El Adoui","Isabelle Linden","Benoît Frénay"],"pdf_url":"https://arxiv.org/pdf/2211.15513v2.pdf","comment":"This version of the article has been accepted for publication, after\n peer review and is subject to Springer Nature AM terms of use, but is not the\n Version of Record and does not reflect post-acceptance improvements, or any\n corrections. The Version of Record is available online at:\n https://doi.org/10.1007/s10994-023-06415-9"},{"id":"http://arxiv.org/abs/2311.12476v1","updated":"2023-11-21T09:37:49Z","published":"2023-11-21T09:37:49Z","title":"MaskFlow: Object-Aware Motion Estimation","summary":" We introduce a novel motion estimation method, MaskFlow, that is capable of\nestimating accurate motion fields, even in very challenging cases with small\nobjects, large displacements and drastic appearance changes. In addition to\nlower-level features, that are used in other Deep Neural Network (DNN)-based\nmotion estimation methods, MaskFlow draws from object-level features and\nsegmentations. These features and segmentations are used to approximate the\nobjects' translation motion field. We propose a novel and effective way of\nincorporating the incomplete translation motion field into a subsequent motion\nestimation network for refinement and completion. We also produced a new\nchallenging synthetic dataset with motion field ground truth, and also provide\nextra ground truth for the object-instance matchings and corresponding\nsegmentation masks. We demonstrate that MaskFlow outperforms state of the art\nmethods when evaluated on our new challenging dataset, whilst still producing\ncomparable results on the popular FlyingThings3D benchmark dataset.\n","authors":["Aria Ahmadi","David R. 
Walton","Tim Atherton","Cagatay Dikici"],"pdf_url":"https://arxiv.org/pdf/2311.12476v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00018v2","updated":"2023-11-21T09:22:28Z","published":"2023-08-31T07:53:02Z","title":"Unsupervised discovery of Interpretable Visual Concepts","summary":" Providing interpretability of deep-learning models to non-experts, while\nfundamental for a responsible real-world usage, is challenging. Attribution\nmaps from xAI techniques, such as Integrated Gradients, are a typical example\nof a visualization technique containing a high level of information, but with\ndifficult interpretation. In this paper, we propose two methods, Maximum\nActivation Groups Extraction (MAGE) and Multiscale Interpretable Visualization\n(Ms-IV), to explain the model's decision, enhancing global interpretability.\nMAGE finds, for a given CNN, combinations of features which, globally, form a\nsemantic meaning, that we call concepts. We group these similar feature\npatterns by clustering in ``concepts'', that we visualize through Ms-IV. This\nlast method is inspired by Occlusion and Sensitivity analysis (incorporating\ncausality), and uses a novel metric, called Class-aware Order Correlation\n(CaOC), to globally evaluate the most important image regions according to the\nmodel's decision space. We compare our approach to xAI methods such as LIME and\nIntegrated Gradients. Experimental results evince the Ms-IV higher localization\nand faithfulness values. Finally, qualitative evaluation of combined MAGE and\nMs-IV demonstrates humans' ability to agree, based on the visualization, with\nthe decision of clusters' concepts; and, to detect, among a given set of\nnetworks, the existence of bias.\n","authors":["Caroline Mazini Rodrigues","Nicolas Boutry","Laurent Najman"],"pdf_url":"https://arxiv.org/pdf/2309.00018v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06617v3","updated":"2023-11-21T09:20:33Z","published":"2022-11-12T09:41:02Z","title":"Empirical Risk Minimization with Relative Entropy Regularization","summary":" The empirical risk minimization (ERM) problem with relative entropy\nregularization (ERM-RER) is investigated under the assumption that the\nreference measure is a {\\sigma}-finite measure, and not necessarily a\nprobability measure. Under this assumption, which leads to a generalization of\nthe ERM-RER problem allowing a larger degree of flexibility for incorporating\nprior knowledge, numerous relevant properties are stated. Among these\nproperties, the solution to this problem, if it exists, is shown to be a unique\nprobability measure, often mutually absolutely continuous with the reference\nmeasure. Such a solution exhibits a probably-approximately-correct guarantee\nfor the ERM problem independently of whether the latter possesses a solution.\nFor a fixed dataset, the empirical risk is shown to be a sub-Gaussian random\nvariable when the models are sampled from the solution to the ERM-RER problem.\nThe generalization capabilities of the solution to the ERM-RER problem (the\nGibbs algorithm) are studied via the sensitivity of the expected empirical risk\nto deviations from such a solution towards alternative probability measures.\nFinally, an interesting connection between sensitivity, generalization error,\nand lautum information is established\n","authors":["Samir M. 
Perlaza","Gaetan Bisson","Iñaki Esnaola","Alain Jean-Marie","Stefano Rini"],"pdf_url":"https://arxiv.org/pdf/2211.06617v3.pdf","comment":"Submitted to the the Transactions on Information Theory on June 12,\n 2023. Also available as: Research Report, INRIA, No. RR-9454, Centre Inria\n d'Universit\\'e C\\^ote d'Azur, Sophia Antipolis, France, Feb., 2022 This\n version contains the revision for Transactions on Information Theory on\n November 21, 2023"},{"id":"http://arxiv.org/abs/2310.20567v2","updated":"2023-11-21T09:19:06Z","published":"2023-10-31T15:56:17Z","title":"One-shot backpropagation for multi-step prediction in physics-based\n system identification -- EXTENDED VERSION","summary":" The aim of this paper is to present a novel physics-based framework for the\nidentification of dynamical systems, in which the physical and structural\ninsights are reflected directly into a backpropagation-based learning\nalgorithm. The main result is a method to compute in closed form the gradient\nof a multi-step loss function, while enforcing physical properties and\nconstraints. The derived algorithm has been exploited to identify the unknown\ninertia matrix of a space debris, and the results show the reliability of the\nmethod in capturing the physical adherence of the estimated parameters.\n","authors":["Cesare Donati","Martina Mammarella","Fabrizio Dabbene","Carlo Novara","Constantino Lagoa"],"pdf_url":"https://arxiv.org/pdf/2310.20567v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18168v3","updated":"2023-11-21T09:19:03Z","published":"2023-10-27T14:27:43Z","title":"Personas as a Way to Model Truthfulness in Language Models","summary":" Large Language Models (LLMs) are trained on vast amounts of text from the\ninternet, which contains both factual and misleading information about the\nworld. Can language models discern truth from falsehood in this contradicting\ndata? Expanding on the view that LLMs can model different communicative agents,\nwe present the persona hypothesis: LLMs can cluster agents into personas using\ncommon features of their generations. For instance, a truthful persona is a\ngroup of agents that are likely to produce truthful text and that share similar\nfeatures like formal writing styles and scientific references. By modeling this\npersona, LLMs can generalize truthfulness beyond the specific contexts in which\neach agent generated the training text. For example, the model can infer that\nthe agent ``Wikipedia'' will behave truthfully on topics that were only\ngenerated by ``Science'' because they both belong to the truthful persona. We\nshow evidence for the persona hypothesis via two observations: (1) we can probe\nwhether a model's answer will be truthful before it is generated; (2)\nfinetuning a model on a set of facts improves its truthfulness on unseen\ntopics. 
Next, using arithmetics as a synthetic environment, we show that\nlanguage models can separate true and false statements, and generalize\ntruthfulness across agents; but only if agents in the training data share a\ntruthful generative process that enables the creation of a truthful persona.\nOverall, our findings suggest that models can exploit hierarchical structures\nin the data to learn abstract concepts like truthfulness.\n","authors":["Nitish Joshi","Javier Rando","Abulhair Saparov","Najoung Kim","He He"],"pdf_url":"https://arxiv.org/pdf/2310.18168v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18183v2","updated":"2023-11-21T09:11:38Z","published":"2023-05-29T16:20:23Z","title":"On Counterfactual Data Augmentation Under Confounding","summary":" Counterfactual data augmentation has recently emerged as a method to mitigate\nconfounding biases in the training data. These biases, such as spurious\ncorrelations, arise due to various observed and unobserved confounding\nvariables in the data generation process. In this paper, we formally analyze\nhow confounding biases impact downstream classifiers and present a causal\nviewpoint to the solutions based on counterfactual data augmentation. We\nexplore how removing confounding biases serves as a means to learn invariant\nfeatures, ultimately aiding in generalization beyond the observed data\ndistribution. Additionally, we present a straightforward yet powerful algorithm\nfor generating counterfactual images, which effectively mitigates the influence\nof confounding effects on downstream classifiers. Through experiments on MNIST\nvariants and the CelebA datasets, we demonstrate how our simple augmentation\nmethod helps existing state-of-the-art methods achieve good results.\n","authors":["Abbavaram Gowtham Reddy","Saketh Bachu","Saloni Dash","Charchit Sharma","Amit Sharma","Vineeth N Balasubramanian"],"pdf_url":"https://arxiv.org/pdf/2305.18183v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01959v2","updated":"2023-11-21T09:01:17Z","published":"2023-10-03T11:10:21Z","title":"Beyond Labeling Oracles: What does it mean to steal ML models?","summary":" Model extraction attacks are designed to steal trained models with only query\naccess, as is often provided through APIs that ML-as-a-Service providers offer.\nML models are expensive to train, in part because data is hard to obtain, and a\nprimary incentive for model extraction is to acquire a model while incurring\nless cost than training from scratch. Literature on model extraction commonly\nclaims or presumes that the attacker is able to save on both data acquisition\nand labeling costs. We show that the attacker often does not. This is because\ncurrent attacks implicitly rely on the adversary being able to sample from the\nvictim model's data distribution. We thoroughly evaluate factors influencing\nthe success of model extraction. We discover that prior knowledge of the\nattacker, i.e. access to in-distribution data, dominates other factors like the\nattack policy the adversary follows to choose which queries to make to the\nvictim model API. Thus, an adversary looking to develop an equally capable\nmodel with a fixed budget has little practical incentive to perform model\nextraction, since for the attack to work they need to collect in-distribution\ndata, saving only on the cost of labeling. With low labeling costs in the\ncurrent market, the usefulness of such attacks is questionable. 
Ultimately, we\ndemonstrate that the effect of prior knowledge needs to be explicitly decoupled\nfrom the attack policy. To this end, we propose a benchmark to evaluate attack\npolicy directly.\n","authors":["Avital Shafran","Ilia Shumailov","Murat A. Erdogdu","Nicolas Papernot"],"pdf_url":"https://arxiv.org/pdf/2310.01959v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12439v1","updated":"2023-11-21T08:51:58Z","published":"2023-11-21T08:51:58Z","title":"Harnessing FPGA Technology for Enhanced Biomedical Computation","summary":" This research delves into sophisticated neural network frameworks like\nConvolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), Long\nShort-Term Memory Networks (LSTMs), and Deep Belief Networks (DBNs) for\nimproved analysis of ECG signals via Field Programmable Gate Arrays (FPGAs).\nThe MIT-BIH Arrhythmia Database serves as the foundation for training and\nevaluating our models, with added Gaussian noise to heighten the algorithms'\nresilience. The developed architectures incorporate various layers for specific\nprocessing and categorization functions, employing strategies such as the\nEarlyStopping callback and Dropout layer to prevent overfitting. Additionally,\nthis paper details the creation of a tailored Tensor Compute Unit (TCU)\naccelerator for the PYNQ Z1 platform. It provides a thorough methodology for\nimplementing FPGA-based machine learning, encompassing the configuration of the\nTensil toolchain in Docker, selection of architectures, PS-PL configuration,\nand the compilation and deployment of models. By evaluating performance\nindicators like latency and throughput, we showcase the efficacy of FPGAs in\nadvanced biomedical computing. This study ultimately serves as a comprehensive\nguide to optimizing neural network operations on FPGAs across various fields.\n","authors":["Nisanur Alici","Kayode Inadagbo","Murat Isik"],"pdf_url":"https://arxiv.org/pdf/2311.12439v1.pdf","comment":"Submitted to IEEE Transactions on Biomedical Circuits and Systems.\n arXiv admin note: substantial text overlap with arXiv:2307.07914"},{"id":"http://arxiv.org/abs/2310.08897v2","updated":"2023-11-21T08:51:03Z","published":"2023-10-13T06:58:52Z","title":"Self supervised convolutional kernel based handcrafted feature\n harmonization: Enhanced left ventricle hypertension disease phenotyping on\n echocardiography","summary":" Radiomics, a medical imaging technique, extracts quantitative handcrafted\nfeatures from images to predict diseases. Harmonization in those features\nensures consistent feature extraction across various imaging devices and\nprotocols. Methods for harmonization include standardized imaging protocols,\nstatistical adjustments, and evaluating feature robustness. Myocardial diseases\nsuch as Left Ventricular Hypertrophy (LVH) and Hypertensive Heart Disease (HHD)\nare diagnosed via echocardiography, but variable imaging settings pose\nchallenges. Harmonization techniques are crucial for applying handcrafted\nfeatures in disease diagnosis in such scenario. Self-supervised learning (SSL)\nenhances data understanding within limited datasets and adapts to diverse data\nsettings. ConvNeXt-V2 integrates convolutional layers into SSL, displaying\nsuperior performance in various tasks. This study focuses on convolutional\nfilters within SSL, using them as preprocessing to convert images into feature\nmaps for handcrafted feature harmonization. 
Our proposed method excelled in\nharmonization evaluation and exhibited superior LVH classification performance\ncompared to existing methods.\n","authors":["Jina Lee","Youngtaek Hong","Dawun Jeong","Yeonggul Jang","Sihyeon Jeong","Taekgeun Jung","Yeonyee E. Yoon","Inki Moon","Seung-Ah Lee","Hyuk-Jae Chang"],"pdf_url":"https://arxiv.org/pdf/2310.08897v2.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.12436v1","updated":"2023-11-21T08:45:09Z","published":"2023-11-21T08:45:09Z","title":"Classifier Calibration with ROC-Regularized Isotonic Regression","summary":" Calibration of machine learning classifiers is necessary to obtain reliable\nand interpretable predictions, bridging the gap between model confidence and\nactual probabilities. One prominent technique, isotonic regression (IR), aims\nat calibrating binary classifiers by minimizing the cross entropy on a\ncalibration set via monotone transformations. IR acts as an adaptive binning\nprocedure, which allows achieving a calibration error of zero, but leaves open\nthe issue of the effect on performance. In this paper, we first prove that IR\npreserves the convex hull of the ROC curve -- an essential performance metric\nfor binary classifiers. This ensures that a classifier is calibrated while\ncontrolling for overfitting of the calibration set. We then present a novel\ngeneralization of isotonic regression to accommodate classifiers with K\nclasses. Our method constructs a multidimensional adaptive binning scheme on\nthe probability simplex, again achieving a multi-class calibration error equal\nto zero. We regularize this algorithm by imposing a form of monotony that\npreserves the K-dimensional ROC surface of the classifier. We show empirically\nthat this general monotony criterion is effective in striking a balance between\nreducing cross entropy loss and avoiding overfitting of the calibration set.\n","authors":["Eugene Berta","Francis Bach","Michael Jordan"],"pdf_url":"https://arxiv.org/pdf/2311.12436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12435v1","updated":"2023-11-21T08:44:38Z","published":"2023-11-21T08:44:38Z","title":"Fair Enough? A map of the current limitations of the requirements to\n have \"fair'' algorithms","summary":" In recent years, the rise in the usage and efficiency of Artificial\nIntelligence and, more in general, of Automated Decision-Making systems has\nbrought with it an increasing and welcome awareness of the risks associated\nwith such systems. One such risk is that of perpetuating or even amplifying\nbias and unjust disparities present in the data from which many of these\nsystems learn to adjust and optimise their decisions. This awareness has on one\nside encouraged several scientific communities to come up with more and more\nappropriate ways and methods to assess, quantify, and possibly mitigate such\nbiases and disparities. On the other hand, it has prompted more and more layers\nof society, including policy makers, to call for ``fair'' algorithms. We\nbelieve that while a lot of excellent and multidisciplinary research is\ncurrently being conducted, what is still fundamentally missing is the awareness\nthat having ``fair'' algorithms is per se a nearly meaningless requirement,\nthat needs to be complemented with a lot of additional societal choices to\nbecome actionable. Namely, there is a hiatus between what society is\ndemanding from Automated Decision-Making systems, and what this demand actually\nmeans in real-world scenarios. 
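Returning to the "Classifier Calibration with ROC-Regularized Isotonic Regression" abstract above: a minimal scikit-learn sketch of the classic binary isotonic-regression calibration step it starts from is shown below, on synthetic miscalibrated scores; the paper's K-class, ROC-preserving generalization is not reproduced here.

```python
import numpy as np
from sklearn.isotonic import IsotonicRegression

rng = np.random.default_rng(0)
scores = rng.uniform(size=1000)                               # uncalibrated classifier scores
labels = (rng.uniform(size=1000) < scores ** 2).astype(int)   # true P(y=1|s) = s^2, so miscalibrated

iso = IsotonicRegression(y_min=0.0, y_max=1.0, out_of_bounds="clip")
iso.fit(scores, labels)                                       # monotone map from score to probability
calibrated = iso.predict(scores)
```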
In this work, we outline the key features of\nsuch a hiatus, and pinpoint a list of fundamental ambiguities and attention\npoints that we as a society must address in order to give a concrete meaning to\nthe increasing demand of fairness in Automated Decision-Making systems.\n","authors":["Alessandro Castelnovo","Nicole Inverardi","Gabriele Nanino","Ilaria Giuseppina Penco","Daniele Regoli"],"pdf_url":"https://arxiv.org/pdf/2311.12435v1.pdf","comment":"20 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2208.13405v4","updated":"2023-11-21T08:41:31Z","published":"2022-08-29T07:36:17Z","title":"Interpreting Black-box Machine Learning Models for High Dimensional\n Datasets","summary":" Deep neural networks (DNNs) have been shown to outperform traditional machine\nlearning algorithms in a broad variety of application domains due to their\neffectiveness in modeling complex problems and handling high-dimensional\ndatasets. Many real-life datasets, however, are of increasingly high\ndimensionality, where a large number of features may be irrelevant for both\nsupervised and unsupervised learning tasks. The inclusion of such features\nwould not only introduce unwanted noise but also increase computational\ncomplexity. Furthermore, due to high non-linearity and dependency among a large\nnumber of features, DNN models tend to be unavoidably opaque and perceived as\nblack-box methods because of their not well-understood internal functioning.\nTheir algorithmic complexity is often simply beyond the capacities of humans to\nunderstand the interplay among myriads of hyperparameters. A well-interpretable\nmodel can identify statistically significant features and explain the way they\naffect the model's outcome. In this paper, we propose an efficient method to\nimprove the interpretability of black-box models for classification tasks in\nthe case of high-dimensional datasets. First, we train a black-box model on a\nhigh-dimensional dataset to learn the embeddings on which the classification is\nperformed. To decompose the inner working principles of the black-box model and\nto identify top-k important features, we employ different probing and\nperturbing techniques. We then approximate the behavior of the black-box model\nby means of an interpretable surrogate model on the top-k feature space.\nFinally, we derive decision rules and local explanations from the surrogate\nmodel to explain individual decisions. Our approach outperforms\nstate-of-the-art methods like TabNet and XGboost when tested on different\ndatasets with varying dimensionality between 50 and 20,000 w.r.t metrics and\nexplainability.\n","authors":["Md. Rezaul Karim","Md. Shajalal","Alex Graß","Till Döhmen","Sisay Adugna Chala","Alexander Boden","Christian Beecks","Stefan Decker"],"pdf_url":"https://arxiv.org/pdf/2208.13405v4.pdf","comment":"This paper is currently under review in a journal"},{"id":"http://arxiv.org/abs/2311.12424v1","updated":"2023-11-21T08:32:38Z","published":"2023-11-21T08:32:38Z","title":"Looped Transformers are Better at Learning Learning Algorithms","summary":" Transformers have demonstrated effectiveness in \\emph{in-context solving}\ndata-fitting problems from various (latent) models, as reported by Garg et al.\nHowever, the absence of an inherent iterative structure in the transformer\narchitecture presents a challenge in emulating the iterative algorithms, which\nare commonly employed in traditional machine learning methods. 
To address this,\nwe propose the utilization of \\emph{looped} transformer architecture and its\nassociated training methodology, with the aim of incorporating iterative\ncharacteristics into the transformer architectures. Experimental results\nsuggest that the looped transformer achieves performance comparable to the\nstandard transformer in solving various data-fitting problems, while utilizing\nless than 10\\% of the parameter count.\n","authors":["Liu Yang","Kangwook Lee","Robert Nowak","Dimitris Papailiopoulos"],"pdf_url":"https://arxiv.org/pdf/2311.12424v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.05077v4","updated":"2023-11-21T08:29:16Z","published":"2022-06-10T13:18:26Z","title":"Tensor Train for Global Optimization Problems in Robotics","summary":" The convergence of many numerical optimization techniques is highly dependent\non the initial guess given to the solver. To address this issue, we propose a\nnovel approach that utilizes tensor methods to initialize existing optimization\nsolvers near global optima. Our method does not require access to a database of\ngood solutions. We first transform the cost function, which depends on both\ntask parameters and optimization variables, into a probability density\nfunction. Unlike existing approaches, the joint probability distribution of the\ntask parameters and optimization variables is approximated using the Tensor\nTrain model, which enables efficient conditioning and sampling. We treat the\ntask parameters as random variables, and for a given task, we generate samples\nfor decision variables from the conditional distribution to initialize the\noptimization solver. Our method can produce multiple solutions (when they\nexist) faster than existing methods. We first evaluate the approach on\nbenchmark functions for numerical optimization that are hard to solve using\ngradient-based optimization solvers with a naive initialization. The results\nshow that the proposed method can generate samples close to global optima and\nfrom multiple modes. We then demonstrate the generality and relevance of our\nframework to robotics by applying it to inverse kinematics with obstacles and\nmotion planning problems with a 7-DoF manipulator.\n","authors":["Suhan Shetty","Teguh Lembono","Tobias Loew","Sylvain Calinon"],"pdf_url":"https://arxiv.org/pdf/2206.05077v4.pdf","comment":"25 pages, 21 figures"},{"id":"http://arxiv.org/abs/2311.07784v2","updated":"2023-11-21T08:23:31Z","published":"2023-11-13T22:21:27Z","title":"A Data-Free Approach to Mitigate Catastrophic Forgetting in Federated\n Class Incremental Learning for Vision Tasks","summary":" Deep learning models often suffer from forgetting previously learned\ninformation when trained on new data. This problem is exacerbated in federated\nlearning (FL), where the data is distributed and can change independently for\neach user. Many solutions are proposed to resolve this catastrophic forgetting\nin a centralized setting. However, they do not apply directly to FL because of\nits unique complexities, such as privacy concerns and resource limitations. To\novercome these challenges, this paper presents a framework for\n$\\textbf{federated class incremental learning}$ that utilizes a generative\nmodel to synthesize samples from past distributions. This data can be later\nexploited alongside the training data to mitigate catastrophic forgetting. 
To\npreserve privacy, the generative model is trained on the server using data-free\nmethods at the end of each task without requesting data from clients. Moreover,\nour solution does not demand the users to store old data or models, which gives\nthem the freedom to join/leave the training at any time. Additionally, we\nintroduce SuperImageNet, a new regrouping of the ImageNet dataset specifically\ntailored for federated continual learning. We demonstrate significant\nimprovements compared to existing baselines through extensive experiments on\nmultiple datasets.\n","authors":["Sara Babakniya","Zalan Fabian","Chaoyang He","Mahdi Soltanolkotabi","Salman Avestimehr"],"pdf_url":"https://arxiv.org/pdf/2311.07784v2.pdf","comment":"Accepted in NeurIPS 2023. arXiv admin note: text overlap with\n arXiv:2307.00497"},{"id":"http://arxiv.org/abs/2008.12690v2","updated":"2023-11-21T08:18:01Z","published":"2020-08-28T14:46:56Z","title":"ROOT-SGD: Sharp Nonasymptotics and Asymptotic Efficiency in a Single\n Algorithm","summary":" We study the problem of solving strongly convex and smooth unconstrained\noptimization problems using stochastic first-order algorithms. We devise a\nnovel algorithm, referred to as \\emph{Recursive One-Over-T SGD} (\\ROOTSGD),\nbased on an easily implementable, recursive averaging of past stochastic\ngradients. We prove that it simultaneously achieves state-of-the-art\nperformance in both a finite-sample, nonasymptotic sense and an asymptotic\nsense. On the nonasymptotic side, we prove risk bounds on the last iterate of\n\\ROOTSGD with leading-order terms that match the optimal statistical risk with\na unity pre-factor, along with a higher-order term that scales at the sharp\nrate of $O(n^{-3/2})$ under the Lipschitz condition on the Hessian matrix. On\nthe asymptotic side, we show that when a mild, one-point Hessian continuity\ncondition is imposed, the rescaled last iterate of (multi-epoch) \\ROOTSGD\nconverges asymptotically to a Gaussian limit with the Cram\\'{e}r-Rao optimal\nasymptotic covariance, for a broad range of step-size choices.\n","authors":["Chris Junchi Li","Wenlong Mou","Martin J. Wainwright","Michael I. Jordan"],"pdf_url":"https://arxiv.org/pdf/2008.12690v2.pdf","comment":"Camera Ready, COLT 2022"},{"id":"http://arxiv.org/abs/2311.12419v1","updated":"2023-11-21T08:16:01Z","published":"2023-11-21T08:16:01Z","title":"Board-to-Board: Evaluating Moonboard Grade Prediction Generalization","summary":" Bouldering is a sport where athletes aim to climb up an obstacle using a set\nof defined holds called a route. Typically routes are assigned a grade to\ninform climbers of its difficulty and allow them to more easily track their\nprogression. However, the variation in individual climbers technical and\nphysical attributes and many nuances of an individual route make grading a\ndifficult and often biased task. In this work, we apply classical and\ndeep-learning modelling techniques to the 2016, 2017 and 2019 Moonboard\ndatasets, achieving state of the art grade prediction performance with 0.87 MAE\nand 1.12 RMSE. We achieve this performance on a feature-set that does not\nrequire decomposing routes into individual moves, which is a method common in\nliterature and introduces bias. We also demonstrate the generalization\ncapability of this model between editions and introduce a novel vision-based\nmethod of grade prediction. 
While the generalization performance of these\ntechniques is below human level performance currently, we propose these methods\nas a basis for future work. Such a tool could be implemented in pre-existing\nmobile applications and would allow climbers to better track their progress and\nassess new routes with reduced bias.\n","authors":["Daniel Petashvili","Matthew Rodda"],"pdf_url":"https://arxiv.org/pdf/2311.12419v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12410v1","updated":"2023-11-21T07:56:30Z","published":"2023-11-21T07:56:30Z","title":"nach0: Multimodal Natural and Chemical Languages Foundation Model","summary":" Large Language Models (LLMs) have substantially driven scientific progress in\nvarious domains, and many papers have demonstrated their ability to tackle\ncomplex problems with creative solutions. Our paper introduces a new foundation\nmodel, nach0, capable of solving various chemical and biological tasks:\nbiomedical question answering, named entity recognition, molecular generation,\nmolecular synthesis, attributes prediction, and others. nach0 is a multi-domain\nand multi-task encoder-decoder LLM pre-trained on unlabeled text from\nscientific literature, patents, and molecule strings to incorporate a range of\nchemical and linguistic knowledge. We employed instruction tuning, where\nspecific task-related instructions are utilized to fine-tune nach0 for the\nfinal set of tasks. To train nach0 effectively, we leverage the NeMo framework,\nenabling efficient parallel optimization of both base and large model versions.\nExtensive experiments demonstrate that our model outperforms state-of-the-art\nbaselines on single-domain and cross-domain tasks. Furthermore, it can generate\nhigh-quality outputs in molecular and textual formats, showcasing its\neffectiveness in multi-domain setups.\n","authors":["Micha Livne","Zulfat Miftahutdinov","Elena Tutubalina","Maksim Kuznetsov","Daniil Polykovskiy","Annika Brundyn","Aastha Jhunjhunwala","Anthony Costa","Alex Aliper","Alex Zhavoronkov"],"pdf_url":"https://arxiv.org/pdf/2311.12410v1.pdf","comment":"Submitted to Nature Communications"},{"id":"http://arxiv.org/abs/2202.06054v4","updated":"2023-11-21T07:47:04Z","published":"2022-02-12T12:42:36Z","title":"Towards Data-Algorithm Dependent Generalization: a Case Study on\n Overparameterized Linear Regression","summary":" One of the major open problems in machine learning is to characterize\ngeneralization in the overparameterized regime, where most traditional\ngeneralization bounds become inconsistent even for overparameterized linear\nregression. In many scenarios, this failure can be attributed to obscuring the\ncrucial interplay between the training algorithm and the underlying data\ndistribution. This paper demonstrate that the generalization behavior of\noverparameterized model should be analyzed in a both data-relevant and\nalgorithm-relevant manner. To make a formal characterization, We introduce a\nnotion called data-algorithm compatibility, which considers the generalization\nbehavior of the entire data-dependent training trajectory, instead of\ntraditional last-iterate analysis. We validate our claim by studying the\nsetting of solving overparameterized linear regression with gradient descent.\nSpecifically, we perform a data-dependent trajectory analysis and derive a\nsufficient condition for compatibility in such a setting. 
Our theoretical\nresults demonstrate that if we take early stopping iterates into consideration,\ngeneralization can hold with significantly weaker restrictions on the problem\ninstance than the previous last-iterate analysis.\n","authors":["Jing Xu","Jiaye Teng","Yang Yuan","Andrew Chi-Chih Yao"],"pdf_url":"https://arxiv.org/pdf/2202.06054v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09299v3","updated":"2023-11-21T07:34:26Z","published":"2023-10-07T09:09:19Z","title":"Digital Twin Assisted Deep Reinforcement Learning for Online Admission\n Control in Sliced Network","summary":" The proliferation of diverse wireless services in 5G and beyond has led to\nthe emergence of network slicing technologies. Among these, admission control\nplays a crucial role in achieving service-oriented optimization goals through\nthe selective acceptance of service requests. Although deep reinforcement\nlearning (DRL) forms the foundation in many admission control approaches thanks\nto its effectiveness and flexibility, initial instability with excessive\nconvergence delay of DRL models hinders their deployment in real-world\nnetworks. We propose a digital twin (DT) accelerated DRL solution to address\nthis issue. Specifically, we first formulate the admission decision-making\nprocess as a semi-Markov decision process, which is subsequently simplified\ninto an equivalent discrete-time Markov decision process to facilitate the\nimplementation of DRL methods. A neural network-based DT is established with a\ncustomized output layer for queuing systems, trained through supervised\nlearning, and then employed to assist the training phase of the DRL model.\nExtensive simulations show that the DT-accelerated DRL improves resource\nutilization by over 40% compared to the directly trained state-of-the-art\ndueling deep Q-learning model. This improvement is achieved while preserving\nthe model's capability to optimize the long-term rewards of the admission\nprocess.\n","authors":["Zhenyu Tao","Wei Xu","Xiaohu You"],"pdf_url":"https://arxiv.org/pdf/2310.09299v3.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.12399v1","updated":"2023-11-21T07:22:48Z","published":"2023-11-21T07:22:48Z","title":"A Survey of Graph Meets Large Language Model: Progress and Future\n Directions","summary":" Graph plays a significant role in representing and analyzing complex\nrelationships in real-world applications such as citation networks, social\nnetworks, and biological data. Recently, Large Language Models (LLMs), which\nhave achieved tremendous success in various domains, have also been leveraged\nin graph-related tasks to surpass traditional Graph Neural Networks (GNNs)\nbased methods and yield state-of-the-art performance. In this survey, we first\npresent a comprehensive review and analysis of existing methods that integrate\nLLMs with graphs. First of all, we propose a new taxonomy, which organizes\nexisting methods into three categories based on the role (i.e., enhancer,\npredictor, and alignment component) played by LLMs in graph-related tasks. Then\nwe systematically survey the representative methods along the three categories\nof the taxonomy. Finally, we discuss the remaining limitations of existing\nstudies and highlight promising avenues for future research. 
The relevant\npapers are summarized and will be consistently updated at:\nhttps://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks.\n","authors":["Yuhan Li","Zhixun Li","Peisong Wang","Jia Li","Xiangguo Sun","Hong Cheng","Jeffrey Xu Yu"],"pdf_url":"https://arxiv.org/pdf/2311.12399v1.pdf","comment":"Work in progress; 13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2304.07647v2","updated":"2023-11-21T07:21:50Z","published":"2023-04-15T22:24:05Z","title":"LASER: A Neuro-Symbolic Framework for Learning Spatial-Temporal Scene\n Graphs with Weak Supervision","summary":" We propose LASER, a neuro-symbolic approach to learn semantic video\nrepresentations that capture rich spatial and temporal properties in video data\nby leveraging high-level logic specifications. In particular, we formulate the\nproblem in terms of alignment between raw videos and spatio-temporal logic\nspecifications. The alignment algorithm leverages a differentiable symbolic\nreasoner and a combination of contrastive, temporal, and semantics losses. It\neffectively and efficiently trains low-level perception models to extract\nfine-grained video representation in the form of a spatio-temporal scene graph\nthat conforms to the desired high-level specification. In doing so, we explore\na novel methodology that weakly supervises the learning of video semantic\nrepresentations through logic specifications. We evaluate our method on two\ndatasets with rich spatial and temporal specifications:\n20BN-Something-Something and MUGEN. We demonstrate that our method learns\nbetter fine-grained video semantics than existing baselines.\n","authors":["Jiani Huang","Ziyang Li","Mayur Naik","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2304.07647v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.04589v3","updated":"2023-11-21T07:16:51Z","published":"2022-08-09T08:07:21Z","title":"Long-term Causal Effects Estimation via Latent Surrogates Representation\n Learning","summary":" Estimating long-term causal effects based on short-term surrogates is a\nsignificant but challenging problem in many real-world applications, e.g.,\nmarketing and medicine. Despite its success in certain domains, most existing\nmethods estimate causal effects in an idealistic and simplistic way - ignoring\nthe causal structure among short-term outcomes and treating all of them as\nsurrogates. However, such methods cannot be well applied to real-world\nscenarios, in which the partially observed surrogates are mixed with their\nproxies among short-term outcomes. To this end, we develop our flexible method,\nLaser, to estimate long-term causal effects in the more realistic situation\nthat the surrogates are observed or have observed proxies.Given the\nindistinguishability between the surrogates and proxies, we utilize\nidentifiable variational auto-encoder (iVAE) to recover the whole valid\nsurrogates on all the surrogates candidates without the need of distinguishing\nthe observed surrogates or the proxies of latent surrogates. With the help of\nthe recovered surrogates, we further devise an unbiased estimation of long-term\ncausal effects. 
Extensive experimental results on the real-world and\nsemi-synthetic datasets demonstrate the effectiveness of our proposed method.\n","authors":["Ruichu Cai","Weilin Chen","Zeqin Yang","Shu Wan","Chen Zheng","Xiaoqing Yang","Jiecheng Guo"],"pdf_url":"https://arxiv.org/pdf/2208.04589v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10777v2","updated":"2023-11-21T07:15:57Z","published":"2023-11-16T06:01:47Z","title":"A Systematic Review of Aspect-based Sentiment Analysis (ABSA): Domains,\n Methods, and Trends","summary":" Aspect-based Sentiment Analysis (ABSA) is a type of fine-grained sentiment\nanalysis (SA) that identifies aspects and the associated opinions from a given\ntext. In the digital era, ABSA gained increasing popularity and applications in\nmining opinionated text data to obtain insights and support decisions. ABSA\nresearch employs linguistic, statistical, and machine-learning approaches and\nutilises resources such as labelled datasets, aspect and sentiment lexicons and\nontology. By its nature, ABSA is domain-dependent and can be sensitive to the\nimpact of misalignment between the resource and application domains. However,\nto our knowledge, this topic has not been explored by the existing ABSA\nliterature reviews. In this paper, we present a Systematic Literature Review\n(SLR) of ABSA studies with a focus on the research application domain, dataset\ndomain, and the research methods to examine their relationships and identify\ntrends over time. Our results suggest a number of potential systemic issues in\nthe ABSA research literature, including the predominance of the\n``product/service review'' dataset domain among the majority of studies that\ndid not have a specific research application domain, coupled with the\nprevalence of dataset-reliant methods such as supervised machine learning. This\nreview makes a number of unique contributions to the ABSA research field: 1) To\nour knowledge, it is the first SLR that links the research domain, dataset\ndomain, and research method through a systematic perspective; 2) it is one of\nthe largest scoped SLR on ABSA, with 519 eligible studies filtered from 4191\nsearch results without time constraint; and 3) our review methodology adopted\nan innovative automatic filtering process based on PDF-mining, which enhanced\nscreening quality and reliability. Suggestions and our review limitations are\nalso discussed.\n","authors":["Yan Cathy Hua","Paul Denny","Katerina Taskova","Jörg Wicker"],"pdf_url":"https://arxiv.org/pdf/2311.10777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10770v2","updated":"2023-11-21T06:59:59Z","published":"2023-11-15T18:42:50Z","title":"Exponentially Faster Language Modelling","summary":" Language models only really need to use an exponential fraction of their\nneurons for individual inferences. As proof, we present UltraFastBERT, a BERT\nvariant that uses 0.3% of its neurons during inference while performing on par\nwith similar BERT models. UltraFastBERT selectively engages just 12 out of 4095\nneurons for each layer inference. This is achieved by replacing feedforward\nnetworks with fast feedforward networks (FFFs). While no truly efficient\nimplementation currently exists to unlock the full acceleration potential of\nconditional neural execution, we provide high-level CPU code achieving 78x\nspeedup over the optimized baseline feedforward implementation, and a PyTorch\nimplementation delivering 40x speedup over the equivalent batched feedforward\ninference. 
We publish our training code, benchmarking setup, and model weights.\n","authors":["Peter Belcak","Roger Wattenhofer"],"pdf_url":"https://arxiv.org/pdf/2311.10770v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12379v1","updated":"2023-11-21T06:41:41Z","published":"2023-11-21T06:41:41Z","title":"Infinite forecast combinations based on Dirichlet process","summary":" Forecast combination integrates information from various sources by\nconsolidating multiple forecast results from the target time series. Instead of\nthe need to select a single optimal forecasting model, this paper introduces a\ndeep learning ensemble forecasting model based on the Dirichlet process.\nInitially, the learning rate is sampled with three basis distributions as\nhyperparameters to convert the infinite mixture into a finite one. All\ncheckpoints are collected to establish a deep learning sub-model pool, and\nweight adjustment and diversity strategies are developed during the combination\nprocess. The main advantage of this method is its ability to generate the\nrequired base learners through a single training process, utilizing the\ndecaying strategy to tackle the challenge posed by the stochastic nature of\ngradient descent in determining the optimal learning rate. To ensure the\nmethod's generalizability and competitiveness, this paper conducts an empirical\nanalysis using the weekly dataset from the M4 competition and explores\nsensitivity to the number of models to be combined. The results demonstrate\nthat the ensemble model proposed offers substantial improvements in prediction\naccuracy and stability compared to a single benchmark model.\n","authors":["Yinuo Ren","Feng Li","Yanfei Kang"],"pdf_url":"https://arxiv.org/pdf/2311.12379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10863v2","updated":"2023-11-21T06:15:56Z","published":"2023-11-17T20:51:24Z","title":"Verified Compositional Neuro-Symbolic Control for Stochastic Systems\n with Temporal Logic Tasks","summary":" Several methods have been proposed recently to learn neural network (NN)\ncontrollers for autonomous agents, with unknown and stochastic dynamics, tasked\nwith complex missions captured by Linear Temporal Logic (LTL). Due to the\nsample-inefficiency of the majority of these works, compositional learning\nmethods have been proposed decomposing the LTL specification into smaller\nsub-tasks. Then, separate controllers are learned and composed to satisfy the\noriginal task. A key challenge within these approaches is that they often lack\nsafety guarantees or the provided guarantees are impractical. This paper aims\nto address this challenge. Particularly, we consider autonomous systems with\nunknown and stochastic dynamics and LTL-encoded tasks. We assume that the\nsystem is equipped with a finite set of base skills modeled by trained NN\nfeedback controllers. Our goal is to check if there exists a temporal\ncomposition of the trained NN controllers - and if so, to compute it - that\nwill yield a composite system behavior that satisfies the assigned LTL task\nwith probability one. We propose a new approach that relies on a novel\nintegration of automata theory and data-driven reachability analysis tools for\nNN-controlled stochastic systems. The resulting neuro-symbolic controller\nallows the agent to generate safe behaviors for unseen complex temporal logic\ntasks in a zero-shot fashion by leveraging its base skills. We show correctness\nof the proposed method and we provide conditions under which it is complete. 
To\nthe best of our knowledge, this is the first work that designs verified\ntemporal compositions of NN controllers for unknown and stochastic systems.\nFinally, we provide extensive numerical simulations and hardware experiments on\nrobot navigation tasks to demonstrate the proposed method.\n","authors":["Jun Wang","Kaiyuan Tan","Zihe Sun","Yiannis Kantaros"],"pdf_url":"https://arxiv.org/pdf/2311.10863v2.pdf","comment":"The paper was withdrawn as it did not include the correct author\n list, credit was given to the wrong author"},{"id":"http://arxiv.org/abs/2311.01038v2","updated":"2023-11-21T05:48:06Z","published":"2023-11-02T07:09:59Z","title":"Better with Less: A Data-Active Perspective on Pre-Training Graph Neural\n Networks","summary":" Pre-training on graph neural networks (GNNs) aims to learn transferable\nknowledge for downstream tasks with unlabeled data, and it has recently become\nan active research area. The success of graph pre-training models is often\nattributed to the massive amount of input data. In this paper, however, we\nidentify the curse of big data phenomenon in graph pre-training: more training\ndata do not necessarily lead to better downstream performance. Motivated by\nthis observation, we propose a better-with-less framework for graph\npre-training: fewer, but carefully chosen data are fed into a GNN model to\nenhance pre-training. The proposed pre-training pipeline is called the\ndata-active graph pre-training (APT) framework, and is composed of a graph\nselector and a pre-training model. The graph selector chooses the most\nrepresentative and instructive data points based on the inherent properties of\ngraphs as well as predictive uncertainty. The proposed predictive uncertainty,\nas feedback from the pre-training model, measures the confidence level of the\nmodel in the data. When fed with the chosen data, on the other hand, the\npre-training model grasps an initial understanding of the new, unseen data, and\nat the same time attempts to remember the knowledge learned from previous data.\nTherefore, the integration and interaction between these two components form a\nunified framework (APT), in which graph pre-training is performed in a\nprogressive and iterative way. Experiment results show that the proposed APT is\nable to obtain an efficient pre-training model with fewer training data and\nbetter downstream performance.\n","authors":["Jiarong Xu","Renhong Huang","Xin Jiang","Yuxuan Cao","Carl Yang","Chunping Wang","Yang Yang"],"pdf_url":"https://arxiv.org/pdf/2311.01038v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12359v1","updated":"2023-11-21T05:27:16Z","published":"2023-11-21T05:27:16Z","title":"Post-Training Quantization with Low-precision Minifloats and Integers on\n FPGAs","summary":" Post-Training Quantization (PTQ) is a powerful technique for model\ncompression, reducing the precision of neural networks without additional\ntraining overhead. Recent works have investigated adopting 8-bit floating-point\nquantization (FP8) in the context of PTQ for model inference. However, the\nexploration of floating-point formats smaller than 8 bits and their comparison\nwith integer quantization remains relatively limited. In this work, we present\nminifloats, which are reduced-precision floating-point formats capable of\nfurther reducing the memory footprint, latency, and energy cost of a model\nwhile approaching full-precision model accuracy. 
Our work presents a novel PTQ\ndesign-space exploration, comparing minifloat and integer quantization schemes\nacross a range of 3 to 8 bits for both weights and activations. We examine the\napplicability of various PTQ techniques to minifloats, including weight\nequalization, bias correction, SmoothQuant, gradient-based learned rounding,\nand the GPTQ method. Our experiments validate the effectiveness of\nlow-precision minifloats when compared to their integer counterparts across a\nspectrum of accuracy-precision trade-offs on a set of reference deep learning\nvision workloads. Finally, we evaluate our results against an FPGA-based\nhardware cost model, showing that integer quantization often remains the\nPareto-optimal option, given its relatively smaller hardware resource\nfootprint.\n","authors":["Shivam Aggarwal","Alessandro Pappalardo","Hans Jakob Damsgaard","Giuseppe Franco","Thomas B. Preußer","Michaela Blott","Tulika Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.12359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12358v1","updated":"2023-11-21T05:26:33Z","published":"2023-11-21T05:26:33Z","title":"Federated Learning via Consensus Mechanism on Heterogeneous Data: A New\n Perspective on Convergence","summary":" Federated learning (FL) on heterogeneous data (non-IID data) has recently\nreceived great attention. Most existing methods focus on studying the\nconvergence guarantees for the global objective. While these methods can\nguarantee the decrease of the global objective in each communication round,\nthey fail to ensure risk decrease for each client. In this paper, to address\nthe problem,we propose FedCOME, which introduces a consensus mechanism to\nenforce decreased risk for each client after each training round. In\nparticular, we allow a slight adjustment to a client's gradient on the server\nside, which generates an acute angle between the corrected gradient and the\noriginal ones of other clients. We theoretically show that the consensus\nmechanism can guarantee the convergence of the global objective. To generalize\nthe consensus mechanism to the partial participation FL scenario, we devise a\nnovel client sampling strategy to select the most representative clients for\nthe global data distribution. Training on these selected clients with the\nconsensus mechanism could empirically lead to risk decrease for clients that\nare not selected. Finally, we conduct extensive experiments on four benchmark\ndatasets to show the superiority of FedCOME against other state-of-the-art\nmethods in terms of effectiveness, efficiency and fairness. For\nreproducibility, we make our source code publicly available at:\n\\url{https://github.com/fedcome/fedcome}.\n","authors":["Shu Zheng","Tiandi Ye","Xiang Li","Ming Gao"],"pdf_url":"https://arxiv.org/pdf/2311.12358v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12356v1","updated":"2023-11-21T05:22:39Z","published":"2023-11-21T05:22:39Z","title":"Random Linear Projections Loss for Hyperplane-Based Optimization in\n Regression Neural Networks","summary":" Despite their popularity across a wide range of domains, regression neural\nnetworks are prone to overfitting complex datasets. In this work, we propose a\nloss function termed Random Linear Projections (RLP) loss, which is empirically\nshown to mitigate overfitting. With RLP loss, the distance between sets of\nhyperplanes connecting fixed-size subsets of the neural network's\nfeature-prediction pairs and feature-label pairs is minimized. 
The intuition\nbehind this loss derives from the notion that if two functions share the same\nhyperplanes connecting all subsets of feature-label pairs, then these functions\nmust necessarily be equivalent. Our empirical studies, conducted across\nbenchmark datasets and representative synthetic examples, demonstrate the\nimprovements of the proposed RLP loss over mean squared error (MSE).\nSpecifically, neural networks trained with the RLP loss achieve better\nperformance while requiring fewer data samples and are more robust to additive\nnoise. We provide theoretical analysis supporting our empirical findings.\n","authors":["Shyam Venkatasubramanian","Ahmed Aloui","Vahid Tarokh"],"pdf_url":"https://arxiv.org/pdf/2311.12356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12355v1","updated":"2023-11-21T05:15:56Z","published":"2023-11-21T05:15:56Z","title":"Utilizing Language Models for Tour Itinerary Recommendation","summary":" Tour itinerary recommendation involves planning a sequence of relevant\nPoint-of-Interest (POIs), which combines challenges from the fields of both\nOperations Research (OR) and Recommendation Systems (RS). As an OR problem,\nthere is the need to maximize a certain utility (e.g., popularity of POIs in\nthe tour) while adhering to some constraints (e.g., maximum time for the tour).\nAs a RS problem, it is heavily related to problem or filtering or ranking a\nsubset of POIs that are relevant to a user and recommending it as part of an\nitinerary. In this paper, we explore the use of language models for the task of\ntour itinerary recommendation and planning. This task has the unique\nrequirement of recommending personalized POIs relevant to users and planning\nthese POIs as an itinerary that satisfies various constraints. We discuss some\napproaches in this area, such as using word embedding techniques like Word2Vec\nand GloVe for learning POI embeddings and transformer-based techniques like\nBERT for generating\n itineraries.\n","authors":["Ngai Lam Ho","Kwan Hui Lim"],"pdf_url":"https://arxiv.org/pdf/2311.12355v1.pdf","comment":"PMAI23 @IJCAI 2023 2nd International Workshop on Process Management\n in the AI era"},{"id":"http://arxiv.org/abs/2204.09157v2","updated":"2023-11-21T05:06:33Z","published":"2022-04-19T23:19:05Z","title":"Multifidelity Deep Operator Networks For Data-Driven and\n Physics-Informed Problems","summary":" Operator learning for complex nonlinear systems is increasingly common in\nmodeling multi-physics and multi-scale systems. However, training such\nhigh-dimensional operators requires a large amount of expensive, high-fidelity\ndata, either from experiments or simulations. In this work, we present a\ncomposite Deep Operator Network (DeepONet) for learning using two datasets with\ndifferent levels of fidelity to accurately learn complex operators when\nsufficient high-fidelity data is not available. Additionally, we demonstrate\nthat the presence of low-fidelity data can improve the predictions of\nphysics-informed learning with DeepONets. We demonstrate the new multi-fidelity\ntraining in diverse examples, including modeling of the ice-sheet dynamics of\nthe Humboldt glacier, Greenland, using two different fidelity models and also\nusing the same physical model at two different resolutions.\n","authors":["Amanda A. Howard","Mauro Perego","George E. 
Karniadakis","Panos Stinis"],"pdf_url":"https://arxiv.org/pdf/2204.09157v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12351v1","updated":"2023-11-21T04:59:17Z","published":"2023-11-21T04:59:17Z","title":"Advancing Transformer Architecture in Long-Context Large Language\n Models: A Comprehensive Survey","summary":" With the bomb ignited by ChatGPT, Transformer-based Large Language Models\n(LLMs) have paved a revolutionary path toward Artificial General Intelligence\n(AGI) and have been applied in diverse areas as knowledge bases, human\ninterfaces, and dynamic agents. However, a prevailing limitation exists: many\ncurrent LLMs, constrained by resources, are primarily pre-trained on shorter\ntexts, rendering them less effective for longer-context prompts, commonly\nencountered in real-world settings. In this paper, we present a comprehensive\nsurvey focusing on the advancement of model architecture in Transformer-based\nLLMs to optimize long-context capabilities across all stages from pre-training\nto inference. We firstly delineate and analyze the problems of handling\nlong-context input and output with the current Transformer-based models. Then,\nwe mainly offer a holistic taxonomy to navigate the landscape of Transformer\nupgrades on architecture to solve these problems. Afterward, we provide the\ninvestigation on wildly used evaluation necessities tailored for long-context\nLLMs, including datasets, metrics, and baseline models, as well as some amazing\noptimization toolkits like libraries, systems, and compilers to augment LLMs'\nefficiency and efficacy across different stages. Finally, we further discuss\nthe predominant challenges and potential avenues for future research in this\ndomain. Additionally, we have established a repository where we curate relevant\nliterature with real-time updates at\nhttps://github.com/Strivin0311/long-llms-learning.\n","authors":["Yunpeng Huang","Jingwei Xu","Zixu Jiang","Junyu Lai","Zenan Li","Yuan Yao","Taolue Chen","Lijuan Yang","Zhou Xin","Xiaoxing Ma"],"pdf_url":"https://arxiv.org/pdf/2311.12351v1.pdf","comment":"35 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.06483v2","updated":"2023-11-21T04:53:27Z","published":"2023-11-11T05:43:54Z","title":"Stacked networks improve physics-informed training: applications to\n neural networks and deep operator networks","summary":" Physics-informed neural networks and operator networks have shown promise for\neffectively solving equations modeling physical systems. However, these\nnetworks can be difficult or impossible to train accurately for some systems of\nequations. We present a novel multifidelity framework for stacking\nphysics-informed neural networks and operator networks that facilitates\ntraining. We successively build a chain of networks, where the output at one\nstep can act as a low-fidelity input for training the next step, gradually\nincreasing the expressivity of the learned model. The equations imposed at each\nstep of the iterative process can be the same or different (akin to simulated\nannealing). 
The iterative (stacking) nature of the proposed method allows us to\nprogressively learn features of a solution that are hard to learn directly.\nThrough benchmark problems including a nonlinear pendulum, the wave equation,\nand the viscous Burgers equation, we show how stacking can be used to improve\nthe accuracy and reduce the required size of physics-informed neural networks\nand operator networks.\n","authors":["Amanda A Howard","Sarah H Murphy","Shady E Ahmed","Panos Stinis"],"pdf_url":"https://arxiv.org/pdf/2311.06483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12345v1","updated":"2023-11-21T04:38:21Z","published":"2023-11-21T04:38:21Z","title":"Stable Diffusion For Aerial Object Detection","summary":" Aerial object detection is a challenging task, in which one major obstacle\nlies in the limitations of large-scale data collection and the long-tail\ndistribution of certain classes. Synthetic data offers a promising solution,\nespecially with recent advances in diffusion-based methods like stable\ndiffusion (SD). However, the direct application of diffusion methods to aerial\ndomains poses unique challenges: stable diffusion's optimization for rich\nground-level semantics doesn't align with the sparse nature of aerial objects,\nand the extraction of post-synthesis object coordinates remains problematic. To\naddress these challenges, we introduce a synthetic data augmentation framework\ntailored for aerial images. It encompasses sparse-to-dense region of interest\n(ROI) extraction to bridge the semantic gap, fine-tuning the diffusion model\nwith low-rank adaptation (LORA) to circumvent exhaustive retraining, and\nfinally, a Copy-Paste method to compose synthesized objects with backgrounds,\nproviding a nuanced approach to aerial object detection through synthetic data.\n","authors":["Yanan Jian","Fuxun Yu","Simranjit Singh","Dimitrios Stamoulis"],"pdf_url":"https://arxiv.org/pdf/2311.12345v1.pdf","comment":"Accepted at NeurIPS 2023 Synthetic Data Generation with Generative AI\n workshop"},{"id":"http://arxiv.org/abs/2304.14922v2","updated":"2023-11-21T04:25:31Z","published":"2023-04-24T05:21:10Z","title":"Supervised and Unsupervised Deep Learning Approaches for EEG Seizure\n Prediction","summary":" Epilepsy affects more than 50 million people worldwide, making it one of the\nworld's most prevalent neurological diseases. The main symptom of epilepsy is\nseizures, which occur abruptly and can cause serious injury or death. The\nability to predict the occurrence of an epileptic seizure could alleviate many\nrisks and stresses people with epilepsy face. We formulate the problem of\ndetecting preictal (or pre-seizure) with reference to normal EEG as a precursor\nto incoming seizure. To this end, we developed several supervised deep learning\napproaches model to identify preictal EEG from normal EEG. We further develop\nnovel unsupervised deep learning approaches to train the models on only normal\nEEG, and detecting pre-seizure EEG as an anomalous event. These deep learning\nmodels were trained and evaluated on two large EEG seizure datasets in a\nperson-specific manner. We found that both supervised and unsupervised\napproaches are feasible; however, their performance varies depending on the\npatient, approach and architecture. This new line of research has the potential\nto develop therapeutic interventions and save human lives.\n","authors":["Zakary Georgis-Yap","Milos R. Popovic","Shehroz S. 
Khan"],"pdf_url":"https://arxiv.org/pdf/2304.14922v2.pdf","comment":"16 figures, 9 tables"},{"id":"http://arxiv.org/abs/2307.01452v2","updated":"2023-11-21T03:43:15Z","published":"2023-07-04T03:00:43Z","title":"Causal Reinforcement Learning: A Survey","summary":" Reinforcement learning is an essential paradigm for solving sequential\ndecision problems under uncertainty. Despite many remarkable achievements in\nrecent decades, applying reinforcement learning methods in the real world\nremains challenging. One of the main obstacles is that reinforcement learning\nagents lack a fundamental understanding of the world and must therefore learn\nfrom scratch through numerous trial-and-error interactions. They may also face\nchallenges in providing explanations for their decisions and generalizing the\nacquired knowledge. Causality, however, offers a notable advantage as it can\nformalize knowledge in a systematic manner and leverage invariance for\neffective knowledge transfer. This has led to the emergence of causal\nreinforcement learning, a subfield of reinforcement learning that seeks to\nenhance existing algorithms by incorporating causal relationships into the\nlearning process. In this survey, we comprehensively review the literature on\ncausal reinforcement learning. We first introduce the basic concepts of\ncausality and reinforcement learning, and then explain how causality can\naddress core challenges in non-causal reinforcement learning. We categorize and\nsystematically review existing causal reinforcement learning approaches based\non their target problems and methodologies. Finally, we outline open issues and\nfuture directions in this emerging field.\n","authors":["Zhihong Deng","Jing Jiang","Guodong Long","Chengqi Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.01452v2.pdf","comment":"52 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.12329v1","updated":"2023-11-21T03:42:15Z","published":"2023-11-21T03:42:15Z","title":"Graph Neural Ordinary Differential Equations-based method for\n Collaborative Filtering","summary":" Graph Convolution Networks (GCNs) are widely considered state-of-the-art for\ncollaborative filtering. Although several GCN-based methods have been proposed\nand achieved state-of-the-art performance in various tasks, they can be\ncomputationally expensive and time-consuming to train if too many layers are\ncreated. However, since the linear GCN model can be interpreted as a\ndifferential equation, it is possible to transfer it to an ODE problem. This\ninspired us to address the computational limitations of GCN-based models by\ndesigning a simple and efficient NODE-based model that can skip some GCN layers\nto reach the final state, thus avoiding the need to create many layers. In this\nwork, we propose a Graph Neural Ordinary Differential Equation-based method for\nCollaborative Filtering (GODE-CF). This method estimates the final embedding by\nutilizing the information captured by one or two GCN layers. To validate our\napproach, we conducted experiments on multiple datasets. The results\ndemonstrate that our model outperforms competitive baselines, including\nGCN-based models and other state-of-the-art CF methods. Notably, our proposed\nGODE-CF model has several advantages over traditional GCN-based models. It is\nsimple, efficient, and has a fast training time, making it a practical choice\nfor real-world situations.\n","authors":["Ke Xu","Yuanjie Zhu","Weizhi Zhang","Philip S. 
Yu"],"pdf_url":"https://arxiv.org/pdf/2311.12329v1.pdf","comment":"Accepted by ICDM 2023"},{"id":"http://arxiv.org/abs/2311.12323v1","updated":"2023-11-21T03:34:20Z","published":"2023-11-21T03:34:20Z","title":"Modeling Political Orientation of Social Media Posts: An Extended\n Analysis","summary":" Developing machine learning models to characterize political polarization on\nonline social media presents significant challenges. These challenges mainly\nstem from various factors such as the lack of annotated data, presence of noise\nin social media datasets, and the sheer volume of data. The common research\npractice typically examines the biased structure of online user communities for\na given topic or qualitatively measuring the impacts of polarized topics on\nsocial media. However, there is limited work focusing on analyzing polarization\nat the ground-level, specifically in the social media posts themselves. Such\nexisting analysis heavily relies on annotated data, which often requires\nlaborious human labeling, offers labels only to specific problems, and lacks\nthe ability to determine the near-future bias state of a social media\nconversations. Understanding the degree of political orientation conveyed in\nsocial media posts is crucial for quantifying the bias of online user\ncommunities and investigating the spread of polarized content. In this work, we\nfirst introduce two heuristic methods that leverage on news media bias and post\ncontent to label social media posts. Next, we compare the efficacy and quality\nof heuristically labeled dataset with a randomly sampled human-annotated\ndataset. Additionally, we demonstrate that current machine learning models can\nexhibit improved performance in predicting political orientation of social\nmedia posts, employing both traditional supervised learning and few-shot\nlearning setups. We conduct experiments using the proposed heuristic methods\nand machine learning approaches to predict the political orientation of posts\ncollected from two social media forums with diverse political ideologies: Gab\nand Twitter.\n","authors":["Sadia Kamal","Brenner Little","Jade Gullic","Trevor Harms","Kristin Olofsson","Arunkumar Bagavathi"],"pdf_url":"https://arxiv.org/pdf/2311.12323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13108v2","updated":"2023-11-21T03:25:56Z","published":"2023-09-22T18:00:01Z","title":"Data is often loadable in short depth: Quantum circuits from tensor\n networks for finance, images, fluids, and proteins","summary":" Though there has been substantial progress in developing quantum algorithms\nto study classical datasets, the cost of simply loading classical data is an\nobstacle to quantum advantage. When the amplitude encoding is used, loading an\narbitrary classical vector requires up to exponential circuit depths with\nrespect to the number of qubits. Here, we address this \"input problem\" with two\ncontributions. First, we introduce a circuit compilation method based on tensor\nnetwork (TN) theory. Our method -- AMLET (Automatic Multi-layer Loader\nExploiting TNs) -- proceeds via careful construction of a specific TN topology\nand can be tailored to arbitrary circuit depths. Second, we perform numerical\nexperiments on real-world classical data from four distinct areas: finance,\nimages, fluid mechanics, and proteins. To the best of our knowledge, this is\nthe broadest numerical analysis to date of loading classical data into a\nquantum computer. 
Consistent with other recent work in this area, the required\ncircuit depths are often several orders of magnitude lower than the\nexponentially-scaling general loading algorithm would require. Besides\nintroducing a more efficient loading algorithm, this work demonstrates that\nmany classical datasets are loadable in depths that are much shorter than\npreviously expected, which has positive implications for speeding up classical\nworkloads on quantum computers.\n","authors":["Raghav Jumade","Nicolas PD Sawaya"],"pdf_url":"https://arxiv.org/pdf/2309.13108v2.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.03488v3","updated":"2023-11-21T03:08:37Z","published":"2023-11-06T19:52:55Z","title":"Multi-Resolution Diffusion for Privacy-Sensitive Recommender Systems","summary":" While recommender systems have become an integral component of the Web\nexperience, their heavy reliance on user data raises privacy and security\nconcerns. Substituting user data with synthetic data can address these\nconcerns, but accurately replicating these real-world datasets has been a\nnotoriously challenging problem. Recent advancements in generative AI have\ndemonstrated the impressive capabilities of diffusion models in generating\nrealistic data across various domains. In this work we introduce a Score-based\nDiffusion Recommendation Module (SDRM), which captures the intricate patterns\nof real-world datasets required for training highly accurate recommender\nsystems. SDRM allows for the generation of synthetic data that can replace\nexisting datasets to preserve user privacy, or augment existing datasets to\naddress excessive data sparsity. Our method outperforms competing baselines\nsuch as generative adversarial networks, variational autoencoders, and recently\nproposed diffusion models in synthesizing various datasets to replace or\naugment the original data by an average improvement of 4.30% in Recall@$k$ and\n4.65% in NDCG@$k$.\n","authors":["Derek Lilienthal","Paul Mello","Magdalini Eirinaki","Stas Tiomkin"],"pdf_url":"https://arxiv.org/pdf/2311.03488v3.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.12310v1","updated":"2023-11-21T03:02:33Z","published":"2023-11-21T03:02:33Z","title":"IEKM: A Model Incorporating External Keyword Matrices","summary":" A customer service platform system with a core text semantic similarity (STS)\ntask faces two urgent challenges: Firstly, one platform system needs to adapt\nto different domains of customers, i.e., different domains adaptation (DDA).\nSecondly, it is difficult for the model of the platform system to distinguish\nsentence pairs that are literally close but semantically different, i.e., hard\nnegative samples. In this paper, we propose an incorporation external keywords\nmatrices model (IEKM) to address these challenges. The model uses external\ntools or dictionaries to construct external matrices and fuses them to the\nself-attention layers of the Transformer structure through gating units, thus\nenabling flexible corrections to the model results. We evaluate the method on\nmultiple datasets and the results show that our method has improved performance\non all datasets. To demonstrate that our method can effectively solve all the\nabove challenges, we conduct a flexible correction experiment, which results in\nan increase in the F1 value from 56.61 to 73.53. 
Our code will be publicly\navailable.\n","authors":["Cheng Luo","Qin Li","Zhao Yan","Mengliang Rao","Yunbo Cao"],"pdf_url":"https://arxiv.org/pdf/2311.12310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12309v1","updated":"2023-11-21T03:02:30Z","published":"2023-11-21T03:02:30Z","title":"Power grid operational risk assessment using graph neural network\n surrogates","summary":" We investigate the utility of graph neural networks (GNNs) as proxies of\npower grid operational decision-making algorithms (optimal power flow (OPF) and\nsecurity-constrained unit commitment (SCUC)) to enable rigorous quantification\nof the operational risk. To conduct principled risk analysis, numerous Monte\nCarlo (MC) samples are drawn from the (foretasted) probability distributions of\nspatio-temporally correlated stochastic grid variables. The corresponding OPF\nand SCUC solutions, which are needed to quantify the risk, are generated using\ntraditional OPF and SCUC solvers to generate data for training GNN model(s).\nThe GNN model performance is evaluated in terms of the accuracy of predicting\nquantities of interests (QoIs) derived from the decision variables in OPF and\nSCUC. Specifically, we focus on thermal power generation and load shedding at\nsystem and individual zone level. We also perform reliability and risk\nquantification based on GNN predictions and compare with that obtained from\nOPF/SCUC solutions. Our results demonstrate that GNNs are capable of providing\nfast and accurate prediction of QoIs and thus can be good surrogate models for\nOPF and SCUC. The excellent accuracy of GNN-based reliability and risk\nassessment further suggests that GNN surrogate has the potential to be applied\nin real-time and hours-ahead risk quantification.\n","authors":["Yadong Zhang","Pranav M Karve","Sankaran Mahadevan"],"pdf_url":"https://arxiv.org/pdf/2311.12309v1.pdf","comment":"Manuscript submitted to IEEE PES GM 2024"},{"id":"http://arxiv.org/abs/2311.12304v1","updated":"2023-11-21T02:46:14Z","published":"2023-11-21T02:46:14Z","title":"Discovering Effective Policies for Land-Use Planning","summary":" How areas of land are allocated for different uses, such as forests, urban,\nand agriculture, has a large effect on carbon balance, and therefore climate\nchange. Based on available historical data on changes in land use and a\nsimulation of carbon emissions/absorption, a surrogate model can be learned\nthat makes it possible to evaluate the different options available to\ndecision-makers efficiently. An evolutionary search process can then be used to\ndiscover effective land-use policies for specific locations. Such a system was\nbuilt on the Project Resilience platform and evaluated with the Land-Use\nHarmonization dataset and the BLUE simulator. It generates Pareto fronts that\ntrade off carbon impact and amount of change customized to different locations,\nthus providing a potentially useful tool for land-use planning.\n","authors":["Risto Miikkulainen","Olivier Francon","Daniel Young","Elliot Meyerson","Babak Hodjat"],"pdf_url":"https://arxiv.org/pdf/2311.12304v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12303v1","updated":"2023-11-21T02:45:53Z","published":"2023-11-21T02:45:53Z","title":"Detecting subtle macroscopic changes in a finite temperature classical\n scalar field with machine learning","summary":" The ability to detect macroscopic changes is important for probing the\nbehaviors of experimental many-body systems from the classical to the quantum\nrealm. 
Although abrupt changes near phase boundaries can easily be detected,\nsubtle macroscopic changes are much more difficult to detect as the changes can\nbe obscured by noise. In this study, as a toy model for detecting subtle\nmacroscopic changes in many-body systems, we try to differentiate scalar field\nsamples at varying temperatures. We compare different methods for making such\ndifferentiations, from physics method, statistics method, to AI method. Our\nfinding suggests that the AI method outperforms both the statistical method and\nthe physics method in its sensitivity. Our result provides a proof-of-concept\nthat AI can potentially detect macroscopic changes in many-body systems that\nelude physical measures.\n","authors":["Jiming Yang","Yutong Zheng","Jiahong Zhou","Huiyu Li","Jun Yin"],"pdf_url":"https://arxiv.org/pdf/2311.12303v1.pdf","comment":"10 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.13230v2","updated":"2023-11-21T02:28:21Z","published":"2023-10-20T02:40:05Z","title":"Absolute Policy Optimization","summary":" In recent years, trust region on-policy reinforcement learning has achieved\nimpressive results in addressing complex control tasks and gaming scenarios.\nHowever, contemporary state-of-the-art algorithms within this category\nprimarily emphasize improvement in expected performance, lacking the ability to\ncontrol over the worst-case performance outcomes. To address this limitation,\nwe introduce a novel objective function; by optimizing which, it will lead to\nguaranteed monotonic improvement in the lower bound of near-total performance\nsamples (absolute performance). Considering this groundbreaking theoretical\nadvancement, we then refine this theoretically grounded algorithm through a\nseries of approximations, resulting in a practical solution called Absolute\nPolicy Optimization (APO). Our experiments demonstrate the effectiveness of our\napproach across challenging continuous control benchmark tasks and extend its\napplicability to mastering Atari games. Our findings reveal that APO\nsignificantly outperforms state-of-the-art policy gradient algorithms,\nresulting in substantial improvements in both expected performance and\nworst-case performance.\n","authors":["Weiye Zhao","Feihan Li","Yifan Sun","Rui Chen","Tianhao Wei","Changliu Liu"],"pdf_url":"https://arxiv.org/pdf/2310.13230v2.pdf","comment":"I submitted this article to Journal of Machine Learning Research. The\n manuscript will go under a major revision and I don't want the reviewer know\n who I am. I will re-upload after JMLR review released"},{"id":"http://arxiv.org/abs/2311.12292v1","updated":"2023-11-21T02:24:52Z","published":"2023-11-21T02:24:52Z","title":"Mapping \"Brain Coral\" Regions on Mars using Deep Learning","summary":" One of the main objectives of the Mars Exploration Program is to search for\nevidence of past or current life on the planet. To achieve this, Mars\nexploration has been focusing on regions that may have liquid or frozen water.\nA set of critical areas may have seen cycles of ice thawing in the relatively\nrecent past in response to periodic changes in the obliquity of Mars. In this\nwork, we use convolutional neural networks to detect surface regions containing\n\"Brain Coral\" terrain, a landform on Mars whose similarity in morphology and\nscale to sorted stone circles on Earth suggests that it may have formed as a\nconsequence of freeze/thaw cycles. 
We use large images (~100-1000 megapixels)\nfrom the Mars Reconnaissance Orbiter to search for these landforms at\nresolutions close to a few tens of centimeters per pixel (~25--50 cm). Over\n52,000 images (~28 TB) were searched (~5% of the Martian surface) where we\nfound detections in over 200 images. To expedite the processing we leverage a\nclassifier network (prior to segmentation) in the Fourier domain that can take\nadvantage of JPEG compression by leveraging blocks of coefficients from a\ndiscrete cosine transform in lieu of decoding the entire image at the full\nspatial resolution. The hybrid pipeline approach maintains ~93% accuracy while\ncutting down on ~95% of the total processing time compared to running the\nsegmentation network at the full resolution on every image. The timely\nprocessing of big data sets helps inform mission operations, geologic surveys\nto prioritize candidate landing sites, avoid hazardous areas, or map the\nspatial extent of certain terrain. The segmentation masks and source code are\navailable on Github for the community to explore and build upon.\n","authors":["Kyle A. Pearson","Eldar Noe","Daniel Zhao","Alphan Altinok","Alex Morgan"],"pdf_url":"https://arxiv.org/pdf/2311.12292v1.pdf","comment":"Submitted for publication, seeking comments from the community. Code\n available: https://github.com/pearsonkyle/Mars-Brain-Coral-Network"},{"id":"http://arxiv.org/abs/2311.10899v2","updated":"2023-11-21T02:16:27Z","published":"2023-11-17T22:44:05Z","title":"Extraction and Summarization of Explicit Video Content using Multi-Modal\n Deep Learning","summary":" With the increase in video-sharing platforms across the internet, it is\ndifficult for humans to moderate the data for explicit content. Hence, an\nautomated pipeline to scan through video data for explicit content has become\nthe need of the hour. We propose a novel pipeline that uses multi-modal deep\nlearning to first extract the explicit segments of input videos and then\nsummarize their content using text to determine its age appropriateness and age\nrating. We also evaluate our pipeline's effectiveness in the end using standard\nmetrics.\n","authors":["Shaunak Joshi","Raghav Gaggar"],"pdf_url":"https://arxiv.org/pdf/2311.10899v2.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2304.07927v2","updated":"2023-11-21T02:15:33Z","published":"2023-04-17T00:38:01Z","title":"A Randomized Approach for Tight Privacy Accounting","summary":" Bounding privacy leakage over compositions, i.e., privacy accounting, is a\nkey challenge in differential privacy (DP). The privacy parameter ($\\eps$ or\n$\\delta$) is often easy to estimate but hard to bound. In this paper, we\npropose a new differential privacy paradigm called estimate-verify-release\n(EVR), which addresses the challenges of providing a strict upper bound for\nprivacy parameter in DP compositions by converting an estimate of privacy\nparameter into a formal guarantee. The EVR paradigm first estimates the privacy\nparameter of a mechanism, then verifies whether it meets this guarantee, and\nfinally releases the query output based on the verification result. The core\ncomponent of the EVR is privacy verification. We develop a randomized privacy\nverifier using Monte Carlo (MC) technique. Furthermore, we propose an MC-based\nDP accountant that outperforms existing DP accounting techniques in terms of\naccuracy and efficiency. 
Our empirical evaluation shows the newly proposed EVR\nparadigm improves the utility-privacy tradeoff for privacy-preserving machine\nlearning.\n","authors":["Jiachen T. Wang","Saeed Mahloujifar","Tong Wu","Ruoxi Jia","Prateek Mittal"],"pdf_url":"https://arxiv.org/pdf/2304.07927v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.13059v1","updated":"2023-11-21T23:46:44Z","published":"2023-11-21T23:46:44Z","title":"A note on estimating the dimension from a random geometric graph","summary":" Let $G_n$ be a random geometric graph with vertex set $[n]$ based on $n$\ni.i.d.\\ random vectors $X_1,\\ldots,X_n$ drawn from an unknown density $f$ on\n$\\R^d$. An edge $(i,j)$ is present when $\\|X_i -X_j\\| \\le r_n$, for a given\nthreshold $r_n$ possibly depending upon $n$, where $\\| \\cdot \\|$ denotes\nEuclidean distance. We study the problem of estimating the dimension $d$ of the\nunderlying space when we have access to the adjacency matrix of the graph but\ndo not know $r_n$ or the vectors $X_i$. The main result of the paper is that\nthere exists an estimator of $d$ that converges to $d$ in probability as $n \\to\n\\infty$ for all densities with $\\int f^5 < \\infty$ whenever $n^{3/2} r_n^d \\to\n\\infty$ and $r_n = o(1)$. The conditions allow very sparse graphs since when\n$n^{3/2} r_n^d \\to 0$, the graph contains isolated edges only, with high\nprobability. We also show that, without any condition on the density, a\nconsistent estimator of $d$ exists when $n r_n^d \\to \\infty$ and $r_n = o(1)$.\n","authors":["Caelan Atamanchuk","Luc Devroye","Gabor Lugosi"],"pdf_url":"https://arxiv.org/pdf/2311.13059v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17301v2","updated":"2023-11-21T23:42:55Z","published":"2023-06-29T20:58:48Z","title":"Why Shallow Networks Struggle with Approximating and Learning High\n Frequency: A Numerical Study","summary":" In this work, a comprehensive numerical study involving analysis and\nexperiments shows why a two-layer neural network has difficulties handling high\nfrequencies in approximation and learning when machine precision and\ncomputation cost are important factors in real practice. In particular, the\nfollowing basic computational issues are investigated: (1) the minimal\nnumerical error one can achieve given a finite machine precision, (2) the\ncomputation cost to achieve a given accuracy, and (3) stability with respect to\nperturbations. The key to the study is the conditioning of the representation\nand its learning dynamics. Explicit answers to the above questions with\nnumerical verifications are presented.\n","authors":["Shijun Zhang","Hongkai Zhao","Yimin Zhong","Haomin Zhou"],"pdf_url":"https://arxiv.org/pdf/2306.17301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13052v1","updated":"2023-11-21T23:25:04Z","published":"2023-11-21T23:25:04Z","title":"Novel OCT mosaicking pipeline with Feature- and Pixel-based registration","summary":" High-resolution Optical Coherence Tomography (OCT) images are crucial for\nophthalmology studies but are limited by their relatively narrow field of view\n(FoV). Image mosaicking is a technique for aligning multiple overlapping images\nto obtain a larger FoV. Current mosaicking pipelines often struggle with\nsubstantial noise and considerable displacement between the input sub-fields.\nIn this paper, we propose a versatile pipeline for stitching multi-view\nOCT/OCTA \\textit{en face} projection images. 
Our method combines the strengths\nof learning-based feature matching and robust pixel-based registration to align\nmultiple images effectively. Furthermore, we advance the application of a\ntrained foundational model, Segment Anything Model (SAM), to validate\nmosaicking results in an unsupervised manner. The efficacy of our pipeline is\nvalidated using an in-house dataset and a large public dataset, where our\nmethod shows superior performance in terms of both accuracy and computational\nefficiency. We also made our evaluation tool for image mosaicking and the\ncorresponding pipeline publicly available at\n\\url{https://github.com/MedICL-VU/OCT-mosaicking}.\n","authors":["Jiacheng Wang","Hao Li","Dewei Hu","Yuankai K. Tao","Ipek Oguz"],"pdf_url":"https://arxiv.org/pdf/2311.13052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16020v2","updated":"2023-11-21T23:23:08Z","published":"2023-09-27T20:54:56Z","title":"GeoCLIP: Clip-Inspired Alignment between Locations and Images for\n Effective Worldwide Geo-localization","summary":" Worldwide Geo-localization aims to pinpoint the precise location of images\ntaken anywhere on Earth. This task has considerable challenges due to immense\nvariation in geographic landscapes. The image-to-image retrieval-based\napproaches fail to solve this problem on a global scale as it is not feasible\nto construct a large gallery of images covering the entire world. Instead,\nexisting approaches divide the globe into discrete geographic cells,\ntransforming the problem into a classification task. However, their performance\nis limited by the predefined classes and often results in inaccurate\nlocalizations when an image's location significantly deviates from its class\ncenter. To overcome these limitations, we propose GeoCLIP, a novel\nCLIP-inspired Image-to-GPS retrieval approach that enforces alignment between\nthe image and its corresponding GPS locations. GeoCLIP's location encoder\nmodels the Earth as a continuous function by employing positional encoding\nthrough random Fourier features and constructing a hierarchical representation\nthat captures information at varying resolutions to yield a semantically rich\nhigh-dimensional feature suitable to use even beyond geo-localization. To the\nbest of our knowledge, this is the first work employing GPS encoding for\ngeo-localization. We demonstrate the efficacy of our method via extensive\nexperiments and ablations on benchmark datasets. We achieve competitive\nperformance with just 20% of training data, highlighting its effectiveness even\nin limited-data settings. Furthermore, we qualitatively demonstrate\ngeo-localization using a text query by leveraging CLIP backbone of our image\nencoder. 
The project webpage is available at:\nhttps://vicentevivan.github.io/GeoCLIP\n","authors":["Vicente Vivanco Cepeda","Gaurav Kumar Nayak","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2309.16020v2.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.13050v1","updated":"2023-11-21T23:22:11Z","published":"2023-11-21T23:22:11Z","title":"Multi-fidelity Bayesian Optimization in Engineering Design","summary":" Residing at the intersection of multi-fidelity optimization (MFO) and Bayesian\noptimization (BO), MF BO has found a niche in solving expensive engineering\ndesign optimization problems, thanks to its advantages in incorporating\nphysical and mathematical understandings of the problems, saving resources,\naddressing the exploitation-exploration trade-off, considering uncertainty, and\nsupporting parallel computing. The increasing number of works dedicated to MF\nBO suggests the need for a comprehensive review of this advanced optimization\ntechnique. In this paper, we survey recent developments of two essential\ningredients of MF BO: Gaussian process (GP) based MF surrogates and acquisition\nfunctions. We first categorize the existing MF modeling methods and MFO\nstrategies to locate MF BO in a large family of surrogate-based optimization\nand MFO algorithms. We then exploit the common properties shared between the\nmethods from each ingredient of MF BO to describe important GP-based MF\nsurrogate models and review various acquisition functions. By doing so, we\nexpect to provide a structured understanding of MF BO. Finally, we attempt to\nreveal important aspects that require further research for applications of MF\nBO in solving intricate yet important design optimization problems, including\nconstrained optimization, high-dimensional optimization, optimization under\nuncertainty, and multi-objective optimization.\n","authors":["Bach Do","Ruda Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.13050v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13046v1","updated":"2023-11-21T23:14:47Z","published":"2023-11-21T23:14:47Z","title":"Do we listen to what we are told? An empirical study on human behaviour\n during the COVID-19 pandemic: neural networks vs. regression analysis","summary":" In this work, we contribute the first visual open-source empirical study on\nhuman behaviour during the COVID-19 pandemic, in order to investigate how\ncompliant a general population is with mask-wearing-related public-health policy.\nObject-detection-based convolutional neural networks, regression analysis and\nmultilayer perceptrons are combined to analyse visual data of the Viennese\npublic during 2020. We find that mask-wearing-related government regulations\nand public-transport announcements encouraged correct mask-wearing-behaviours\nduring the COVID-19 pandemic. Importantly, changes in announcement and\nregulation contents led to heterogeneous effects on people's behaviour.\nComparing the predictive power of regression analysis and neural networks, we\ndemonstrate that the latter produces more accurate predictions of population\nreactions during the COVID-19 pandemic. Our use of regression modelling also\nallows us to unearth possible causal pathways underlying societal behaviour.\nSince our findings highlight the importance of appropriate communication\ncontents, our results will facilitate more effective non-pharmaceutical\ninterventions to be developed in future. 
Adding to the literature, we\ndemonstrate that regression modelling and neural networks are not mutually\nexclusive but instead complement each other.\n","authors":["Yuxi Heluo","Kexin Wang","Charles W. Robson"],"pdf_url":"https://arxiv.org/pdf/2311.13046v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.13933v2","updated":"2023-11-21T23:06:51Z","published":"2022-08-30T00:08:37Z","title":"Using Taylor-Approximated Gradients to Improve the Frank-Wolfe Method\n for Empirical Risk Minimization","summary":" The Frank-Wolfe method has become increasingly useful in statistical and\nmachine learning applications, due to the structure-inducing properties of the\niterates, and especially in settings where linear minimization over the\nfeasible set is more computationally efficient than projection. In the setting\nof Empirical Risk Minimization -- one of the fundamental optimization problems\nin statistical and machine learning -- the computational effectiveness of\nFrank-Wolfe methods typically grows linearly in the number of data observations\n$n$. This is in stark contrast to the case for typical stochastic projection\nmethods. In order to reduce this dependence on $n$, we look to second-order\nsmoothness of typical smooth loss functions (least squares loss and logistic\nloss, for example) and we propose amending the Frank-Wolfe method with Taylor\nseries-approximated gradients, including variants for both deterministic and\nstochastic settings. Compared with current state-of-the-art methods in the\nregime where the optimality tolerance $\\varepsilon$ is sufficiently small, our\nmethods are able to simultaneously reduce the dependence on large $n$ while\nobtaining optimal convergence rates of Frank-Wolfe methods, in both the convex\nand non-convex settings. We also propose a novel adaptive step-size approach\nfor which we have computational guarantees. Last of all, we present\ncomputational experiments which show that our methods exhibit very significant\nspeed-ups over existing methods on real-world datasets for both convex and\nnon-convex binary classification problems.\n","authors":["Zikai Xiong","Robert M. Freund"],"pdf_url":"https://arxiv.org/pdf/2208.13933v2.pdf","comment":"30 pages, 2 figures"},{"id":"http://arxiv.org/abs/2311.10242v2","updated":"2023-11-21T23:01:29Z","published":"2023-11-17T00:08:19Z","title":"Advancements in Generative AI: A Comprehensive Review of GANs, GPT,\n Autoencoders, Diffusion Model, and Transformers","summary":" The launch of ChatGPT has garnered global attention, marking a significant\nmilestone in the field of Generative Artificial Intelligence. While Generative\nAI has been in effect for the past decade, the introduction of ChatGPT has\nignited a new wave of research and innovation in the AI domain. This surge in\ninterest has led to the development and release of numerous cutting-edge tools,\nsuch as Bard, Stable Diffusion, DALL-E, Make-A-Video, Runway ML, and Jukebox,\namong others. These tools exhibit remarkable capabilities, encompassing tasks\nranging from text generation and music composition, image creation, video\nproduction, code generation, and even scientific work. They are built upon\nvarious state-of-the-art models, including Stable Diffusion, transformer models\nlike GPT-3 (recent GPT-4), variational autoencoders, and generative adversarial\nnetworks. This advancement in Generative AI presents a wealth of exciting\nopportunities and, simultaneously, unprecedented challenges. 
Throughout this\npaper, we have explored these state-of-the-art models, the diverse array of\ntasks they can accomplish, the challenges they pose, and the promising future\nof Generative Artificial Intelligence.\n","authors":["Staphord Bengesi","Hoda El-Sayed","Md Kamruzzaman Sarker","Yao Houkpati","John Irungu","Timothy Oladunni"],"pdf_url":"https://arxiv.org/pdf/2311.10242v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13038v1","updated":"2023-11-21T22:56:13Z","published":"2023-11-21T22:56:13Z","title":"Synaptic Sampling of Neural Networks","summary":" Probabilistic artificial neural networks offer intriguing prospects for\nenabling the uncertainty of artificial intelligence methods to be described\nexplicitly in their function; however, the development of techniques that\nquantify uncertainty by well-understood methods such as Monte Carlo sampling\nhas been limited by the high costs of stochastic sampling on deterministic\ncomputing hardware. Emerging computing systems that are amenable to\nhardware-level probabilistic computing, such as those that leverage stochastic\ndevices, may make probabilistic neural networks more feasible in the\nnot-too-distant future. This paper describes the scANN technique --\n\\textit{sampling (by coinflips) artificial neural networks} -- which enables\nneural networks to be sampled directly by treating the weights as Bernoulli\ncoin flips. This method is natively well suited for probabilistic computing\ntechniques that focus on tunable stochastic devices, nearly matches fully\ndeterministic performance while also describing the uncertainty of correct and\nincorrect neural network outputs.\n","authors":["James B. Aimone","William Severa","J. Darby Smith"],"pdf_url":"https://arxiv.org/pdf/2311.13038v1.pdf","comment":"9 pages, accepted to 2023 IEEE International Conference on Rebooting\n Computing"},{"id":"http://arxiv.org/abs/2311.13036v1","updated":"2023-11-21T22:53:20Z","published":"2023-11-21T22:53:20Z","title":"Favour: FAst Variance Operator for Uncertainty Rating","summary":" Bayesian Neural Networks (BNN) have emerged as a crucial approach for\ninterpreting ML predictions. By sampling from the posterior distribution, data\nscientists may estimate the uncertainty of an inference. Unfortunately many\ninference samples are often needed, the overhead of which greatly hinder BNN's\nwide adoption. To mitigate this, previous work proposed propagating the first\nand second moments of the posterior directly through the network. However, on\nits own this method is even slower than sampling, so the propagated variance\nneeds to be approximated such as assuming independence between neural nodes.\nThe resulting trade-off between quality and inference time did not match even\nplain Monte Carlo sampling.\n Our contribution is a more principled variance propagation framework based on\n\"spiked covariance matrices\", which smoothly interpolates between quality and\ninference time. This is made possible by a new fast algorithm for updating a\ndiagonal-plus-low-rank matrix approximation under various operations. We tested\nour algorithm against sampling based MC Dropout and Variational Inference on a\nnumber of downstream uncertainty themed tasks, such as calibration and\nout-of-distribution testing. 
We find that Favour is as fast as performing 2-3\ninference samples, while matching the performance of 10-100 samples.\n In summary, this work enables the use of BNN in the realm of performance\ncritical tasks where they have previously been out of reach.\n","authors":["Thomas D. Ahle","Sahar Karimi","Peter Tak Peter Tang"],"pdf_url":"https://arxiv.org/pdf/2311.13036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08549v3","updated":"2023-11-21T22:49:27Z","published":"2023-09-15T17:12:19Z","title":"HINT: Healthy Influential-Noise based Training to Defend against Data\n Poisoning Attacks","summary":" While numerous defense methods have been proposed to prohibit potential\npoisoning attacks from untrusted data sources, most research works only defend\nagainst specific attacks, which leaves many avenues for an adversary to\nexploit. In this work, we propose an efficient and robust training approach to\ndefend against data poisoning attacks based on influence functions, named\nHealthy Influential-Noise based Training. Using influence functions, we craft\nhealthy noise that helps to harden the classification model against poisoning\nattacks without significantly affecting the generalization ability on test\ndata. In addition, our method can perform effectively when only a subset of the\ntraining data is modified, instead of the current method of adding noise to all\nexamples that has been used in several previous works. We conduct comprehensive\nevaluations over two image datasets with state-of-the-art poisoning attacks\nunder different realistic attack scenarios. Our empirical results show that\nHINT can efficiently protect deep learning models against the effect of both\nuntargeted and targeted poisoning attacks.\n","authors":["Minh-Hao Van","Alycia N. Carey","Xintao Wu"],"pdf_url":"https://arxiv.org/pdf/2309.08549v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11280v2","updated":"2023-11-21T22:35:18Z","published":"2023-07-21T00:49:07Z","title":"Epsilon*: Privacy Metric for Machine Learning Models","summary":" We introduce Epsilon*, a new privacy metric for measuring the privacy risk of\na single model instance prior to, during, or after deployment of privacy\nmitigation strategies. The metric requires only black-box access to model\npredictions, does not require training data re-sampling or model re-training,\nand can be used to measure the privacy risk of models not trained with\ndifferential privacy. Epsilon* is a function of true positive and false\npositive rates in a hypothesis test used by an adversary in a membership\ninference attack. We distinguish between quantifying the privacy loss of a\ntrained model instance, which we refer to as empirical privacy, and quantifying\nthe privacy loss of the training mechanism which produces this model instance.\nExisting approaches in the privacy auditing literature provide lower bounds for\nthe latter, while our metric provides an empirical lower bound for the former\nby relying on an (${\\epsilon}$, ${\\delta}$)-type of quantification of the\nprivacy of the trained model instance. We establish a relationship between\nthese lower bounds and show how to implement Epsilon* to avoid numerical and\nnoise amplification instability. We further show in experiments on benchmark\npublic data sets that Epsilon* is sensitive to privacy risk mitigation by\ntraining with differential privacy (DP), where the value of Epsilon* is reduced\nby up to 800% compared to the Epsilon* values of non-DP trained baseline\nmodels. 
This metric allows privacy auditors to be independent of model owners,\nand enables visualizing the privacy-utility landscape to make informed\ndecisions regarding the trade-offs between model privacy and utility.\n","authors":["Diana M. Negoescu","Humberto Gonzalez","Saad Eddin Al Orjany","Jilei Yang","Yuliia Lut","Rahul Tandra","Xiaowen Zhang","Xinyi Zheng","Zach Douglas","Vidita Nolkha","Parvez Ahammad","Gennady Samorodnitsky"],"pdf_url":"https://arxiv.org/pdf/2307.11280v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13028v1","updated":"2023-11-21T22:29:25Z","published":"2023-11-21T22:29:25Z","title":"DMLR: Data-centric Machine Learning Research -- Past, Present and Future","summary":" Drawing from discussions at the inaugural DMLR workshop at ICML 2023 and\nmeetings prior, in this report we outline the relevance of community engagement\nand infrastructure development for the creation of next-generation public\ndatasets that will advance machine learning science. We chart a path forward as\na collective effort to sustain the creation and maintenance of these datasets\nand methods towards positive scientific, societal and business impact.\n","authors":["Luis Oala","Manil Maskey","Lilith Bat-Leah","Alicia Parrish","Nezihe Merve Gürel","Tzu-Sheng Kuo","Yang Liu","Rotem Dror","Danilo Brajovic","Xiaozhe Yao","Max Bartolo","William A Gaviria Rojas","Ryan Hileman","Rainier Aliment","Michael W. Mahoney","Meg Risdal","Matthew Lease","Wojciech Samek","Debojyoti Dutta","Curtis G Northcutt","Cody Coleman","Braden Hancock","Bernard Koch","Girmaw Abebe Tadesse","Bojan Karlaš","Ahmed Alaa","Adji Bousso Dieng","Natasha Noy","Vijay Janapa Reddi","James Zou","Praveen Paritosh","Mihaela van der Schaar","Kurt Bollacker","Lora Aroyo","Ce Zhang","Joaquin Vanschoren","Isabelle Guyon","Peter Mattson"],"pdf_url":"https://arxiv.org/pdf/2311.13028v1.pdf","comment":"This editorial report accompanies the inaugural Data-centric Machine\n Learning Research (DMLR) Workshop that took place at ICML 2023\n https://dmlr.ai/"},{"id":"http://arxiv.org/abs/2310.15290v2","updated":"2023-11-21T22:19:09Z","published":"2023-10-23T18:56:01Z","title":"Reliable Generation of EHR Time Series via Diffusion Models","summary":" Electronic Health Records (EHRs) are rich sources of patient-level data,\nincluding laboratory tests, medications, and diagnoses, offering valuable\nresources for medical data analysis. However, concerns about privacy often\nrestrict access to EHRs, hindering downstream analysis. Researchers have\nexplored various methods for generating privacy-preserving EHR data. In this\nstudy, we introduce a new method for generating diverse and realistic synthetic\nEHR time series data using Denoising Diffusion Probabilistic Models (DDPM). We\nconducted experiments on six datasets, comparing our proposed method with eight\nexisting methods. Our results demonstrate that our approach significantly\noutperforms all existing methods in terms of data utility while requiring less\ntraining effort. Our approach also enhances downstream medical data analysis by\nproviding diverse and realistic synthetic EHR data.\n","authors":["Muhang Tian","Bernie Chen","Allan Guo","Shiyi Jiang","Anru R. 
Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.15290v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01897v2","updated":"2023-11-21T22:08:41Z","published":"2023-09-05T02:15:08Z","title":"Inferring Actual Treatment Pathways from Patient Records","summary":" Treatment pathways are step-by-step plans outlining the recommended medical\ncare for specific diseases; they get revised when different treatments are\nfound to improve patient outcomes. Examining health records is an important\npart of this revision process, but inferring patients' actual treatments from\nhealth data is challenging due to complex event-coding schemes and the absence\nof pathway-related annotations. This study aims to infer the actual treatment\nsteps for a particular patient group from administrative health records (AHR) -\na common form of tabular healthcare data - and address several technique- and\nmethodology-based gaps in treatment pathway-inference research. We introduce\nDefrag, a method for examining AHRs to infer the real-world treatment steps for\na particular patient group. Defrag learns the semantic and temporal meaning of\nhealthcare event sequences, allowing it to reliably infer treatment steps from\ncomplex healthcare data. To our knowledge, Defrag is the first\npathway-inference method to utilise a neural network (NN), an approach made\npossible by a novel, self-supervised learning objective. We also developed a\ntesting and validation framework for pathway inference, which we use to\ncharacterise and evaluate Defrag's pathway inference ability and compare\nagainst baselines. We demonstrate Defrag's effectiveness by identifying\nbest-practice pathway fragments for breast cancer, lung cancer, and melanoma in\npublic healthcare records. Additionally, we use synthetic data experiments to\ndemonstrate the characteristics of the Defrag method, and to compare Defrag to\nseveral baselines where it significantly outperforms non-NN-based methods.\nDefrag significantly outperforms several existing pathway-inference methods and\noffers an innovative and effective approach for inferring treatment pathways\nfrom AHRs. Open-source code is provided to encourage further research in this\narea.\n","authors":["Adrian Wilkins-Caruana","Madhushi Bandara","Katarzyna Musial","Daniel Catchpoole","Paul J. Kennedy"],"pdf_url":"https://arxiv.org/pdf/2309.01897v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.03357v2","updated":"2023-11-21T22:05:37Z","published":"2023-07-07T02:40:09Z","title":"Stability and Generalization of Stochastic Compositional Gradient\n Descent Algorithms","summary":" Many machine learning tasks can be formulated as a stochastic compositional\noptimization (SCO) problem such as reinforcement learning, AUC maximization,\nand meta-learning, where the objective function involves a nested composition\nassociated with an expectation. While a significant amount of studies has been\ndevoted to studying the convergence behavior of SCO algorithms, there is little\nwork on understanding their generalization, i.e., how these learning algorithms\nbuilt from training examples would behave on future test examples. In this\npaper, we provide the stability and generalization analysis of stochastic\ncompositional gradient descent algorithms through the lens of algorithmic\nstability in the framework of statistical learning theory. Firstly, we\nintroduce a stability concept called compositional uniform stability and\nestablish its quantitative relation with generalization for SCO problems. 
Then,\nwe establish the compositional uniform stability results for two popular\nstochastic compositional gradient descent algorithms, namely SCGD and SCSC.\nFinally, we derive dimension-independent excess risk bounds for SCGD and SCSC\nby trade-offing their stability results and optimization errors. To the best of\nour knowledge, these are the first-ever-known results on stability and\ngeneralization analysis of stochastic compositional gradient descent\nalgorithms.\n","authors":["Ming Yang","Xiyuan Wei","Tianbao Yang","Yiming Ying"],"pdf_url":"https://arxiv.org/pdf/2307.03357v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13022v1","updated":"2023-11-21T22:05:00Z","published":"2023-11-21T22:05:00Z","title":"Unsupervised Multimodal Surface Registration with Geometric Deep\n Learning","summary":" This paper introduces GeoMorph, a novel geometric deep-learning framework\ndesigned for image registration of cortical surfaces. The registration process\nconsists of two main steps. First, independent feature extraction is performed\non each input surface using graph convolutions, generating low-dimensional\nfeature representations that capture important cortical surface\ncharacteristics. Subsequently, features are registered in a deep-discrete\nmanner to optimize the overlap of common structures across surfaces by learning\ndisplacements of a set of control points. To ensure smooth and biologically\nplausible deformations, we implement regularization through a deep conditional\nrandom field implemented with a recurrent neural network. Experimental results\ndemonstrate that GeoMorph surpasses existing deep-learning methods by achieving\nimproved alignment with smoother deformations. Furthermore, GeoMorph exhibits\ncompetitive performance compared to classical frameworks. Such versatility and\nrobustness suggest strong potential for various neuroscience applications.\n","authors":["Mohamed A. Suliman","Logan Z. J. Williams","Abdulah Fawaz","Emma C. Robinson"],"pdf_url":"https://arxiv.org/pdf/2311.13022v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13015v1","updated":"2023-11-21T21:44:28Z","published":"2023-11-21T21:44:28Z","title":"Fast and Interpretable Mortality Risk Scores for Critical Care Patients","summary":" Prediction of mortality in intensive care unit (ICU) patients is an important\ntask in critical care medicine. Prior work in creating mortality risk models\nfalls into two major categories: domain-expert-created scoring systems, and\nblack box machine learning (ML) models. Both of these have disadvantages: black\nbox models are unacceptable for use in hospitals, whereas manual creation of\nmodels (including hand-tuning of logistic regression parameters) relies on\nhumans to perform high-dimensional constrained optimization, which leads to a\nloss in performance. In this work, we bridge the gap between accurate black box\nmodels and hand-tuned interpretable models. We build on modern interpretable ML\ntechniques to design accurate and interpretable mortality risk scores. We\nleverage the largest existing public ICU monitoring datasets, namely the MIMIC\nIII and eICU datasets. By evaluating risk across medical centers, we are able\nto study generalization across domains. 
In order to customize our risk score\nmodels, we develop a new algorithm, GroupFasterRisk, which has several\nimportant benefits: (1) it uses hard sparsity constraint, allowing users to\ndirectly control the number of features; (2) it incorporates group sparsity to\nallow more cohesive models; (3) it allows for monotonicity correction on models\nfor including domain knowledge; (4) it produces many equally-good models at\nonce, which allows domain experts to choose among them. GroupFasterRisk creates\nits risk scores within hours, even on the large datasets we study here.\nGroupFasterRisk's risk scores perform better than risk scores currently used in\nhospitals, and have similar prediction performance to black box ML models\n(despite being much sparser). Because GroupFasterRisk produces a variety of\nrisk scores and handles constraints, it allows design flexibility, which is the\nkey enabler of practical and trustworthy model creation.\n","authors":["Chloe Qinyu Zhu","Muhang Tian","Lesia Semenova","Jiachang Liu","Jack Xu","Joseph Scarpa","Cynthia Rudin"],"pdf_url":"https://arxiv.org/pdf/2311.13015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10747v2","updated":"2023-11-21T21:40:21Z","published":"2023-10-31T18:21:24Z","title":"Safety-aware Causal Representation for Trustworthy Reinforcement\n Learning in Autonomous Driving","summary":" In the domain of autonomous driving, the Learning from Demonstration (LfD)\nparadigm has exhibited notable efficacy in addressing sequential\ndecision-making problems. However, consistently achieving safety in varying\ntraffic contexts, especially in safety-critical scenarios, poses a significant\nchallenge due to the long-tailed and unforeseen scenarios absent from offline\ndatasets. In this paper, we introduce the saFety-aware strUctured Scenario\nrepresentatION (FUSION), a pioneering methodology conceived to facilitate the\nlearning of an adaptive end-to-end driving policy by leveraging structured\nscenario information. FUSION capitalizes on the causal relationships between\ndecomposed reward, cost, state, and action space, constructing a framework for\nstructured sequential reasoning under dynamic traffic environments. We conduct\nrigorous evaluations in two typical real-world settings of distribution shift\nin autonomous vehicles, demonstrating the good balance between safety cost and\nutility reward of FUSION compared to contemporary state-of-the-art safety-aware\nLfD baselines. Empirical evidence under diverse driving scenarios attests that\nFUSION significantly enhances the safety and generalizability of autonomous\ndriving agents, even in the face of challenging and unseen environments.\nFurthermore, our ablation studies reveal noticeable improvements in the\nintegration of causal representation into the safe offline RL problem.\n","authors":["Haohong Lin","Wenhao Ding","Zuxin Liu","Yaru Niu","Jiacheng Zhu","Yuming Niu","Ding Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.10747v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2311.12751v1","updated":"2023-11-21T17:52:30Z","published":"2023-11-21T17:52:30Z","title":"Towards Natural Language-Guided Drones: GeoText-1652 Benchmark with\n Spatially Relation Matching","summary":" Drone navigation through natural language commands remains a significant\nchallenge due to the lack of publicly available multi-modal datasets and the\nintricate demands of fine-grained visual-text alignment. 
In response to this\npressing need, we present a new human-computer interaction annotation benchmark\ncalled GeoText-1652, meticulously curated through a robust Large Language Model\n(LLM)-based data generation framework and the expertise of pre-trained vision\nmodels. This new dataset seamlessly extends the existing image dataset, \\ie,\nUniversity-1652, with spatial-aware text annotations, encompassing intricate\nimage-text-bounding box associations. Besides, we introduce a new optimization\nobjective to leverage fine-grained spatial associations, called blending\nspatial matching, for region-level spatial relation matching. Extensive\nexperiments reveal that our approach maintains an exceptional recall rate under\nvarying description complexities. This underscores the promising potential of\nour approach in elevating drone control and navigation through the seamless\nintegration of natural language commands in real-world scenarios.\n","authors":["Meng Chu","Zhedong Zheng","Wei Ji","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2311.12751v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.06255v2","updated":"2023-11-21T11:11:57Z","published":"2023-09-12T14:16:34Z","title":"Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation","summary":" One primary topic of multi-modal learning is to jointly incorporate\nheterogeneous information from different modalities. However, most models often\nsuffer from unsatisfactory multi-modal cooperation, which could not jointly\nutilize all modalities well. Some methods are proposed to identify and enhance\nthe worse learnt modality, but are often hard to provide the fine-grained\nobservation of multi-modal cooperation at sample-level with theoretical\nsupport. Hence, it is essential to reasonably observe and improve the\nfine-grained cooperation between modalities, especially when facing realistic\nscenarios where the modality discrepancy could vary across different samples.\nTo this end, we introduce a fine-grained modality valuation metric to evaluate\nthe contribution of each modality at sample-level. Via modality valuation, we\nregretfully observe that the multi-modal model tends to rely on one specific\nmodality, resulting in other modalities being low-contributing. We further\nanalyze this issue and improve cooperation between modalities by enhancing the\ndiscriminative ability of low-contributing modalities in a targeted manner.\nOverall, our methods reasonably observe the fine-grained uni-modal contribution\nat sample-level and achieve considerable improvement on different multi-modal\nmodels.\n","authors":["Yake Wei","Ruoxuan Feng","Zihe Wang","Di Hu"],"pdf_url":"https://arxiv.org/pdf/2309.06255v2.pdf","comment":"7 pages"},{"id":"http://arxiv.org/abs/2311.12454v1","updated":"2023-11-21T09:07:11Z","published":"2023-11-21T09:07:11Z","title":"HierSpeech++: Bridging the Gap between Semantic and Acoustic\n Representation of Speech by Hierarchical Variational Inference for Zero-shot\n Speech Synthesis","summary":" Large language models (LLM)-based speech synthesis has been widely adopted in\nzero-shot speech synthesis. However, they require a large-scale data and\npossess the same limitations as previous autoregressive speech models,\nincluding slow inference speed and lack of robustness. This paper proposes\nHierSpeech++, a fast and strong zero-shot speech synthesizer for text-to-speech\n(TTS) and voice conversion (VC). 
We verified that hierarchical speech synthesis\nframeworks could significantly improve the robustness and expressiveness of the\nsynthetic speech. Furthermore, we significantly improve the naturalness and\nspeaker similarity of synthetic speech even in zero-shot speech synthesis\nscenarios. For text-to-speech, we adopt the text-to-vec framework, which\ngenerates a self-supervised speech representation and an F0 representation\nbased on text representations and prosody prompts. Then, HierSpeech++ generates\nspeech from the generated vector, F0, and voice prompt. We further introduce a\nhighly efficient speech super-resolution framework from 16 kHz to 48 kHz. The\nexperimental results demonstrated that the hierarchical variational autoencoder\ncould be a strong zero-shot speech synthesizer given that it outperforms\nLLM-based and diffusion-based models. Moreover, we achieved the first\nhuman-level quality zero-shot speech synthesis. Audio samples and source code\nare available at https://github.com/sh-lee-prml/HierSpeechpp.\n","authors":["Sang-Hoon Lee","Ha-Yeong Choi","Seung-Bin Kim","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2311.12454v1.pdf","comment":"16 pages, 9 figures, 12 tables"},{"id":"http://arxiv.org/abs/2311.12401v1","updated":"2023-11-21T07:28:51Z","published":"2023-11-21T07:28:51Z","title":"CASR: Refining Action Segmentation via Marginalizing Frame-level Causal\n Relationships","summary":" Integrating deep learning and causal discovery has increased the\ninterpretability of Temporal Action Segmentation (TAS) tasks. However,\nframe-level causal relationships contain many complicated noises beyond the\nsegment level, making it infeasible to directly express macro action semantics.\nThus, we propose \\textit{\\textbf{Causal Abstraction Segmentation Refiner\n(CASR)}}, which can refine TAS results from various models by enhancing video\ncausality through marginalizing frame-level causal relationships. Specifically,\nwe define equivalent frame-level and segment-level causal models, so that the\ncausal adjacency matrix constructed from marginalized frame-level causal\nrelationships is able to represent the segment-level causal relationships. CASR\nworks by reducing the difference between the causal adjacency matrix we\nconstruct and that of the pre-segmentation results of backbone models. In\naddition, we propose a novel evaluation metric, Causal Edit Distance (CED), to\nevaluate causal interpretability. Extensive experimental results on mainstream\ndatasets indicate that CASR significantly surpasses various existing methods in\naction segmentation performance, as well as in causal explainability and\ngeneralization. Our code will be available soon.\n","authors":["Keqing Du","Xinyu Yang","Hang Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12257v1","updated":"2023-11-21T00:37:47Z","published":"2023-11-21T00:37:47Z","title":"Equipping Pretrained Unconditional Music Transformers with Instrument\n and Genre Controls","summary":" The ''pretraining-and-finetuning'' paradigm has become a norm for training\ndomain-specific models in natural language processing and computer vision. In\nthis work, we aim to examine this paradigm for symbolic music generation\nthrough leveraging the largest ever symbolic music dataset sourced from the\nMuseScore forum. We first pretrain a large unconditional transformer model\nusing 1.5 million songs. 
We then propose a simple technique to equip this\npretrained unconditional music transformer model with instrument and genre\ncontrols by finetuning the model with additional control tokens. Our proposed\nrepresentation offers improved high-level controllability and expressiveness\nagainst two existing representations. The experimental results show that the\nproposed model can successfully generate music with user-specified instruments\nand genre. In a subjective listening test, the proposed model outperforms the\npretrained baseline model in terms of coherence, harmony, arrangement and\noverall quality.\n","authors":["Weihan Xu","Julian McAuley","Shlomo Dubnov","Hao-Wen Dong"],"pdf_url":"https://arxiv.org/pdf/2311.12257v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10256v2","updated":"2023-11-21T23:52:32Z","published":"2023-11-17T00:56:55Z","title":"Exploring User Perceptions of Virtual Reality Scene Design in Metaverse\n Learning Environments","summary":" Metaverse learning environments allow for a seamless and intuitive transition\nbetween activities compared to Virtual Reality (VR) learning environments, due\nto their interconnected design. The design of VR scenes is important for\ncreating effective learning experiences in the Metaverse. However, there is\nlimited research on the impact of different design elements on user's learning\nexperiences in VR scenes. To address this, a study was conducted with 16\nparticipants who interacted with two VR scenes, each with varying design\nelements such as style, color, texture, object, and background, while watching\na short tutorial. Participant rankings of the scenes for learning were obtained\nusing a seven-point Likert scale, and the Mann-Whitney U test was used to\nvalidate differences in preference between the scenes. The results showed a\nsignificant difference in preference between the scenes. Further analysis using\nthe NASA TLX questionnaire was conducted to examine the impact of this\ndifference on cognitive load, and participant feedback was also considered. The\nstudy emphasizes the importance of careful VR scene design to improve the\nuser's learning experience.\n","authors":["Rahatara Ferdousi","Mohammed Faisal","Fedwa Laamarti","Chunsheng Yang","Abdulmotaleb El Saddik"],"pdf_url":"https://arxiv.org/pdf/2311.10256v2.pdf","comment":"6 pages,3 figures, accepted to present at IEEE 42nd International\n Conference on Consumer Electronics"},{"id":"http://arxiv.org/abs/2311.12894v1","updated":"2023-11-21T08:20:38Z","published":"2023-11-21T08:20:38Z","title":"Attribute-Aware Deep Hashing with Self-Consistency for Large-Scale\n Fine-Grained Image Retrieval","summary":" Our work focuses on tackling large-scale fine-grained image retrieval as\nranking the images depicting the concept of interests (i.e., the same\nsub-category labels) highest based on the fine-grained details in the query. It\nis desirable to alleviate the challenges of both fine-grained nature of small\ninter-class variations with large intra-class variations and explosive growth\nof fine-grained data for such a practical task. In this paper, we propose\nattribute-aware hashing networks with self-consistency for generating\nattribute-aware hash codes to not only make the retrieval process efficient,\nbut also establish explicit correspondences between hash codes and visual\nattributes. 
Specifically, based on the captured visual representations by\nattention, we develop an encoder-decoder structure network of a reconstruction\ntask to unsupervisedly distill high-level attribute-specific vectors from the\nappearance-specific visual representations without attribute annotations. Our\nmodels are also equipped with a feature decorrelation constraint upon these\nattribute vectors to strengthen their representative abilities. Then, driven by\npreserving original entities' similarity, the required hash codes can be\ngenerated from these attribute-specific vectors and thus become\nattribute-aware. Furthermore, to combat simplicity bias in deep hashing, we\nconsider the model design from the perspective of the self-consistency\nprinciple and propose to further enhance models' self-consistency by equipping\nan additional image reconstruction path. Comprehensive quantitative experiments\nunder diverse empirical settings on six fine-grained retrieval datasets and two\ngeneric retrieval datasets show the superiority of our models over competing\nmethods.\n","authors":["Xiu-Shen Wei","Yang Shen","Xuhao Sun","Peng Wang","Yuxin Peng"],"pdf_url":"https://arxiv.org/pdf/2311.12894v1.pdf","comment":"Accepted by IEEE TPAMI"}]},"2023-11-22T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.13581v1","updated":"2023-11-22T18:37:27Z","published":"2023-11-22T18:37:27Z","title":"PaSS: Parallel Speculative Sampling","summary":" Scaling the size of language models to tens of billions of parameters has led\nto impressive performance on a wide range of tasks. At generation, these models\nare used auto-regressively, requiring a forward pass for each generated token,\nand thus reading the full set of parameters from memory. This memory access\nforms the primary bottleneck for generation and it worsens as the model size\nincreases. Moreover, executing a forward pass for multiple tokens in parallel\noften takes nearly the same time as it does for just one token. These two\nobservations lead to the development of speculative sampling, where a second\nsmaller model is used to draft a few tokens, that are then validated or\nrejected using a single forward pass of the large model. Unfortunately, this\nmethod requires two models that share the same tokenizer and thus limits its\nadoption. As an alternative, we propose to use parallel decoding as a way to\ndraft multiple tokens from a single model with no computational cost, nor the\nneed for a second model. Our approach only requires an additional input token\nthat marks the words that will be generated simultaneously. We show promising\nperformance (up to $30\\%$ speed-up) while requiring only as few as $O(d_{emb})$\nadditional parameters.\n","authors":["Giovanni Monea","Armand Joulin","Edouard Grave"],"pdf_url":"https://arxiv.org/pdf/2311.13581v1.pdf","comment":"Accepted at the 3rd workshop on Efficient Natural Language and Speech\n Processing (ENLSP, NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2311.13565v1","updated":"2023-11-22T18:22:56Z","published":"2023-11-22T18:22:56Z","title":"Drilling Down into the Discourse Structure with LLMs for Long Document\n Question Answering","summary":" We address the task of evidence retrieval for long document question\nanswering, which involves locating relevant paragraphs within a document to\nanswer a question. 
We aim to assess the applicability of large language models\n(LLMs) in the task of zero-shot long document evidence retrieval, owing to\ntheir unprecedented performance across various NLP tasks. However, currently\nthe LLMs can consume limited context lengths as input, thus providing document\nchunks as inputs might overlook the global context while missing out on\ncapturing the inter-segment dependencies. Moreover, directly feeding the large\ninput sets can incur significant computational costs, particularly when\nprocessing the entire document (and potentially incurring monetary expenses\nwith enterprise APIs like OpenAI's GPT variants). To address these challenges,\nwe propose a suite of techniques that exploit the discourse structure commonly\nfound in documents. By utilizing this structure, we create a condensed\nrepresentation of the document, enabling a more comprehensive understanding and\nanalysis of relationships between different parts. We retain $99.6\\%$ of the\nbest zero-shot approach's performance, while processing only $26\\%$ of the\ntotal tokens used by the best approach in the information seeking evidence\nretrieval setup. We also show how our approach can be combined with\n\\textit{self-ask} reasoning agent to achieve best zero-shot performance in\ncomplex multi-hop question answering, just $\\approx 4\\%$ short of zero-shot\nperformance using gold evidence.\n","authors":["Inderjeet Nair","Shwetha Somasundaram","Apoorv Saxena","Koustava Goswami"],"pdf_url":"https://arxiv.org/pdf/2311.13565v1.pdf","comment":"Accepted to the Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.13534v1","updated":"2023-11-22T17:14:54Z","published":"2023-11-22T17:14:54Z","title":"LM-Cocktail: Resilient Tuning of Language Models via Model Merging","summary":" The pre-trained language models are continually fine-tuned to better support\ndownstream applications. However, this operation may result in significant\nperformance degeneration on general tasks beyond the targeted domain. To\novercome this problem, we propose a novel method which enables the fine-tuned\nmodel to stay resilient in general perspectives. Our method is conducted in the\nform of model merging (namely LM-Cocktail), where the fine-tuned language model\nis merged with the pre-trained base model or the peer models from other domains\nthrough weighted average. Despite simplicity, LM-Cocktail is surprisingly\neffective: the resulted model is able to achieve a strong empirical performance\nin the whole scope of general tasks while preserving a superior capacity in its\ntargeted domain. We conduct comprehensive experiments with LLama and BGE model\non popular benchmarks, including FLAN, MMLU, MTEB, whose results validate the\nefficacy of our proposed method. The code and checkpoints are available at\nhttps://github.com/FlagOpen/FlagEmbedding.\n","authors":["Shitao Xiao","Zheng Liu","Peitian Zhang","Xingrun Xing"],"pdf_url":"https://arxiv.org/pdf/2311.13534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.12261v4","updated":"2023-11-22T16:59:55Z","published":"2022-07-06T13:56:48Z","title":"GraphCFC: A Directed Graph Based Cross-Modal Feature Complementation\n Approach for Multimodal Conversational Emotion Recognition","summary":" Emotion Recognition in Conversation (ERC) plays a significant part in\nHuman-Computer Interaction (HCI) systems since it can provide empathetic\nservices. 
Multimodal ERC can mitigate the drawbacks of uni-modal approaches.\nRecently, Graph Neural Networks (GNNs) have been widely used in a variety of\nfields due to their superior performance in relation modeling. In multimodal\nERC, GNNs are capable of extracting both long-distance contextual information\nand inter-modal interactive information. Unfortunately, since existing methods\nsuch as MMGCN directly fuse multiple modalities, redundant information may be\ngenerated and diverse information may be lost. In this work, we present a\ndirected Graph based Cross-modal Feature Complementation (GraphCFC) module that\ncan efficiently model contextual and interactive information. GraphCFC\nalleviates the problem of heterogeneity gap in multimodal fusion by utilizing\nmultiple subspace extractors and Pair-wise Cross-modal Complementary (PairCC)\nstrategy. We extract various types of edges from the constructed graph for\nencoding, thus enabling GNNs to extract crucial contextual and interactive\ninformation more accurately when performing message passing. Furthermore, we\ndesign a GNN structure called GAT-MLP, which can provide a new unified network\nframework for multimodal learning. The experimental results on two benchmark\ndatasets show that our GraphCFC outperforms the state-of-the-art (SOTA)\napproaches.\n","authors":["Jiang Li","Xiaoping Wang","Guoqing Lv","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2207.12261v4.pdf","comment":"Accepted by IEEE Transactions on Multimedia (TMM)"},{"id":"http://arxiv.org/abs/2311.13495v1","updated":"2023-11-22T16:12:42Z","published":"2023-11-22T16:12:42Z","title":"Current Topological and Machine Learning Applications for Bias Detection\n in Text","summary":" Institutional bias can impact patient outcomes, educational attainment, and\nlegal system navigation. Written records often reflect bias, and once bias is\nidentified; it is possible to refer individuals for training to reduce bias.\nMany machine learning tools exist to explore text data and create predictive\nmodels that can search written records to identify real-time bias. However, few\nprevious studies investigate large language model embeddings and geometric\nmodels of biased text data to understand geometry's impact on bias modeling\naccuracy. To overcome this issue, this study utilizes the RedditBias database\nto analyze textual biases. Four transformer models, including BERT and RoBERTa\nvariants, were explored. Post-embedding, t-SNE allowed two-dimensional\nvisualization of data. KNN classifiers differentiated bias types, with lower\nk-values proving more effective. Findings suggest BERT, particularly mini BERT,\nexcels in bias classification, while multilingual models lag. The\nrecommendation emphasizes refining monolingual models and exploring\ndomain-specific biases.\n","authors":["Colleen Farrelly","Yashbir Singh","Quincy A. Hathaway","Gunnar Carlsson","Ashok Choudhary","Rahul Paul","Gianfranco Doretto","Yassine Himeur","Shadi Atalls","Wathiq Mansoor"],"pdf_url":"https://arxiv.org/pdf/2311.13495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19680v3","updated":"2023-11-22T16:12:39Z","published":"2023-10-30T16:00:13Z","title":"Integrating Pre-trained Language Model into Neural Machine Translation","summary":" Neural Machine Translation (NMT) has become a significant technology in\nnatural language processing through extensive research and development.\nHowever, the deficiency of high-quality bilingual language pair data still\nposes a major challenge to improving NMT performance. 
Recent studies have been\nexploring the use of contextual information from pre-trained language model\n(PLM) to address this problem. Yet, the issue of incompatibility between PLM\nand NMT model remains unresolved. This study proposes PLM-integrated NMT\n(PiNMT) model to overcome the identified problems. PiNMT model consists of\nthree critical components, PLM Multi Layer Converter, Embedding Fusion, and\nCosine Alignment, each playing a vital role in providing effective PLM\ninformation to NMT. Furthermore, two training strategies, Separate Learning\nRates and Dual Step Training, are also introduced in this paper. By\nimplementing the proposed PiNMT model and training strategy, we achieve\nstate-of-the-art performance on the IWSLT'14 En$\\leftrightarrow$De dataset.\nThis study's outcomes are noteworthy as they demonstrate a novel approach for\nefficiently integrating PLM with NMT to overcome incompatibility and enhance\nperformance.\n","authors":["Soon-Jae Hwang","Chang-Sung Jeong"],"pdf_url":"https://arxiv.org/pdf/2310.19680v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00449v2","updated":"2023-11-22T15:43:23Z","published":"2023-07-02T01:25:47Z","title":"A Dual-Stream Recurrence-Attention Network With Global-Local Awareness\n for Emotion Recognition in Textual Dialog","summary":" In real-world dialog systems, the ability to understand the user's emotions\nand interact anthropomorphically is of great significance. Emotion Recognition\nin Conversation (ERC) is one of the key ways to accomplish this goal and has\nattracted growing attention. How to model the context in a conversation is a\ncentral aspect and a major challenge of ERC tasks. Most existing approaches\nstruggle to adequately incorporate both global and local contextual\ninformation, and their network structures are overly sophisticated. For this\nreason, we propose a simple and effective Dual-stream Recurrence-Attention\nNetwork (DualRAN), which is based on Recurrent Neural Network (RNN) and\nMulti-head ATtention network (MAT). DualRAN eschews the complex components of\ncurrent methods and focuses on combining recurrence-based methods with\nattention-based ones. DualRAN is a dual-stream structure mainly consisting of\nlocal- and global-aware modules, modeling a conversation simultaneously from\ndistinct perspectives. In addition, we develop two single-stream network\nvariants for DualRAN, i.e., SingleRANv1 and SingleRANv2. According to the\nexperimental findings, DualRAN boosts the weighted F1 scores by 1.43% and 0.64%\non the IEMOCAP and MELD datasets, respectively, in comparison to the strongest\nbaseline. On two other datasets (i.e., EmoryNLP and DailyDialog), our method\nalso attains competitive results.\n","authors":["Jiang Li","Xiaoping Wang","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2307.00449v2.pdf","comment":"Accepted by Engineering Applications of Artificial Intelligence\n (EAAI)"},{"id":"http://arxiv.org/abs/2311.13475v1","updated":"2023-11-22T15:42:51Z","published":"2023-11-22T15:42:51Z","title":"Machine Translation to Control Formality Features in the Target Language","summary":" Formality plays a significant role in language communication, especially in\nlow-resource languages such as Hindi, Japanese and Korean. These languages\nutilise formal and informal expressions to convey messages based on social\ncontexts and relationships. 
When a language translation technique is used to\ntranslate from a source language that does not pertain the formality (e.g.\nEnglish) to a target language that does, there is a missing information on\nformality that could be a challenge in producing an accurate outcome. This\nresearch explores how this issue should be resolved when machine learning\nmethods are used to translate from English to languages with formality, using\nHindi as the example data. This was done by training a bilingual model in a\nformality-controlled setting and comparing its performance with a pre-trained\nmultilingual model in a similar setting. Since there are not a lot of training\ndata with ground truth, automated annotation techniques were employed to\nincrease the data size. The primary modeling approach involved leveraging\ntransformer models, which have demonstrated effectiveness in various natural\nlanguage processing tasks. We evaluate the official formality accuracy(ACC) by\ncomparing the predicted masked tokens with the ground truth. This metric\nprovides a quantitative measure of how well the translations align with the\ndesired outputs. Our study showcases a versatile translation strategy that\nconsiders the nuances of formality in the target language, catering to diverse\nlanguage communication needs and scenarios.\n","authors":["Harshita Tyagi","Prashasta Jung","Hyowon Lee"],"pdf_url":"https://arxiv.org/pdf/2311.13475v1.pdf","comment":"9 pages, based on DCU MCM Practicum 2022/2023"},{"id":"http://arxiv.org/abs/2311.13472v1","updated":"2023-11-22T15:40:57Z","published":"2023-11-22T15:40:57Z","title":"Complexity-Guided Curriculum Learning for Text Graphs","summary":" Curriculum learning provides a systematic approach to training. It refines\ntraining progressively, tailors training to task requirements, and improves\ngeneralization through exposure to diverse examples. We present a curriculum\nlearning approach that builds on existing knowledge about text and graph\ncomplexity formalisms for training with text graph data. The core part of our\napproach is a novel data scheduler, which employs \"spaced repetition\" and\ncomplexity formalisms to guide the training process. We demonstrate the\neffectiveness of the proposed approach on several text graph tasks and graph\nneural network architectures. The proposed model gains more and uses less data;\nconsistently prefers text over graph complexity indices throughout training,\nwhile the best curricula derived from text and graph complexity indices are\nequally effective; and it learns transferable curricula across GNN models and\ndatasets. In addition, we find that both node-level (local) and graph-level\n(global) graph complexity indices, as well as shallow and traditional text\ncomplexity indices play a crucial role in effective curriculum learning.\n","authors":["Nidhi Vakil","Hadi Amiri"],"pdf_url":"https://arxiv.org/pdf/2311.13472v1.pdf","comment":"Long Paper Accepted at EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.13455v1","updated":"2023-11-22T15:22:04Z","published":"2023-11-22T15:22:04Z","title":"Generation of Explanations for Logic Reasoning","summary":" This thesis delves into a fortiori arguments in deductive reasoning,\nunderscoring their relevance in various domains such as law, philosophy, and\nartificial intelligence. 
The research is centred on employing GPT-3.5-turbo to\nautomate the analysis of these arguments, with a focus on understanding\nintricate reasoning processes, generating clear and coherent explanations, and\ncreating novel arguments. The methodology encompasses a series of tasks\nincluding detailed reasoning, interpretation, and the augmentation of a\nfortiori arguments. It involves meticulously identifying these arguments in\ndiverse contexts, differentiating comparative elements, and categorizing them\nbased on their logical structure.\n Extensive experiments reveals the challenges encountered by GPT-3.5-turbo in\naccurately detecting and classifying a fortiori arguments. Nevertheless, the\nmodel demonstrates a performance that rivals specialized models, particularly\nin extracting key components and interpreting underlying properties. The\nintegration of external information into the model's processing significantly\nelevates the quality of the generated explanations. Additionally, the model\nexhibits a noteworthy capability in augmenting arguments, thus contributing to\nthe enrichment of the data set.\n Despite facing certain limitations, this thesis makes significant\ncontributions to the fields of artificial intelligence and logical reasoning.\nIt introduces novel methodologies, establishes a rigorous evaluation framework,\nand provides deep insights that set the stage for future advancements in\nautomated logical reasoning. The findings and methodologies presented herein\nnot only underscore the potential of AI in complex reasoning tasks but also\nhighlight areas for future research and development.\n","authors":["Yanyi Pu"],"pdf_url":"https://arxiv.org/pdf/2311.13455v1.pdf","comment":"78 Pages, 16 Figures, Thesis Presentation is available at\n https://drive.google.com/file/d/1wLIBsjfLvO11PjCS6qx4Y9UgRBUfq3wQ/view?usp=sharing"},{"id":"http://arxiv.org/abs/2311.13350v1","updated":"2023-11-22T12:39:28Z","published":"2023-11-22T12:39:28Z","title":"Fact-based Court Judgment Prediction","summary":" This extended abstract extends the research presented in \"ILDC for CJPE:\nIndian Legal Documents Corpus for Court Judgment Prediction and Explanation\"\n\\cite{malik-etal-2021-ildc}, focusing on fact-based judgment prediction within\nthe context of Indian legal documents. We introduce two distinct problem\nvariations: one based solely on facts, and another combining facts with rulings\nfrom lower courts (RLC). Our research aims to enhance early-phase case outcome\nprediction, offering significant benefits to legal professionals and the\ngeneral public. The results, however, indicated a performance decline compared\nto the original ILDC for CJPE study, even after implementing various weightage\nschemes in our DELSumm algorithm. 
Additionally, using only facts for legal\njudgment prediction with different transformer models yielded results inferior\nto the state-of-the-art outcomes reported in the \"ILDC for CJPE\" study.\n","authors":["Shubham Kumar Nigam","Aniket Deroy"],"pdf_url":"https://arxiv.org/pdf/2311.13350v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13314v1","updated":"2023-11-22T11:08:38Z","published":"2023-11-22T11:08:38Z","title":"Mitigating Large Language Model Hallucinations via Autonomous Knowledge\n Graph-based Retrofitting","summary":" Incorporating factual knowledge in knowledge graph is regarded as a promising\napproach for mitigating the hallucination of large language models (LLMs).\nExisting methods usually only use the user's input to query the knowledge\ngraph, thus failing to address the factual hallucination generated by LLMs\nduring its reasoning process. To address this problem, this paper proposes\nKnowledge Graph-based Retrofitting (KGR), a new framework that incorporates\nLLMs with KGs to mitigate factual hallucination during the reasoning process by\nretrofitting the initial draft responses of LLMs based on the factual knowledge\nstored in KGs. Specifically, KGR leverages LLMs to extract, select, validate,\nand retrofit factual statements within the model-generated responses, which\nenables an autonomous knowledge verifying and refining procedure without any\nadditional manual efforts. Experiments show that KGR can significantly improve\nthe performance of LLMs on factual QA benchmarks especially when involving\ncomplex reasoning processes, which demonstrates the necessity and effectiveness\nof KGR in mitigating hallucination and enhancing the reliability of LLMs.\n","authors":["Xinyan Guan","Yanjiang Liu","Hongyu Lin","Yaojie Lu","Ben He","Xianpei Han","Le Sun"],"pdf_url":"https://arxiv.org/pdf/2311.13314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13307v1","updated":"2023-11-22T10:55:36Z","published":"2023-11-22T10:55:36Z","title":"Rethinking Radiology Report Generation via Causal Reasoning and\n Counterfactual Augmentation","summary":" Radiology Report Generation (RRG) draws attention as an interaction between\nvision and language fields. Previous works inherited the ideology of\nvision-to-language generation tasks,aiming to generate paragraphs with high\nconsistency as reports. However, one unique characteristic of RRG, the\nindependence between diseases, was neglected, leading to the injection of the\nspurious confounder, i.e., the disease co-occurrence. Unfortunately, this\nconfounder confuses the process of report generation worse because of the\nbiased RRG data distribution. In this paper, to rethink this issue thoroughly,\nwe reason about its causes and effects from a novel perspective of statistics\nand causality, where the Joint Vision Coupling and the Conditional Sentence\nCoherence Coupling are two aspects prone to implicitly decrease the accuracy of\nreports. 
Then, a counterfactual augmentation strategy that contains the\nCounterfactual Sample Synthesis and the Counterfactual Report Reconstruction\nsub-methods is proposed to break these two aspects of spurious effects.\nExperimental results and further analyses on two widely used datasets justify\nour reasoning and proposed methods.\n","authors":["Xiao Song","Jiafan Liu","Yun Li","Wenbin Lei","Ruxin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13307v1.pdf","comment":"10 pages,5 figures"},{"id":"http://arxiv.org/abs/2305.14264v2","updated":"2023-11-22T10:22:18Z","published":"2023-05-23T17:16:04Z","title":"Active Learning Principles for In-Context Learning with Large Language\n Models","summary":" The remarkable advancements in large language models (LLMs) have\nsignificantly enhanced the performance in few-shot learning settings. By using\nonly a small number of labeled examples, referred to as demonstrations, LLMs\ncan effectively grasp the task at hand through in-context learning. However,\nthe process of selecting appropriate demonstrations has received limited\nattention in prior work. This paper addresses the issue of identifying the most\ninformative demonstrations for few-shot learning by approaching it as a\npool-based Active Learning (AL) problem over a single iteration. Our objective\nis to investigate how AL algorithms can serve as effective demonstration\nselection methods for in-context learning. We compare various standard AL\nalgorithms based on uncertainty, diversity, and similarity, and consistently\nobserve that the latter outperforms all other methods, including random\nsampling. Notably, uncertainty sampling, despite its success in conventional\nsupervised learning scenarios, performs poorly in this context. Our extensive\nexperimentation involving a diverse range of GPT and OPT models across $24$\nclassification and multi-choice tasks, coupled with thorough analysis,\nunambiguously demonstrates that in-context example selection through AL\nprioritizes high-quality examples that exhibit low uncertainty and bear\nsimilarity to the test examples.\n","authors":["Katerina Margatina","Timo Schick","Nikolaos Aletras","Jane Dwivedi-Yu"],"pdf_url":"https://arxiv.org/pdf/2305.14264v2.pdf","comment":"To appear at Findings of EMNLP (Camera Ready version)"},{"id":"http://arxiv.org/abs/2311.13281v1","updated":"2023-11-22T10:04:29Z","published":"2023-11-22T10:04:29Z","title":"Intention and Context Elicitation with Large Language Models in the\n Legal Aid Intake Process","summary":" Large Language Models (LLMs) and chatbots show significant promise in\nstreamlining the legal intake process. This advancement can greatly reduce the\nworkload and costs for legal aid organizations, improving availability while\nmaking legal assistance more accessible to a broader audience. However, a key\nchallenge with current LLMs is their tendency to overconfidently deliver an\nimmediate 'best guess' to a client's question based on the output distribution\nlearned over the training data. This approach often overlooks the client's\nactual intentions or the specifics of their legal situation. As a result,\nclients may not realize the importance of providing essential additional\ncontext or expressing their underlying intentions, which are crucial for their\nlegal cases. Traditionally, logic based decision trees have been used to\nautomate intake for specific access to justice issues, such as immigration and\neviction. But those solutions lack scalability. 
We demonstrate a\nproof-of-concept using LLMs to elicit and infer clients' underlying intentions\nand specific legal circumstances through free-form, language-based\ninteractions. We also propose future research directions to use supervised\nfine-tuning or offline reinforcement learning to automatically incorporate\nintention and context elicitation in chatbots without explicit prompting.\n","authors":["Nick Goodson","Rongfei Lu"],"pdf_url":"https://arxiv.org/pdf/2311.13281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13274v1","updated":"2023-11-22T09:51:53Z","published":"2023-11-22T09:51:53Z","title":"Enhancing Summarization Performance through Transformer-Based Prompt\n Engineering in Automated Medical Reporting","summary":" Customized medical prompts enable Large Language Models (LLM) to effectively\naddress medical dialogue summarization. The process of medical reporting is\noften time-consuming for healthcare professionals. Implementing medical\ndialogue summarization techniques presents a viable solution to alleviate this\ntime constraint by generating automated medical reports. The effectiveness of\nLLMs in this process is significantly influenced by the formulation of the\nprompt, which plays a crucial role in determining the quality and relevance of\nthe generated reports. In this research, we used a combination of two distinct\nprompting strategies, known as shot prompting and pattern prompting to enhance\nthe performance of automated medical reporting. The evaluation of the automated\nmedical reports is carried out using the ROUGE score and a human evaluation\nwith the help of an expert panel. The two-shot prompting approach in\ncombination with scope and domain context outperforms other methods and\nachieves the highest score when compared to the human reference set by a\ngeneral practitioner. However, the automated reports are approximately twice as\nlong as the human references, due to the addition of both redundant and\nrelevant statements that are added to the report.\n","authors":["Daphne van Zandvoort","Laura Wiersema","Tom Huibers","Sandra van Dulmen","Sjaak Brinkkemper"],"pdf_url":"https://arxiv.org/pdf/2311.13274v1.pdf","comment":"12 pages, 4 figures, submitted to Healthinf 2024, author roles:\n research conducted and written by Daphne van Zandvoort and Laura Wiersema,\n research suggested and used software created by Tom Huibers, data provided\n and feedback provided by Sandra van Dulmen, supervision and feedback provided\n by Sjaak Brinkkemper"},{"id":"http://arxiv.org/abs/2311.13273v1","updated":"2023-11-22T09:51:43Z","published":"2023-11-22T09:51:43Z","title":"Comparative Experimentation of Accuracy Metrics in Automated Medical\n Reporting: The Case of Otitis Consultations","summary":" Generative Artificial Intelligence (AI) can be used to automatically generate\nmedical reports based on transcripts of medical consultations. The aim is to\nreduce the administrative burden that healthcare professionals face. The\naccuracy of the generated reports needs to be established to ensure their\ncorrectness and usefulness. There are several metrics for measuring the\naccuracy of AI generated reports, but little work has been done towards the\napplication of these metrics in medical reporting. A comparative\nexperimentation of 10 accuracy metrics has been performed on AI generated\nmedical reports against their corresponding General Practitioner's (GP) medical\nreports concerning Otitis consultations. 
The number of missing, incorrect, and\nadditional statements of the generated reports have been correlated with the\nmetric scores. In addition, we introduce and define a Composite Accuracy Score\nwhich produces a single score for comparing the metrics within the field of\nautomated medical reporting. Findings show that based on the correlation study\nand the Composite Accuracy Score, the ROUGE-L and Word Mover's Distance metrics\nare the preferred metrics, which is not in line with previous work. These\nfindings help determine the accuracy of an AI generated medical report, which\naids the development of systems that generate medical reports for GPs to reduce\nthe administrative burden.\n","authors":["Wouter Faber","Renske Eline Bootsma","Tom Huibers","Sandra van Dulmen","Sjaak Brinkkemper"],"pdf_url":"https://arxiv.org/pdf/2311.13273v1.pdf","comment":"10 pages, 1 figure, submitted to HEALTHINF 2024, Author\n contributions: Wouter Faber and Renske Eline Bootsma performed research and\n wrote paper, Tom Huibers provided needed software and research inspiration,\n Sandra van Dulmen provided the data and feedback on paper, Sjaak Brinkkemper\n supervised the project and provided continuous feedback"},{"id":"http://arxiv.org/abs/2311.13258v1","updated":"2023-11-22T09:23:34Z","published":"2023-11-22T09:23:34Z","title":"ViStruct: Visual Structural Knowledge Extraction via Curriculum Guided\n Code-Vision Representation","summary":" State-of-the-art vision-language models (VLMs) still have limited performance\nin structural knowledge extraction, such as relations between objects. In this\nwork, we present ViStruct, a training framework to learn VLMs for effective\nvisual structural knowledge extraction. Two novel designs are incorporated.\nFirst, we propose to leverage the inherent structure of programming language to\ndepict visual structural information. This approach enables explicit and\nconsistent representation of visual structural information of multiple\ngranularities, such as concepts, relations, and events, in a well-organized\nstructured format. Second, we introduce curriculum-based learning for VLMs to\nprogressively comprehend visual structures, from fundamental visual concepts to\nintricate event structures. Our intuition is that lower-level knowledge may\ncontribute to complex visual structure understanding. Furthermore, we compile\nand release a collection of datasets tailored for visual structural knowledge\nextraction. We adopt a weakly-supervised approach to directly generate visual\nevent structures from captions for ViStruct training, capitalizing on abundant\nimage-caption pairs from the web. In experiments, we evaluate ViStruct on\nvisual structure prediction tasks, demonstrating its effectiveness in improving\nthe understanding of visual structures. The code is public at\n\\url{https://github.com/Yangyi-Chen/vi-struct}.\n","authors":["Yangyi Chen","Xingyao Wang","Manling Li","Derek Hoiem","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2311.13258v1.pdf","comment":"Accepted to EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.00321v2","updated":"2023-11-22T09:08:03Z","published":"2023-11-01T06:09:54Z","title":"HARE: Explainable Hate Speech Detection with Step-by-Step Reasoning","summary":" With the proliferation of social media, accurate detection of hate speech has\nbecome critical to ensure safety online. To combat nuanced forms of hate\nspeech, it is important to identify and thoroughly explain hate speech to help\nusers understand its harmful effects. 
Recent benchmarks have attempted to\ntackle this issue by training generative models on free-text annotations of\nimplications in hateful text. However, we find significant reasoning gaps in\nthe existing annotations schemes, which may hinder the supervision of detection\nmodels. In this paper, we introduce a hate speech detection framework, HARE,\nwhich harnesses the reasoning capabilities of large language models (LLMs) to\nfill these gaps in explanations of hate speech, thus enabling effective\nsupervision of detection models. Experiments on SBIC and Implicit Hate\nbenchmarks show that our method, using model-generated data, consistently\noutperforms baselines, using existing free-text human annotations. Analysis\ndemonstrates that our method enhances the explanation quality of trained models\nand improves generalization to unseen datasets. Our code is available at\nhttps://github.com/joonkeekim/hare-hate-speech.git.\n","authors":["Yongjin Yang","Joonkee Kim","Yujin Kim","Namgyu Ho","James Thorne","Se-young Yun"],"pdf_url":"https://arxiv.org/pdf/2311.00321v2.pdf","comment":"Findings of EMNLP 2023; The first three authors contribute equally"},{"id":"http://arxiv.org/abs/2311.13246v1","updated":"2023-11-22T09:04:57Z","published":"2023-11-22T09:04:57Z","title":"Automatic Instruction Optimization for Open-source LLM Instruction\n Tuning","summary":" Instruction tuning is crucial for enabling Language Learning Models (LLMs) in\nresponding to human instructions. The quality of instruction pairs used for\ntuning greatly affects the performance of LLMs. However, the manual creation of\nhigh-quality instruction datasets is costly, leading to the adoption of\nautomatic generation of instruction pairs by LLMs as a popular alternative in\nthe training of open-source LLMs. To ensure the high quality of LLM-generated\ninstruction datasets, several approaches have been proposed. Nevertheless,\nexisting methods either compromise dataset integrity by filtering a large\nproportion of samples, or are unsuitable for industrial applications. In this\npaper, instead of discarding low-quality samples, we propose CoachLM, a novel\napproach to enhance the quality of instruction datasets through automatic\nrevisions on samples in the dataset. CoachLM is trained from the samples\nrevised by human experts and significantly increases the proportion of\nhigh-quality samples in the dataset from 17.7% to 78.9%. The effectiveness of\nCoachLM is further assessed on various real-world instruction test sets. The\nresults show that CoachLM improves the instruction-following capabilities of\nthe instruction-tuned LLM by an average of 29.9%, which even surpasses larger\nLLMs with nearly twice the number of parameters. Furthermore, CoachLM is\nsuccessfully deployed in a data management system for LLMs at Huawei, resulting\nin an efficiency improvement of up to 20% in the cleaning of 40k real-world\ninstruction pairs. 
We release the training data and code of CoachLM\n(https://github.com/lunyiliu/CoachLM).\n","authors":["Yilun Liu","Shimin Tao","Xiaofeng Zhao","Ming Zhu","Wenbing Ma","Junhao Zhu","Chang Su","Yutai Hou","Miao Zhang","Min Zhang","Hongxia Ma","Li Zhang","Hao Yang","Yanfei Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.13246v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13240v1","updated":"2023-11-22T08:57:55Z","published":"2023-11-22T08:57:55Z","title":"On the Calibration of Large Language Models and Alignment","summary":" As large language models attract increasing attention and find widespread\napplication, concurrent challenges of reliability also arise at the same time.\nConfidence calibration, an effective analysis method for gauging the\nreliability of deep models, serves as a crucial tool for assessing and\nimproving their reliability. However, such investigation has been comparatively\nunderexplored. In this work, we conduct a systematic examination of the\ncalibration of aligned language models throughout the entire construction\nprocess, including pretraining and alignment training. At each stage, we\ninvestigate how different training settings, such as parameter scales and\ntraining data, affect model calibration. To thoroughly assess model\ncalibration, we evaluate models on three most concerned aspects: generation,\nfactuality and understanding. Our work sheds light on whether popular LLMs are\nwell-calibrated and how the training process influences model calibration.\n","authors":["Chiwei Zhu","Benfeng Xu","Quan Wang","Yongdong Zhang","Zhendong Mao"],"pdf_url":"https://arxiv.org/pdf/2311.13240v1.pdf","comment":"to be published in findings of EMNLP-2023"},{"id":"http://arxiv.org/abs/2311.12538v2","updated":"2023-11-22T08:44:34Z","published":"2023-11-21T11:33:03Z","title":"In-Context Learning Functions with Varying Number of Minima","summary":" Large Language Models (LLMs) have proven effective at In-Context Learning\n(ICL), an ability that allows them to create predictors from labeled examples.\nFew studies have explored the interplay between ICL and specific properties of\nfunctions it attempts to approximate. In our study, we use a formal framework\nto explore ICL and propose a new task of approximating functions with varying\nnumber of minima. We implement a method that allows for producing functions\nwith given inputs as minima. We find that increasing the number of minima\ndegrades ICL performance. At the same time, our evaluation shows that ICL\noutperforms 2-layer Neural Network (2NN) model. Furthermore, ICL learns faster\nthan 2NN in all settings. We validate the findings through a set of few-shot\nexperiments across various hyperparameter configurations.\n","authors":["David Oniani","Yanshan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12538v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13230v1","updated":"2023-11-22T08:39:17Z","published":"2023-11-22T08:39:17Z","title":"Enhancing Uncertainty-Based Hallucination Detection with Stronger Focus","summary":" Large Language Models (LLMs) have gained significant popularity for their\nimpressive performance across diverse fields. However, LLMs are prone to\nhallucinate untruthful or nonsensical outputs that fail to meet user\nexpectations in many real-world applications. Existing works for detecting\nhallucinations in LLMs either rely on external knowledge for reference\nretrieval or require sampling multiple responses from the LLM for consistency\nverification, making these methods costly and inefficient. 
In this paper, we\npropose a novel reference-free, uncertainty-based method for detecting\nhallucinations in LLMs. Our approach imitates human focus in factuality\nchecking from three aspects: 1) focus on the most informative and important\nkeywords in the given text; 2) focus on the unreliable tokens in historical\ncontext which may lead to a cascade of hallucinations; and 3) focus on the\ntoken properties such as token type and token frequency. Experimental results\non relevant datasets demonstrate the effectiveness of our proposed method,\nwhich achieves state-of-the-art performance across all the evaluation metrics\nand eliminates the need for additional information.\n","authors":["Tianhang Zhang","Lin Qiu","Qipeng Guo","Cheng Deng","Yue Zhang","Zheng Zhang","Chenghu Zhou","Xinbing Wang","Luoyi Fu"],"pdf_url":"https://arxiv.org/pdf/2311.13230v1.pdf","comment":"Accepted by EMNLP 2023 (main conference)"},{"id":"http://arxiv.org/abs/2310.00603v2","updated":"2023-11-22T08:00:10Z","published":"2023-10-01T07:31:04Z","title":"Faithful Explanations of Black-box NLP Models Using LLM-generated\n Counterfactuals","summary":" Causal explanations of the predictions of NLP systems are essential to ensure\nsafety and establish trust. Yet, existing methods often fall short of\nexplaining model predictions effectively or efficiently and are often\nmodel-specific. In this paper, we address model-agnostic explanations,\nproposing two approaches for counterfactual (CF) approximation. The first\napproach is CF generation, where a large language model (LLM) is prompted to\nchange a specific text concept while keeping confounding concepts unchanged.\nWhile this approach is demonstrated to be very effective, applying LLM at\ninference-time is costly. We hence present a second approach based on matching,\nand propose a method that is guided by an LLM at training-time and learns a\ndedicated embedding space. This space is faithful to a given causal graph and\neffectively serves to identify matches that approximate CFs. After showing\ntheoretically that approximating CFs is required in order to construct faithful\nexplanations, we benchmark our approaches and explain several models, including\nLLMs with billions of parameters. Our empirical results demonstrate the\nexcellent performance of CF generation models as model-agnostic explainers.\nMoreover, our matching approach, which requires far less test-time resources,\nalso provides effective explanations, surpassing many baselines. We also find\nthat Top-K techniques universally improve every tested method. Finally, we\nshowcase the potential of LLMs in constructing new benchmarks for model\nexplanation and subsequently validate our conclusions. Our work illuminates new\npathways for efficient and accurate approaches to interpreting NLP systems.\n","authors":["Yair Gat","Nitay Calderon","Amir Feder","Alexander Chapanin","Amit Sharma","Roi Reichart"],"pdf_url":"https://arxiv.org/pdf/2310.00603v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03214v2","updated":"2023-11-22T07:28:19Z","published":"2023-10-05T00:04:12Z","title":"FreshLLMs: Refreshing Large Language Models with Search Engine\n Augmentation","summary":" Most large language models (LLMs) are trained once and never updated; thus,\nthey lack the ability to dynamically adapt to our ever-changing world. 
In this\nwork, we perform a detailed study of the factuality of LLM-generated text in\nthe context of answering questions that test current world knowledge.\nSpecifically, we introduce FreshQA, a novel dynamic QA benchmark encompassing a\ndiverse range of question and answer types, including questions that require\nfast-changing world knowledge as well as questions with false premises that\nneed to be debunked. We benchmark a diverse array of both closed and\nopen-source LLMs under a two-mode evaluation procedure that allows us to\nmeasure both correctness and hallucination. Through human evaluations involving\nmore than 50K judgments, we shed light on limitations of these models and\ndemonstrate significant room for improvement: for instance, all models\n(regardless of model size) struggle on questions that involve fast-changing\nknowledge and false premises. Motivated by these results, we present\nFreshPrompt, a simple few-shot prompting method that substantially boosts the\nperformance of an LLM on FreshQA by incorporating relevant and up-to-date\ninformation retrieved from a search engine into the prompt. Our experiments\nshow that FreshPrompt outperforms both competing search engine-augmented\nprompting methods such as Self-Ask (Press et al., 2022) as well as commercial\nsystems such as Perplexity.AI. Further analysis of FreshPrompt reveals that\nboth the number of retrieved evidences and their order play a key role in\ninfluencing the correctness of LLM-generated answers. Additionally, instructing\nthe LLM to generate concise and direct answers helps reduce hallucination\ncompared to encouraging more verbose answers. To facilitate future work, we\nrelease FreshQA at github.com/freshllms/freshqa and commit to updating it at\nregular intervals.\n","authors":["Tu Vu","Mohit Iyyer","Xuezhi Wang","Noah Constant","Jerry Wei","Jason Wei","Chris Tar","Yun-Hsuan Sung","Denny Zhou","Quoc Le","Thang Luong"],"pdf_url":"https://arxiv.org/pdf/2310.03214v2.pdf","comment":"Preprint, 26 pages, 10 figures, 5 tables; Added FreshEval"},{"id":"http://arxiv.org/abs/2310.09886v4","updated":"2023-11-22T06:44:16Z","published":"2023-10-15T16:51:11Z","title":"Lifelong Sequence Generation with Dynamic Module Expansion and\n Adaptation","summary":" Lifelong sequence generation (LSG), a problem in continual learning, aims to\ncontinually train a model on a sequence of generation tasks to learn constantly\nemerging new generation patterns while avoiding the forgetting of previous\nknowledge. Existing LSG methods mainly focus on maintaining old knowledge while\npaying little attention to knowledge transfer across tasks. In contrast, humans\ncan better learn new tasks by leveraging previously acquired knowledge from\nsimilar tasks. Inspired by the learning paradigm of humans, we propose Dynamic\nModule Expansion and Adaptation (DMEA), which enables the model to dynamically\ndetermine the architecture for acquiring new knowledge based on task\ncorrelation and select the most similar previous tasks to facilitate adaptation\nto new tasks. In addition, as the learning process can easily be biased towards\nthe current task which might cause more severe forgetting of previously learned\nknowledge, we propose dynamic gradient scaling to balance the learning of the\ncurrent task and replayed tasks. 
With extensive experiments, we demonstrate\nthat DMEA can consistently outperform existing methods in different LSG\nsettings.\n","authors":["Chengwei Qin","Chen Chen","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2310.09886v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13184v1","updated":"2023-11-22T06:23:18Z","published":"2023-11-22T06:23:18Z","title":"AS-LLM: When Algorithm Selection Meets Large Language Model","summary":" Algorithm selection aims to identify the most suitable algorithm for solving\na specific problem before execution, which has become a critical process of the\nAutoML. Current mainstream algorithm selection techniques rely heavily on\nfeature representations of various problems and employ the performance of each\nalgorithm as supervised information. However, there is a significant research\ngap concerning the consideration of algorithm features. This gap is primarily\nattributed to the inherent complexity of algorithms, making it particularly\nchallenging to find a universally effective feature extraction method that is\napplicable across a diverse range of algorithms. Unfortunately, neglecting this\naspect undoubtedly impacts the accuracy of algorithm selection and indirectly\nnecessitates an increased volume of problem data for training purposes. This\npaper takes a significant stride towards addressing this gap by proposing an\napproach that integrates algorithm representation into the algorithm selection\nprocess. Specifically, our proposed model employs distinct modules to extract\nrepresentations of both problems and algorithms, where the algorithm\nrepresentation leverages the capabilities of pre-trained LLMs in the realm of\ncode comprehension. Following the extraction of embedding vectors for both\nalgorithms and problems, the most suitable algorithm is determined through\ncalculations of matching degrees. Our experiments not only validate the\neffectiveness of the proposed model but also showcase the performance of\ndifferent embedded pre-trained LLMs, which suggests that the proposed algorithm\nselection framework holds the potential to serve as a baseline task for\nevaluating the code representation capabilities of LLMs.\n","authors":["Xingyu Wu","Yan Zhong","Jibin Wu","Kay Chen Tan"],"pdf_url":"https://arxiv.org/pdf/2311.13184v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13171v1","updated":"2023-11-22T05:28:59Z","published":"2023-11-22T05:28:59Z","title":"ComPEFT: Compression for Communicating Parameter Efficient Updates via\n Sparsification and Quantization","summary":" Parameter-efficient fine-tuning (PEFT) techniques make it possible to\nefficiently adapt a language model to create \"expert\" models that specialize to\nnew tasks or domains. Recent techniques in model merging and compositional\ngeneralization leverage these expert models by dynamically composing modules to\nimprove zero/few-shot generalization. Despite the efficiency of PEFT methods,\nthe size of expert models can make it onerous to retrieve expert models per\nquery over high-latency networks like the Internet or serve multiple experts on\na single GPU. To address these issues, we present ComPEFT, a novel method for\ncompressing fine-tuning residuals (task vectors) of PEFT based models. ComPEFT\nemploys sparsification and ternary quantization to reduce the size of the PEFT\nmodule without performing any additional retraining while preserving or\nenhancing model performance. 
In extensive evaluation across T5, T0, and\nLLaMA-based models with 200M - 65B parameters, ComPEFT achieves compression\nratios of 8x - 50x. In particular, we show that ComPEFT improves with scale -\nstronger models exhibit higher compressibility and better performance. For\nexample, we show that ComPEFT applied to LLaMA outperforms QLoRA by 4.16% on\nMMLU with a storage size reduction of up to 26x. In addition, we show that the\ncompressed experts produced by ComPEFT maintain few-shot compositional\ngeneralization capabilities, facilitate efficient communication and\ncomputation, and exhibit enhanced performance when merged. Lastly, we provide\nan analysis of different method components, compare it with other PEFT methods,\nand test ComPEFT's efficacy for compressing the residual of full-finetuning.\nOur code is available at https://github.com/prateeky2806/compeft.\n","authors":["Prateek Yadav","Leshem Choshen","Colin Raffel","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2311.13171v1.pdf","comment":"25 Pages, 6 Figures, 16 Tables"},{"id":"http://arxiv.org/abs/2311.13133v1","updated":"2023-11-22T03:37:01Z","published":"2023-11-22T03:37:01Z","title":"LIMIT: Less Is More for Instruction Tuning Across Evaluation Paradigms","summary":" Large Language Models are traditionally finetuned on large instruction\ndatasets. However recent studies suggest that small, high-quality datasets can\nsuffice for general purpose instruction following. This lack of consensus\nsurrounding finetuning best practices is in part due to rapidly diverging\napproaches to LLM evaluation. In this study, we ask whether a small amount of\ndiverse finetuning samples can improve performance on both traditional\nperplexity-based NLP benchmarks, and on open-ended, model-based evaluation. We\nfinetune open-source MPT-7B and MPT-30B models on instruction finetuning\ndatasets of various sizes ranging from 1k to 60k samples. We find that subsets\nof 1k-6k instruction finetuning samples are sufficient to achieve good\nperformance on both (1) traditional NLP benchmarks and (2) model-based\nevaluation. Finally, we show that mixing textbook-style and open-ended QA\nfinetuning datasets optimizes performance on both evaluation paradigms.\n","authors":["Aditi Jha","Sam Havens","Jeremey Dohmann","Alex Trott","Jacob Portes"],"pdf_url":"https://arxiv.org/pdf/2311.13133v1.pdf","comment":"36 pages, 12 figures, NeurIPS 2023 Workshop on Instruction Tuning and\n Instruction Following"},{"id":"http://arxiv.org/abs/2311.13126v1","updated":"2023-11-22T03:28:34Z","published":"2023-11-22T03:28:34Z","title":"Towards Better Parameter-Efficient Fine-Tuning for Large Language\n Models: A Position Paper","summary":" This paper delves into the pressing need in Parameter-Efficient Fine-Tuning\n(PEFT) for Large Language Models (LLMs). While LLMs possess remarkable\ncapabilities, their extensive parameter requirements and associated\ncomputational demands hinder their practicality and scalability for real-world\napplications. Our position paper highlights current states and the necessity of\nfurther studying into the topic, and recognizes significant challenges and open\nissues that must be addressed to fully harness the powerful abilities of LLMs.\nThese challenges encompass novel efficient PEFT architectures, PEFT for\ndifferent learning settings, PEFT combined with model compression techniques,\nand the exploration of PEFT for multi-modal LLMs. 
By presenting this position\npaper, we aim to stimulate further research and foster discussions surrounding\nmore efficient and accessible PEFT for LLMs.\n","authors":["Chengyu Wang","Junbing Yan","Wei Zhang","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2311.13126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13118v1","updated":"2023-11-22T02:45:01Z","published":"2023-11-22T02:45:01Z","title":"Combatting Human Trafficking in the Cyberspace: A Natural Language\n Processing-Based Methodology to Analyze the Language in Online Advertisements","summary":" This project tackles the pressing issue of human trafficking in online C2C\nmarketplaces through advanced Natural Language Processing (NLP) techniques. We\nintroduce a novel methodology for generating pseudo-labeled datasets with\nminimal supervision, serving as a rich resource for training state-of-the-art\nNLP models. Focusing on tasks like Human Trafficking Risk Prediction (HTRP) and\nOrganized Activity Detection (OAD), we employ cutting-edge Transformer models\nfor analysis. A key contribution is the implementation of an interpretability\nframework using Integrated Gradients, providing explainable insights crucial\nfor law enforcement. This work not only fills a critical gap in the literature\nbut also offers a scalable, machine learning-driven approach to combat human\nexploitation online. It serves as a foundation for future research and\npractical applications, emphasizing the role of machine learning in addressing\ncomplex social issues.\n","authors":["Alejandro Rodriguez Perez","Pablo Rivas"],"pdf_url":"https://arxiv.org/pdf/2311.13118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13110v1","updated":"2023-11-22T02:23:32Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. 
We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v1.pdf","comment":"This paper integrates the works arXiv:2306.01129 and\n arXiv:2308.16271, as well as this under-review work:\n https://openreview.net/forum?id=PvyOYleymy into a complete story. In this\n paper, we improve the writing and organization, and also add conceptual,\n empirical, and theoretical improvements over the previous work"},{"id":"http://arxiv.org/abs/2311.13105v1","updated":"2023-11-22T02:12:36Z","published":"2023-11-22T02:12:36Z","title":"Perceptual Structure in the Absence of Grounding for LLMs: The Impact of\n Abstractedness and Subjectivity in Color Language","summary":" The need for grounding in language understanding is an active research topic.\nPrevious work has suggested that color perception and color language appear as\na suitable test bed to empirically study the problem, given its cognitive\nsignificance and showing that there is considerable alignment between a defined\ncolor space and the feature space defined by a language model. To further study\nthis issue, we collect a large scale source of colors and their descriptions,\ncontaining almost a 1 million examples , and perform an empirical analysis to\ncompare two kinds of alignments: (i) inter-space, by learning a mapping between\nembedding space and color space, and (ii) intra-space, by means of prompting\ncomparatives between color descriptions. Our results show that while color\nspace alignment holds for monolexemic, highly pragmatic color descriptions,\nthis alignment drops considerably in the presence of examples that exhibit\nelements of real linguistic usage such as subjectivity and abstractedness,\nsuggesting that grounding may be required in such cases.\n","authors":["Pablo Loyola","Edison Marrese-Taylor","Andres Hoyos-Idobro"],"pdf_url":"https://arxiv.org/pdf/2311.13105v1.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.13102v1","updated":"2023-11-22T02:04:35Z","published":"2023-11-22T02:04:35Z","title":"Detecting out-of-distribution text using topological features of\n transformer-based language models","summary":" We attempt to detect out-of-distribution (OOD) text samples though applying\nTopological Data Analysis (TDA) to attention maps in transformer-based language\nmodels. We evaluate our proposed TDA-based approach for out-of-distribution\ndetection on BERT, a transformer-based language model, and compare the to a\nmore traditional OOD approach based on BERT CLS embeddings. 
We found that our\nTDA approach outperforms the CLS embedding approach at distinguishing\nin-distribution data (politics and entertainment news articles from HuffPost)\nfrom far out-of-domain samples (IMDB reviews), but its effectiveness\ndeteriorates with near out-of-domain (CNN/Dailymail) or same-domain (business\nnews articles from HuffPost) datasets.\n","authors":["Andres Pollano","Anupam Chaudhuri","Anj Simmons"],"pdf_url":"https://arxiv.org/pdf/2311.13102v1.pdf","comment":"12 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.13095v1","updated":"2023-11-22T01:51:50Z","published":"2023-11-22T01:51:50Z","title":"Enhancing Logical Reasoning in Large Language Models to Facilitate Legal\n Applications","summary":" Language serves as a vehicle for conveying thought, enabling communication\namong individuals. The ability to distinguish between diverse concepts,\nidentify fairness and injustice, and comprehend a range of legal notions\nfundamentally relies on logical reasoning. Large Language Models (LLMs) attempt\nto emulate human language understanding and generation, but their competency in\nlogical reasoning remains limited. This paper seeks to address the\nphilosophical question: How can we effectively teach logical reasoning to LLMs\nwhile maintaining a deep understanding of the intricate relationship between\nlanguage and logic? By focusing on bolstering LLMs' capabilities in logical\nreasoning, we aim to expand their applicability in law and other\nlogic-intensive disciplines. To this end, we propose a Reinforcement Learning\nfrom Logical Feedback (RLLF) approach, which serves as a potential framework\nfor refining LLMs' reasoning capacities. Through RLLF and a revised evaluation\nmethodology, we explore new avenues for research in this domain and contribute\nto the development of LLMs capable of handling complex legal reasoning tasks\nwhile acknowledging the fundamental connection between language and logic.\n","authors":["Ha-Thanh Nguyen","Wachara Fungwacharakorn","Ken Satoh"],"pdf_url":"https://arxiv.org/pdf/2311.13095v1.pdf","comment":"ALP@JURIX2023"},{"id":"http://arxiv.org/abs/2310.12942v3","updated":"2023-11-22T01:39:59Z","published":"2023-10-19T17:39:47Z","title":"On the Representational Capacity of Recurrent Neural Language Models","summary":" This work investigates the computational expressivity of language models\n(LMs) based on recurrent neural networks (RNNs). Siegelmann and Sontag (1992)\nfamously showed that RNNs with rational weights and hidden states and unbounded\ncomputation time are Turing complete. However, LMs define weightings over\nstrings in addition to just (unweighted) language membership and the analysis\nof the computational power of RNN LMs (RLMs) should reflect this. We extend the\nTuring completeness result to the probabilistic case, showing how a rationally\nweighted RLM with unbounded computation time can simulate any deterministic\nprobabilistic Turing machine (PTM) with rationally weighted transitions. Since,\nin practice, RLMs work in real-time, processing a symbol at every time step, we\ntreat the above result as an upper bound on the expressivity of RLMs. 
We also\nprovide a lower bound by showing that under the restriction to real-time\ncomputation, such models can simulate deterministic real-time rational PTMs.\n","authors":["Franz Nowak","Anej Svete","Li Du","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2310.12942v3.pdf","comment":"To be published at EMNLP 2023"},{"id":"http://arxiv.org/abs/2305.12401v2","updated":"2023-11-22T23:56:43Z","published":"2023-05-21T08:51:24Z","title":"WOT-Class: Weakly Supervised Open-world Text Classification","summary":" State-of-the-art weakly supervised text classification methods, while\nsignificantly reduced the required human supervision, still requires the\nsupervision to cover all the classes of interest. This is never easy to meet in\npractice when human explore new, large corpora without complete pictures. In\nthis paper, we work on a novel yet important problem of weakly supervised\nopen-world text classification, where supervision is only needed for a few\nexamples from a few known classes and the machine should handle both known and\nunknown classes in test time. General open-world classification has been\nstudied mostly using image classification; however, existing methods typically\nassume the availability of sufficient known-class supervision and strong\nunknown-class prior knowledge (e.g., the number and/or data distribution). We\npropose a novel framework WOT-Class that lifts those strong assumptions.\nSpecifically, it follows an iterative process of (a) clustering text to new\nclasses, (b) mining and ranking indicative words for each class, and (c)\nmerging redundant classes by using the overlapped indicative words as a bridge.\nExtensive experiments on 7 popular text classification datasets demonstrate\nthat WOT-Class outperforms strong baselines consistently with a large margin,\nattaining 23.33% greater average absolute macro-F1 over existing approaches\nacross all datasets. Such competent accuracy illuminates the practical\npotential of further reducing human effort for text classification.\n","authors":["Tianle Wang","Zihan Wang","Weitang Liu","Jingbo Shang"],"pdf_url":"https://arxiv.org/pdf/2305.12401v2.pdf","comment":"Accepted by CIKM 2023"},{"id":"http://arxiv.org/abs/2311.13735v1","updated":"2023-11-22T23:35:13Z","published":"2023-11-22T23:35:13Z","title":"Surpassing GPT-4 Medical Coding with a Two-Stage Approach","summary":" Recent advances in large language models (LLMs) show potential for clinical\napplications, such as clinical decision support and trial recommendations.\nHowever, the GPT-4 LLM predicts an excessive number of ICD codes for medical\ncoding tasks, leading to high recall but low precision. To tackle this\nchallenge, we introduce LLM-codex, a two-stage approach to predict ICD codes\nthat first generates evidence proposals using an LLM and then employs an\nLSTM-based verification stage. The LSTM learns from both the LLM's high recall\nand human expert's high precision, using a custom loss function. 
Our model is\nthe only approach that simultaneously achieves state-of-the-art results in\nmedical coding accuracy, accuracy on rare codes, and sentence-level evidence\nidentification to support coding decisions without training on human-annotated\nevidence according to experiments on the MIMIC dataset.\n","authors":["Zhichao Yang","Sanjit Singh Batra","Joel Stremmel","Eran Halperin"],"pdf_url":"https://arxiv.org/pdf/2311.13735v1.pdf","comment":"Extended Abstract presented at Machine Learning for Health (ML4H)\n symposium 2023, December 10th, 2023, New Orleans, United States, 19 pages"},{"id":"http://arxiv.org/abs/2311.13729v1","updated":"2023-11-22T22:52:00Z","published":"2023-11-22T22:52:00Z","title":"Comparison of pipeline, sequence-to-sequence, and GPT models for\n end-to-end relation extraction: experiments with the rare disease use-case","summary":" End-to-end relation extraction (E2ERE) is an important and realistic\napplication of natural language processing (NLP) in biomedicine. In this paper,\nwe aim to compare three prevailing paradigms for E2ERE using a complex dataset\nfocused on rare diseases involving discontinuous and nested entities. We use\nthe RareDis information extraction dataset to evaluate three competing\napproaches (for E2ERE): NER $\\rightarrow$ RE pipelines, joint sequence to\nsequence models, and generative pre-trained transformer (GPT) models. We use\ncomparable state-of-the-art models and best practices for each of these\napproaches and conduct error analyses to assess their failure modes. Our\nfindings reveal that pipeline models are still the best, while\nsequence-to-sequence models are not far behind; GPT models with eight times as\nmany parameters are worse than even sequence-to-sequence models and lose to\npipeline models by over 10 F1 points. Partial matches and discontinuous\nentities caused many NER errors contributing to lower overall E2E performances.\nWe also verify these findings on a second E2ERE dataset for chemical-protein\ninteractions. Although generative LM-based methods are more suitable for\nzero-shot settings, when training data is available, our results show that it\nis better to work with more conventional models trained and tailored for E2ERE.\nMore innovative methods are needed to marry the best of the both worlds from\nsmaller encoder-decoder pipeline models and the larger GPT models to improve\nE2ERE. As of now, we see that well designed pipeline models offer substantial\nperformance gains at a lower cost and carbon footprint for E2ERE. Our\ncontribution is also the first to conduct E2ERE for the RareDis dataset.\n","authors":["Shashank Gupta","Xuguang Ai","Ramakanth Kavuluru"],"pdf_url":"https://arxiv.org/pdf/2311.13729v1.pdf","comment":"The dataset and code for all our experiments are publicly available:\n https://github.com/shashank140195/Raredis"},{"id":"http://arxiv.org/abs/2311.13708v1","updated":"2023-11-22T21:59:46Z","published":"2023-11-22T21:59:46Z","title":"Dynamic Analysis Method for Hidden Dangers in Substation Based on\n Knowledge Graph","summary":" To address the challenge of identifying and understanding hidden dangers in\nsubstations from unstructured text data, a novel dynamic analysis method is\nproposed. This approach begins by analyzing and extracting data from the\nunstructured text related to hidden dangers. It then leverages a flexible,\ndistributed data search engine built on Elastic-Search to handle this\ninformation. Following this, the hidden Markov model is employed to train the\ndata within the engine. 
The Viterbi algorithm is integrated to decipher the\nhidden state sequences, facilitating the segmentation and labeling of entities\nrelated to hidden dangers. The final step involves using the Neo4j graph\ndatabase to dynamically create a knowledge map that visualizes hidden dangers\nin the substation. This method's effectiveness is demonstrated through an\nexample analysis using data from a specific substation's hidden dangers.\n","authors":["Weiwei Li","Xing Liu","Wei Wang","Lu Chen","Sizhe Li","Hui Fan"],"pdf_url":"https://arxiv.org/pdf/2311.13708v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10057v3","updated":"2023-11-22T21:22:11Z","published":"2023-11-16T17:52:21Z","title":"The Song Describer Dataset: a Corpus of Audio Captions for\n Music-and-Language Evaluation","summary":" We introduce the Song Describer dataset (SDD), a new crowdsourced corpus of\nhigh-quality audio-caption pairs, designed for the evaluation of\nmusic-and-language models. The dataset consists of 1.1k human-written natural\nlanguage descriptions of 706 music recordings, all publicly accessible and\nreleased under Creative Common licenses. To showcase the use of our dataset, we\nbenchmark popular models on three key music-and-language tasks (music\ncaptioning, text-to-music generation and music-language retrieval). Our\nexperiments highlight the importance of cross-dataset evaluation and offer\ninsights into how researchers can use SDD to gain a broader understanding of\nmodel performance.\n","authors":["Ilaria Manco","Benno Weck","SeungHeon Doh","Minz Won","Yixiao Zhang","Dmitry Bogdanov","Yusong Wu","Ke Chen","Philip Tovstogan","Emmanouil Benetos","Elio Quinton","György Fazekas","Juhan Nam"],"pdf_url":"https://arxiv.org/pdf/2311.10057v3.pdf","comment":"Accepted to NeurIPS 2023 Workshop on Machine Learning for Audio"},{"id":"http://arxiv.org/abs/2311.13668v1","updated":"2023-11-22T19:45:40Z","published":"2023-11-22T19:45:40Z","title":"MAIRA-1: A specialised large multimodal model for radiology report\n generation","summary":" We present a radiology-specific multimodal model for the task for generating\nradiological reports from chest X-rays (CXRs). Our work builds on the idea that\nlarge language model(s) can be equipped with multimodal capabilities through\nalignment with pre-trained vision encoders. On natural images, this has been\nshown to allow multimodal models to gain image understanding and description\ncapabilities. Our proposed model (MAIRA-1) leverages a CXR-specific image\nencoder in conjunction with a fine-tuned large language model based on\nVicuna-7B, and text-based data augmentation, to produce reports with\nstate-of-the-art quality. In particular, MAIRA-1 significantly improves on the\nradiologist-aligned RadCliQ metric and across all lexical metrics considered.\nManual review of model outputs demonstrates promising fluency and accuracy of\ngenerated reports while uncovering failure modes not captured by existing\nevaluation practices. More information and resources can be found on the\nproject website: https://aka.ms/maira.\n","authors":["Stephanie L. Hyland","Shruthi Bannur","Kenza Bouzid","Daniel C. Castro","Mercy Ranjit","Anton Schwaighofer","Fernando Pérez-García","Valentina Salvatelli","Shaury Srivastav","Anja Thieme","Noel Codella","Matthew P. 
Lungren","Maria Teodora Wetscherek","Ozan Oktay","Javier Alvarez-Valle"],"pdf_url":"https://arxiv.org/pdf/2311.13668v1.pdf","comment":"18 pages, 9 tables, 5 figures"},{"id":"http://arxiv.org/abs/2311.13657v1","updated":"2023-11-22T19:19:37Z","published":"2023-11-22T19:19:37Z","title":"Efficient Transformer Knowledge Distillation: A Performance Review","summary":" As pretrained transformer language models continue to achieve\nstate-of-the-art performance, the Natural Language Processing community has\npushed for advances in model compression and efficient attention mechanisms to\naddress high computational requirements and limited input sequence length.\nDespite these separate efforts, no investigation has been done into the\nintersection of these two fields. In this work, we provide an evaluation of\nmodel compression via knowledge distillation on efficient attention\ntransformers. We provide cost-performance trade-offs for the compression of\nstate-of-the-art efficient attention architectures and the gains made in\nperformance in comparison to their full attention counterparts. Furthermore, we\nintroduce a new long-context Named Entity Recognition dataset, GONERD, to train\nand test the performance of NER models on long sequences. We find that\ndistilled efficient attention transformers can preserve a significant amount of\noriginal model performance, preserving up to 98.6% across short-context tasks\n(GLUE, SQUAD, CoNLL-2003), up to 94.6% across long-context\nQuestion-and-Answering tasks (HotpotQA, TriviaQA), and up to 98.8% on\nlong-context Named Entity Recognition (GONERD), while decreasing inference\ntimes by up to 57.8%. We find that, for most models on most tasks, performing\nknowledge distillation is an effective method to yield high-performing\nefficient attention models with low costs.\n","authors":["Nathan Brown","Ashton Williamson","Tahj Anderson","Logan Lawrence"],"pdf_url":"https://arxiv.org/pdf/2311.13657v1.pdf","comment":"Accepted to EMNLP 2023. 12 pages, 1 figure, 11 tables. Models and\n data available at https://huggingface.co/giant-oak"},{"id":"http://arxiv.org/abs/2311.13647v1","updated":"2023-11-22T19:04:04Z","published":"2023-11-22T19:04:04Z","title":"Language Model Inversion","summary":" Language models produce a distribution over the next token; can we use this\ninformation to recover the prompt tokens? We consider the problem of language\nmodel inversion and show that next-token probabilities contain a surprising\namount of information about the preceding text. Often we can recover the text\nin cases where it is hidden from the user, motivating a method for recovering\nunknown prompts given only the model's current distribution output. We consider\na variety of model access scenarios, and show how even without predictions for\nevery token in the vocabulary we can recover the probability vector through\nsearch. On Llama-2 7b, our inversion method reconstructs prompts with a BLEU of\n$59$ and token-level F1 of $78$ and recovers $27\\%$ of prompts exactly. Code\nfor reproducing all experiments is available at\nhttp://github.com/jxmorris12/vec2text.\n","authors":["John X. Morris","Wenting Zhao","Justin T. Chiu","Vitaly Shmatikov","Alexander M. 
Rush"],"pdf_url":"https://arxiv.org/pdf/2311.13647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13628v1","updated":"2023-11-22T18:50:47Z","published":"2023-11-22T18:50:47Z","title":"Prompt Risk Control: A Rigorous Framework for Responsible Deployment of\n Large Language Models","summary":" The recent explosion in the capabilities of large language models has led to\na wave of interest in how best to prompt a model to perform a given task. While\nit may be tempting to simply choose a prompt based on average performance on a\nvalidation set, this can lead to a deployment where unexpectedly poor responses\nare generated, especially for the worst-off users. To mitigate this prospect,\nwe propose Prompt Risk Control, a lightweight framework for selecting a prompt\nbased on rigorous upper bounds on families of informative risk measures. We\noffer methods for producing bounds on a diverse set of metrics, including\nquantities that measure worst-case responses and disparities in generation\nquality across the population of users. In addition, we extend the underlying\nstatistical bounding techniques to accommodate the possibility of distribution\nshifts in deployment. Experiments on applications such as open-ended chat,\nmedical question summarization, and code generation highlight how such a\nframework can foster responsible deployment by reducing the risk of the worst\noutcomes.\n","authors":["Thomas P. Zollo","Todd Morrill","Zhun Deng","Jake C. Snell","Toniann Pitassi","Richard Zemel"],"pdf_url":"https://arxiv.org/pdf/2311.13628v1.pdf","comment":"33 pages, 10 figures, and accepted to the Socially Responsible\n Language Modelling Research (SoLaR) workshop at NeurIPS 2023"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.13602v1","updated":"2023-11-22T18:59:53Z","published":"2023-11-22T18:59:53Z","title":"Retrieval-Augmented Layout Transformer for Content-Aware Layout\n Generation","summary":" Content-aware graphic layout generation aims to automatically arrange visual\nelements along with a given content, such as an e-commerce product image. In\nthis paper, we argue that the current layout generation approaches suffer from\nthe limited training data for the high-dimensional layout structure. We show\nthat a simple retrieval augmentation can significantly improve the generation\nquality. Our model, which is named Retrieval-Augmented Layout Transformer\n(RALF), retrieves nearest neighbor layout examples based on an input image and\nfeeds these results into an autoregressive generator. Our model can apply\nretrieval augmentation to various controllable generation tasks and yield\nhigh-quality layouts within a unified architecture. Our extensive experiments\nshow that RALF successfully generates content-aware layouts in both constrained\nand unconstrained settings and significantly outperforms the baselines.\n","authors":["Daichi Horita","Naoto Inoue","Kotaro Kikuchi","Kota Yamaguchi","Kiyoharu Aizawa"],"pdf_url":"https://arxiv.org/pdf/2311.13602v1.pdf","comment":"Webpage: https://udonda.github.io/RALF/"},{"id":"http://arxiv.org/abs/2311.13601v1","updated":"2023-11-22T18:59:48Z","published":"2023-11-22T18:59:48Z","title":"Visual In-Context Prompting","summary":" In-context prompting in large language models (LLMs) has become a prevalent\napproach to improve zero-shot capabilities, but this idea is less explored in\nthe vision domain. 
Existing visual prompting methods focus on referring\nsegmentation to segment the most relevant object, falling short of addressing\nmany generic vision tasks like open-set segmentation and detection. In this\npaper, we introduce a universal visual in-context prompting framework for both\ntasks. In particular, we build on top of an encoder-decoder architecture, and\ndevelop a versatile prompt encoder to support a variety of prompts like\nstrokes, boxes, and points. We further enhance it to take an arbitrary number\nof reference image segments as the context. Our extensive explorations show\nthat the proposed visual in-context prompting elicits extraordinary referring\nand generic segmentation capabilities to refer and detect, yielding competitive\nperformance to close-set in-domain datasets and showing promising results on\nmany open-set segmentation datasets. By joint training on COCO and SA-1B, our\nmodel achieves $57.7$ PQ on COCO and $23.2$ PQ on ADE20K. Code will be\navailable at https://github.com/UX-Decoder/DINOv.\n","authors":["Feng Li","Qing Jiang","Hao Zhang","Tianhe Ren","Shilong Liu","Xueyan Zou","Huaizhe Xu","Hongyang Li","Chunyuan Li","Jianwei Yang","Lei Zhang","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2311.13601v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2311.13600v1","updated":"2023-11-22T18:59:36Z","published":"2023-11-22T18:59:36Z","title":"ZipLoRA: Any Subject in Any Style by Effectively Merging LoRAs","summary":" Methods for finetuning generative models for concept-driven personalization\ngenerally achieve strong results for subject-driven or style-driven generation.\nRecently, low-rank adaptations (LoRA) have been proposed as a\nparameter-efficient way of achieving concept-driven personalization. While\nrecent work explores the combination of separate LoRAs to achieve joint\ngeneration of learned styles and subjects, existing techniques do not reliably\naddress the problem; they often compromise either subject fidelity or style\nfidelity. We propose ZipLoRA, a method to cheaply and effectively merge\nindependently trained style and subject LoRAs in order to achieve generation of\nany user-provided subject in any user-provided style. Experiments on a wide\nrange of subject and style combinations show that ZipLoRA can generate\ncompelling results with meaningful improvements over baselines in subject and\nstyle fidelity while preserving the ability to recontextualize. Project page:\nhttps://ziplora.github.io\n","authors":["Viraj Shah","Nataniel Ruiz","Forrester Cole","Erika Lu","Svetlana Lazebnik","Yuanzhen Li","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2311.13600v1.pdf","comment":"Project page: https://ziplora.github.io"},{"id":"http://arxiv.org/abs/2311.13596v1","updated":"2023-11-22T18:57:24Z","published":"2023-11-22T18:57:24Z","title":"T-Rex: Counting by Visual Prompting","summary":" We introduce T-Rex, an interactive object counting model designed to first\ndetect and then count any objects. We formulate object counting as an open-set\nobject detection task with the integration of visual prompts. Users can specify\nthe objects of interest by marking points or boxes on a reference image, and\nT-Rex then detects all objects with a similar pattern. Guided by the visual\nfeedback from T-Rex, users can also interactively refine the counting results\nby prompting on missing or falsely-detected objects. T-Rex has achieved\nstate-of-the-art performance on several class-agnostic counting benchmarks. 
To\nfurther exploit its potential, we established a new counting benchmark\nencompassing diverse scenarios and challenges. Both quantitative and\nqualitative results show that T-Rex possesses exceptional zero-shot counting\ncapabilities. We also present various practical application scenarios for\nT-Rex, illustrating its potential in the realm of visual prompting.\n","authors":["Qing Jiang","Feng Li","Tianhe Ren","Shilong Liu","Zhaoyang Zeng","Kent Yu","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.13596v1.pdf","comment":"Technical report. Work in progress"},{"id":"http://arxiv.org/abs/2311.12764v2","updated":"2023-11-22T18:52:11Z","published":"2023-11-21T18:18:50Z","title":"Investigating Weight-Perturbed Deep Neural Networks With Application in\n Iris Presentation Attack Detection","summary":" Deep neural networks (DNNs) exhibit superior performance in various machine\nlearning tasks, e.g., image classification, speech recognition, biometric\nrecognition, object detection, etc. However, it is essential to analyze their\nsensitivity to parameter perturbations before deploying them in real-world\napplications. In this work, we assess the sensitivity of DNNs against\nperturbations to their weight and bias parameters. The sensitivity analysis\ninvolves three DNN architectures (VGG, ResNet, and DenseNet), three types of\nparameter perturbations (Gaussian noise, weight zeroing, and weight scaling),\nand two settings (entire network and layer-wise). We perform experiments in the\ncontext of iris presentation attack detection and evaluate on two publicly\navailable datasets: LivDet-Iris-2017 and LivDet-Iris-2020. Based on the\nsensitivity analysis, we propose improved models simply by perturbing\nparameters of the network without undergoing training. We further combine these\nperturbed models at the score-level and at the parameter-level to improve the\nperformance over the original model. The ensemble at the parameter-level shows\nan average improvement of 43.58% on the LivDet-Iris-2017 dataset and 9.25% on\nthe LivDet-Iris-2020 dataset. The source code is available at\nhttps://github.com/redwankarimsony/WeightPerturbation-MSU.\n","authors":["Renu Sharma","Redwan Sony","Arun Ross"],"pdf_url":"https://arxiv.org/pdf/2311.12764v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13574v1","updated":"2023-11-22T18:30:42Z","published":"2023-11-22T18:30:42Z","title":"XAGen: 3D Expressive Human Avatars Generation","summary":" Recent advances in 3D-aware GAN models have enabled the generation of\nrealistic and controllable human body images. However, existing methods focus\non the control of major body joints, neglecting the manipulation of expressive\nattributes, such as facial expressions, jaw poses, hand poses, and so on. In\nthis work, we present XAGen, the first 3D generative model for human avatars\ncapable of expressive control over body, face, and hands. To enhance the\nfidelity of small-scale regions like face and hands, we devise a multi-scale\nand multi-part 3D representation that models fine details. Based on this\nrepresentation, we propose a multi-part rendering technique that disentangles\nthe synthesis of body, face, and hands to ease model training and enhance\ngeometric quality. Furthermore, we design multi-part discriminators that\nevaluate the quality of the generated avatars with respect to their appearance\nand fine-grained control capabilities. Experiments show that XAGen surpasses\nstate-of-the-art methods in terms of realism, diversity, and expressive control\nabilities. 
Code and data will be made available at\nhttps://showlab.github.io/xagen.\n","authors":["Zhongcong Xu","Jianfeng Zhang","Jun Hao Liew","Jiashi Feng","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2311.13574v1.pdf","comment":"Accepted to NeurIPS 2023, Project Page at\n https://showlab.github.io/xagen"},{"id":"http://arxiv.org/abs/2311.13570v1","updated":"2023-11-22T18:25:51Z","published":"2023-11-22T18:25:51Z","title":"WildFusion: Learning 3D-Aware Latent Diffusion Models in View Space","summary":" Modern learning-based approaches to 3D-aware image synthesis achieve high\nphotorealism and 3D-consistent viewpoint changes for the generated images.\nExisting approaches represent instances in a shared canonical space. However,\nfor in-the-wild datasets a shared canonical system can be difficult to define\nor might not even exist. In this work, we instead model instances in view\nspace, alleviating the need for posed images and learned camera distributions.\nWe find that in this setting, existing GAN-based methods are prone to\ngenerating flat geometry and struggle with distribution coverage. We hence\npropose WildFusion, a new approach to 3D-aware image synthesis based on latent\ndiffusion models (LDMs). We first train an autoencoder that infers a compressed\nlatent representation, which additionally captures the images' underlying 3D\nstructure and enables not only reconstruction but also novel view synthesis. To\nlearn a faithful 3D representation, we leverage cues from monocular depth\nprediction. Then, we train a diffusion model in the 3D-aware latent space,\nthereby enabling synthesis of high-quality 3D-consistent image samples,\noutperforming recent state-of-the-art GAN-based methods. Importantly, our\n3D-aware LDM is trained without any direct supervision from multiview images or\n3D geometry and does not require posed images or learned pose or camera\ndistributions. It directly learns a 3D representation without relying on\ncanonical camera coordinates. This opens up promising research avenues for\nscalable 3D-aware image synthesis and 3D content creation from in-the-wild\nimage data. See https://katjaschwarz.github.io/wildfusion for videos of our 3D\nresults.\n","authors":["Katja Schwarz","Seung Wook Kim","Jun Gao","Sanja Fidler","Andreas Geiger","Karsten Kreis"],"pdf_url":"https://arxiv.org/pdf/2311.13570v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13562v1","updated":"2023-11-22T18:15:43Z","published":"2023-11-22T18:15:43Z","title":"Soulstyler: Using Large Language Model to Guide Image Style Transfer for\n Target Object","summary":" Image style transfer occupies an important place in both computer graphics\nand computer vision. However, most current methods require reference to\nstylized images and cannot individually stylize specific objects. To overcome\nthis limitation, we propose the \"Soulstyler\" framework, which allows users to\nguide the stylization of specific objects in an image through simple textual\ndescriptions. We introduce a large language model to parse the text and\nidentify stylization goals and specific styles. Combined with a CLIP-based\nsemantic visual embedding encoder, the model understands and matches text and\nimage content. We also introduce a novel localized text-image block matching\nloss that ensures that style transfer is performed only on specified target\nobjects, while non-target regions remain in their original style. 
Experimental\nresults demonstrate that our model is able to accurately perform style transfer\non target objects according to textual descriptions without affecting the style\nof background regions. Our code will be available at\nhttps://github.com/yisuanwang/Soulstyler.\n","authors":["Junhao Chen","Peng Rong","Jingbo Sun","Chao Li","Xiang Li","Hongwu Lv"],"pdf_url":"https://arxiv.org/pdf/2311.13562v1.pdf","comment":"5 pages,3 figures,ICASSP2024"},{"id":"http://arxiv.org/abs/2311.13559v1","updated":"2023-11-22T18:09:42Z","published":"2023-11-22T18:09:42Z","title":"Transfer Learning-based Real-time Handgun Detection","summary":" Traditional surveillance systems rely on human attention, limiting their\neffectiveness. This study employs convolutional neural networks and transfer\nlearning to develop a real-time computer vision system for automatic handgun\ndetection. Comprehensive analysis of online handgun detection methods is\nconducted, emphasizing reducing false positives and learning time. Transfer\nlearning is demonstrated as an effective approach. Despite technical\nchallenges, the proposed system achieves a precision rate of 84.74%,\ndemonstrating promising performance comparable to related works, enabling\nfaster learning and accurate automatic handgun detection for enhanced security.\nThis research advances security measures by reducing human monitoring\ndependence, showcasing the potential of transfer learning-based approaches for\nefficient and reliable handgun detection.\n","authors":["Youssef Elmir","Sid Ahmed Laouar","Larbi Hamdaoui"],"pdf_url":"https://arxiv.org/pdf/2311.13559v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13549v1","updated":"2023-11-22T17:44:29Z","published":"2023-11-22T17:44:29Z","title":"ADriver-I: A General World Model for Autonomous Driving","summary":" Typically, autonomous driving adopts a modular design, which divides the full\nstack into perception, prediction, planning and control parts. Though\ninterpretable, such modular design tends to introduce a substantial amount of\nredundancy. Recently, multimodal large language models (MLLM) and diffusion\ntechniques have demonstrated their superior performance on comprehension and\ngeneration ability. In this paper, we first introduce the concept of\ninterleaved vision-action pair, which unifies the format of visual features and\ncontrol signals. Based on the vision-action pairs, we construct a general world\nmodel based on MLLM and diffusion model for autonomous driving, termed\nADriver-I. It takes the vision-action pairs as inputs and autoregressively\npredicts the control signal of the current frame. The generated control signals\ntogether with the historical vision-action pairs are further conditioned to\npredict the future frames. With the predicted next frame, ADriver-I performs\nfurther control signal prediction. Such a process can be repeated infinite\ntimes, ADriver-I achieves autonomous driving in the world created by itself.\nExtensive experiments are conducted on nuScenes and our large-scale private\ndatasets. ADriver-I shows impressive performance compared to several\nconstructed baselines. 
We hope our ADriver-I can provide some new insights for\nfuture autonomous driving and embodied intelligence.\n","authors":["Fan Jia","Weixin Mao","Yingfei Liu","Yucheng Zhao","Yuqing Wen","Chi Zhang","Xiangyu Zhang","Tiancai Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13549v1.pdf","comment":"Tech Report"},{"id":"http://arxiv.org/abs/2311.13547v1","updated":"2023-11-22T17:42:33Z","published":"2023-11-22T17:42:33Z","title":"Medical Image Retrieval Using Pretrained Embeddings","summary":" A wide range of imaging techniques and data formats available for medical\nimages make accurate retrieval from image databases challenging.\n Efficient retrieval systems are crucial in advancing medical research,\nenabling large-scale studies and innovative diagnostic tools. Thus, addressing\nthe challenges of medical image retrieval is essential for the continued\nenhancement of healthcare and research.\n In this study, we evaluated the feasibility of employing four\nstate-of-the-art pretrained models for medical image retrieval at modality,\nbody region, and organ levels and compared the results of two similarity\nindexing approaches. Since the employed networks take 2D images, we analyzed\nthe impacts of weighting and sampling strategies to incorporate 3D information\nduring retrieval of 3D volumes. We showed that medical image retrieval is\nfeasible using pretrained networks without any additional training or\nfine-tuning steps. Using pretrained embeddings, we achieved a recall of 1 for\nvarious tasks at modality, body region, and organ level.\n","authors":["Farnaz Khun Jush","Tuan Truong","Steffen Vogler","Matthias Lenga"],"pdf_url":"https://arxiv.org/pdf/2311.13547v1.pdf","comment":"8 pages, 3 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.13535v1","updated":"2023-11-22T17:16:44Z","published":"2023-11-22T17:16:44Z","title":"DiffusionMat: Alpha Matting as Sequential Refinement Learning","summary":" In this paper, we introduce DiffusionMat, a novel image matting framework\nthat employs a diffusion model for the transition from coarse to refined alpha\nmattes. Diverging from conventional methods that utilize trimaps merely as\nloose guidance for alpha matte prediction, our approach treats image matting as\na sequential refinement learning process. This process begins with the addition\nof noise to trimaps and iteratively denoises them using a pre-trained diffusion\nmodel, which incrementally guides the prediction towards a clean alpha matte.\nThe key innovation of our framework is a correction module that adjusts the\noutput at each denoising step, ensuring that the final result is consistent\nwith the input image's structures. We also introduce the Alpha Reliability\nPropagation, a novel technique designed to maximize the utility of available\nguidance by selectively enhancing the trimap regions with confident alpha\ninformation, thus simplifying the correction task. To train the correction\nmodule, we devise specialized loss functions that target the accuracy of the\nalpha matte's edges and the consistency of its opaque and transparent regions.\nWe evaluate our model across several image matting benchmarks, and the results\nindicate that DiffusionMat consistently outperforms existing methods. Project\npage at~\\url{https://cnnlstm.github.io/DiffusionMat\n","authors":["Yangyang Xu","Shengfeng He","Wenqi Shao","Kwan-Yee K. 
Wong","Yu Qiao","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2311.13535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13531v1","updated":"2023-11-22T17:06:57Z","published":"2023-11-22T17:06:57Z","title":"Leveraging CNNs and Ensemble Learning for Automated Disaster Image\n Classification","summary":" Natural disasters act as a serious threat globally, requiring effective and\nefficient disaster management and recovery. This paper focuses on classifying\nnatural disaster images using Convolutional Neural Networks (CNNs). Multiple\nCNN architectures were built and trained on a dataset containing images of\nearthquakes, floods, wildfires, and volcanoes. A stacked CNN ensemble approach\nproved to be the most effective, achieving 95% accuracy and an F1 score going\nup to 0.96 for individual classes. Tuning hyperparameters of individual models\nfor optimization was critical to maximize the models' performance. The stacking\nof CNNs with XGBoost acting as the meta-model utilizes the strengths of the CNN\nand ResNet models to improve the overall accuracy of the classification.\nResults obtained from the models illustrated the potency of CNN-based models\nfor automated disaster image classification. This lays the foundation for\nexpanding these techniques to build robust systems for disaster response,\ndamage assessment, and recovery management.\n","authors":["Archit Rathod","Veer Pariawala","Mokshit Surana","Kumkum Saxena"],"pdf_url":"https://arxiv.org/pdf/2311.13531v1.pdf","comment":"13 pages, 11 figures, 4 tables, ICSISCET 2023 Conference"},{"id":"http://arxiv.org/abs/2311.11772v2","updated":"2023-11-22T17:06:31Z","published":"2023-11-20T13:58:26Z","title":"A Good Feature Extractor Is All You Need for Weakly Supervised Learning\n in Histopathology","summary":" Deep learning is revolutionising pathology, offering novel opportunities in\ndisease prognosis and personalised treatment. Historically, stain normalisation\nhas been a crucial preprocessing step in computational pathology pipelines, and\npersists into the deep learning era. Yet, with the emergence of feature\nextractors trained using self-supervised learning (SSL) on diverse pathology\ndatasets, we call this practice into question. In an empirical evaluation of\npublicly available feature extractors, we find that omitting stain\nnormalisation and image augmentations does not compromise downstream\nperformance, while incurring substantial savings in memory and compute.\nFurther, we show that the top-performing feature extractors are remarkably\nrobust to variations in stain and augmentations like rotation in their latent\nspace. Contrary to previous patch-level benchmarking studies, our approach\nemphasises clinical relevance by focusing on slide-level prediction tasks in a\nweakly supervised setting with external validation cohorts. This work\nrepresents the most comprehensive robustness evaluation of public pathology SSL\nfeature extractors to date, involving more than 6,000 training runs across nine\ntasks, five datasets, three downstream architectures, and various preprocessing\nsetups. Our findings stand to streamline digital pathology workflows by\nminimising preprocessing needs and informing the selection of feature\nextractors.\n","authors":["Georg Wölflein","Dyke Ferber","Asier Rabasco Meneghetti","Omar S. M. El Nahhas","Daniel Truhn","Zunamys I. Carrero","David J. Harrison","Ognjen Arandjelović","Jakob N. 
Kather"],"pdf_url":"https://arxiv.org/pdf/2311.11772v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11284v2","updated":"2023-11-22T16:54:17Z","published":"2023-11-19T09:59:09Z","title":"LucidDreamer: Towards High-Fidelity Text-to-3D Generation via Interval\n Score Matching","summary":" The recent advancements in text-to-3D generation mark a significant milestone\nin generative models, unlocking new possibilities for creating imaginative 3D\nassets across various real-world scenarios. While recent advancements in\ntext-to-3D generation have shown promise, they often fall short in rendering\ndetailed and high-quality 3D models. This problem is especially prevalent as\nmany methods base themselves on Score Distillation Sampling (SDS). This paper\nidentifies a notable deficiency in SDS, that it brings inconsistent and\nlow-quality updating direction for the 3D model, causing the over-smoothing\neffect. To address this, we propose a novel approach called Interval Score\nMatching (ISM). ISM employs deterministic diffusing trajectories and utilizes\ninterval-based score matching to counteract over-smoothing. Furthermore, we\nincorporate 3D Gaussian Splatting into our text-to-3D generation pipeline.\nExtensive experiments show that our model largely outperforms the\nstate-of-the-art in quality and training efficiency.\n","authors":["Yixun Liang","Xin Yang","Jiantao Lin","Haodong Li","Xiaogang Xu","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2311.11284v2.pdf","comment":"The first two authors contributed equally to this work. Our code will\n be available at: https://github.com/EnVision-Research/LucidDreamer"},{"id":"http://arxiv.org/abs/2311.13512v1","updated":"2023-11-22T16:35:43Z","published":"2023-11-22T16:35:43Z","title":"Hybrid Whale-Mud-Ring Optimization for Precise Color Skin Cancer Image\n Segmentation","summary":" Timely identification and treatment of rapidly progressing skin cancers can\nsignificantly contribute to the preservation of patients' health and\nwell-being. Dermoscopy, a dependable and accessible tool, plays a pivotal role\nin the initial stages of skin cancer detection. Consequently, the effective\nprocessing of digital dermoscopy images holds significant importance in\nelevating the accuracy of skin cancer diagnoses. Multilevel thresholding is a\nkey tool in medical imaging that extracts objects within the image to\nfacilitate its analysis. In this paper, an enhanced version of the Mud Ring\nAlgorithm hybridized with the Whale Optimization Algorithm, named WMRA, is\nproposed. The proposed approach utilizes bubble-net attack and mud ring\nstrategy to overcome stagnation in local optima and obtain optimal thresholds.\nThe experimental results show that WMRA is powerful against a cluster of recent\nmethods in terms of fitness, Peak Signal to Noise Ratio (PSNR), and Mean Square\nError (MSE).\n","authors":["Amir Hamza","Badis Lekouaghet","Yassine Himeur"],"pdf_url":"https://arxiv.org/pdf/2311.13512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15755v2","updated":"2023-11-22T16:16:43Z","published":"2023-06-27T19:15:06Z","title":"Adversarial Backdoor Attack by Naturalistic Data Poisoning on Trajectory\n Prediction in Autonomous Driving","summary":" In autonomous driving, behavior prediction is fundamental for safe motion\nplanning, hence the security and robustness of prediction models against\nadversarial attacks are of paramount importance. 
We propose a novel adversarial\nbackdoor attack against trajectory prediction models as a means of studying\ntheir potential vulnerabilities. Our attack affects the victim at training time\nvia naturalistic, hence stealthy, poisoned samples crafted using a novel\ntwo-step approach. First, the triggers are crafted by perturbing the trajectory\nof attacking vehicle and then disguised by transforming the scene using a\nbi-level optimization technique. The proposed attack does not depend on a\nparticular model architecture and operates in a black-box manner, thus can be\neffective without any knowledge of the victim model. We conduct extensive\nempirical studies using state-of-the-art prediction models on two benchmark\ndatasets using metrics customized for trajectory prediction. We show that the\nproposed attack is highly effective, as it can significantly hinder the\nperformance of prediction models, unnoticeable by the victims, and efficient as\nit forces the victim to generate malicious behavior even under constrained\nconditions. Via ablative studies, we analyze the impact of different attack\ndesign choices followed by an evaluation of existing defence mechanisms against\nthe proposed attack.\n","authors":["Mozhgan Pourkeshavarz","Mohammad Sabokrou","Amir Rasouli"],"pdf_url":"https://arxiv.org/pdf/2306.15755v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13485v1","updated":"2023-11-22T16:01:44Z","published":"2023-11-22T16:01:44Z","title":"Deep-learning-based acceleration of MRI for radiotherapy planning of\n pediatric patients with brain tumors","summary":" Magnetic Resonance Imaging (MRI) is a non-invasive diagnostic and\nradiotherapy (RT) planning tool, offering detailed insights into the anatomy of\nthe human body. The extensive scan time is stressful for patients, who must\nremain motionless in a prolonged imaging procedure that prioritizes reduction\nof imaging artifacts. This is challenging for pediatric patients who may\nrequire measures for managing voluntary motions such as anesthesia. Several\ncomputational approaches reduce scan time (fast MRI), by recording fewer\nmeasurements and digitally recovering full information via post-acquisition\nreconstruction. However, most fast MRI approaches were developed for diagnostic\nimaging, without addressing reconstruction challenges specific to RT planning.\nIn this work, we developed a deep learning-based method (DeepMRIRec) for MRI\nreconstruction from undersampled data acquired with RT-specific receiver coil\narrangements. We evaluated our method against fully sampled data of T1-weighted\nMR images acquired from 73 children with brain tumors/surgical beds using loop\nand posterior coils (12 channels), with and without applying virtual\ncompression of coil elements. DeepMRIRec reduced scanning time by a factor of\nfour producing a structural similarity score surpassing the evaluated\nstate-of-the-art method (0.960 vs 0.896), thereby demonstrating its potential\nfor accelerating MRI scanning for RT planning.\n","authors":["Shahinur Alam","Jinsoo Uh","Alexander Dresner","Chia-ho Hua","Khaled Khairy"],"pdf_url":"https://arxiv.org/pdf/2311.13485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03105v2","updated":"2023-11-22T15:46:00Z","published":"2023-11-06T13:54:52Z","title":"Pelvic floor MRI segmentation based on semi-supervised deep learning","summary":" The semantic segmentation of pelvic organs via MRI has important clinical\nsignificance. 
Recently, deep learning-enabled semantic segmentation has\nfacilitated the three-dimensional geometric reconstruction of pelvic floor\norgans, providing clinicians with accurate and intuitive diagnostic results.\nHowever, the task of labeling pelvic floor MRI segmentation, typically\nperformed by clinicians, is labor-intensive and costly, leading to a scarcity\nof labels. Insufficient segmentation labels limit the precise segmentation and\nreconstruction of pelvic floor organs. To address these issues, we propose a\nsemi-supervised framework for pelvic organ segmentation. The implementation of\nthis framework comprises two stages. In the first stage, it performs\nself-supervised pre-training using image restoration tasks. Subsequently,\nfine-tuning of the self-supervised model is performed, using labeled data to\ntrain the segmentation model. In the second stage, the self-supervised\nsegmentation model is used to generate pseudo labels for unlabeled data.\nUltimately, both labeled and unlabeled data are utilized in semi-supervised\ntraining. Upon evaluation, our method significantly enhances the performance in\nthe semantic segmentation and geometric reconstruction of pelvic organs, Dice\ncoefficient can increase by 2.65% averagely. Especially for organs that are\ndifficult to segment, such as the uterus, the accuracy of semantic segmentation\ncan be improved by up to 3.70%.\n","authors":["Jianwei Zuo","Fei Feng","Zhuhui Wang","James A. Ashton-Miller","John O. L. Delancey","Jiajia Luo"],"pdf_url":"https://arxiv.org/pdf/2311.03105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.02673v2","updated":"2023-11-22T15:27:53Z","published":"2021-07-06T15:27:00Z","title":"Attention-based Adversarial Appearance Learning of Augmented Pedestrians","summary":" Synthetic data became already an essential component of machine\nlearning-based perception in the field of autonomous driving. Yet it still\ncannot replace real data completely due to the sim2real domain shift. In this\nwork, we propose a method that leverages the advantages of the augmentation\nprocess and adversarial training to synthesize realistic data for the\npedestrian recognition task. Our approach utilizes an attention mechanism\ndriven by an adversarial loss to learn domain discrepancies and improve\nsim2real adaptation. Our experiments confirm that the proposed adaptation\nmethod is robust to such discrepancies and reveals both visual realism and\nsemantic consistency. Furthermore, we evaluate our data generation pipeline on\nthe task of pedestrian recognition and demonstrate that generated data resemble\nproperties of the real domain.\n","authors":["Kevin Strauss","Artem Savkin","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2107.02673v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02931v3","updated":"2023-11-22T15:23:44Z","published":"2022-12-06T12:40:45Z","title":"Leveraging Different Learning Styles for Improved Knowledge Distillation\n in Biomedical Imaging","summary":" Learning style refers to a type of training mechanism adopted by an\nindividual to gain new knowledge. As suggested by the VARK model, humans have\ndifferent learning preferences, like Visual (V), Auditory (A), Read/Write (R),\nand Kinesthetic (K), for acquiring and effectively processing information. Our\nwork endeavors to leverage this concept of knowledge diversification to improve\nthe performance of model compression techniques like Knowledge Distillation\n(KD) and Mutual Learning (ML). 
Consequently, we use a single-teacher and\ntwo-student network in a unified framework that not only allows for the\ntransfer of knowledge from teacher to students (KD) but also encourages\ncollaborative learning between students (ML). Unlike the conventional approach,\nwhere the teacher shares the same knowledge in the form of predictions or\nfeature representations with the student network, our proposed approach employs\na more diversified strategy by training one student with predictions and the\nother with feature maps from the teacher. We further extend this knowledge\ndiversification by facilitating the exchange of predictions and feature maps\nbetween the two student networks, enriching their learning experiences. We have\nconducted comprehensive experiments with three benchmark datasets for both\nclassification and segmentation tasks using two different network architecture\ncombinations. These experimental results demonstrate that knowledge\ndiversification in a combined KD and ML framework outperforms conventional KD\nor ML techniques (with similar network configuration) that only use predictions\nwith an average improvement of 2%. Furthermore, consistent improvement in\nperformance across different tasks, with various network architectures, and\nover state-of-the-art techniques establishes the robustness and\ngeneralizability of the proposed model\n","authors":["Usma Niyaz","Abhishek Singh Sambyal","Deepti R. Bathula"],"pdf_url":"https://arxiv.org/pdf/2212.02931v3.pdf","comment":"Accepted in Computers in Biology and Medicine"},{"id":"http://arxiv.org/abs/2311.13444v1","updated":"2023-11-22T15:09:59Z","published":"2023-11-22T15:09:59Z","title":"SkeletonGait: Gait Recognition Using Skeleton Maps","summary":" The choice of the representations is essential for deep gait recognition\nmethods. The binary silhouettes and skeletal coordinates are two dominant\nrepresentations in recent literature, achieving remarkable advances in many\nscenarios. However, inherent challenges remain, in which silhouettes are not\nalways guaranteed in unconstrained scenes, and structural cues have not been\nfully utilized from skeletons. In this paper, we introduce a novel skeletal\ngait representation named Skeleton Map, together with SkeletonGait, a\nskeleton-based method to exploit structural information from human skeleton\nmaps. Specifically, the skeleton map represents the coordinates of human joints\nas a heatmap with Gaussian approximation, exhibiting a silhouette-like image\ndevoid of exact body structure. Beyond achieving state-of-the-art performances\nover five popular gait datasets, more importantly, SkeletonGait uncovers novel\ninsights about how important structural features are in describing gait and\nwhen do they play a role. Furthermore, we propose a multi-branch architecture,\nnamed SkeletonGait++, to make use of complementary features from both skeletons\nand silhouettes. Experiments indicate that SkeletonGait++ outperforms existing\nstate-of-the-art methods by a significant margin in various scenarios. For\ninstance, it achieves an impressive rank-1 accuracy of over $85\\%$ on the\nchallenging GREW dataset. 
All the source code will be available at\nhttps://github.com/ShiqiYu/OpenGait.\n","authors":["Chao Fan","Jingzhe Ma","Dongyang Jin","Chuanfu Shen","Shiqi Yu"],"pdf_url":"https://arxiv.org/pdf/2311.13444v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13443v1","updated":"2023-11-22T15:07:59Z","published":"2023-11-22T15:07:59Z","title":"Guided Flows for Generative Modeling and Decision Making","summary":" Classifier-free guidance is a key component for improving the performance of\nconditional generative models for many downstream tasks. It drastically\nimproves the quality of samples produced, but has so far only been used for\ndiffusion models. Flow Matching (FM), an alternative simulation-free approach,\ntrains Continuous Normalizing Flows (CNFs) based on regressing vector fields.\nIt remains an open question whether classifier-free guidance can be performed\nfor Flow Matching models, and to what extent does it improve performance. In\nthis paper, we explore the usage of Guided Flows for a variety of downstream\napplications involving conditional image generation, speech synthesis, and\nreinforcement learning. In particular, we are the first to apply flow models to\nthe offline reinforcement learning setting. We also show that Guided Flows\nsignificantly improves the sample quality in image generation and zero-shot\ntext-to-speech synthesis, and can make use of drastically low amounts of\ncomputation without affecting the agent's overall performance.\n","authors":["Qinqing Zheng","Matt Le","Neta Shaul","Yaron Lipman","Aditya Grover","Ricky T. Q. Chen"],"pdf_url":"https://arxiv.org/pdf/2311.13443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13435v1","updated":"2023-11-22T14:48:30Z","published":"2023-11-22T14:48:30Z","title":"PG-Video-LLaVA: Pixel Grounding Large Video-Language Models","summary":" Extending image-based Large Multimodal Models (LMM) to videos is challenging\ndue to the inherent complexity of video data. The recent approaches extending\nimage-based LMM to videos either lack the grounding capabilities (e.g.,\nVideoChat, Video-ChatGPT, Video-LLaMA) or do not utilize the audio-signals for\nbetter video understanding (e.g., Video-ChatGPT). Addressing these gaps, we\npropose Video-LLaVA, the first LMM with pixel-level grounding capability,\nintegrating audio cues by transcribing them into text to enrich video-context\nunderstanding. Our framework uses an off-the-shelf tracker and a novel\ngrounding module, enabling it to spatially and temporally localize objects in\nvideos following user instructions. We evaluate Video-LLaVA using video-based\ngenerative and question-answering benchmarks and introduce new benchmarks\nspecifically designed to measure prompt-based object grounding performance in\nvideos. Further, we propose the use of Vicuna over GPT-3.5, as utilized in\nVideo-ChatGPT, for video-based conversation benchmarking, ensuring\nreproducibility of results which is a concern with the proprietary nature of\nGPT-3.5. Our framework builds on SoTA image-based LLaVA model and extends its\nadvantages to the video domain, delivering promising gains on video-based\nconversation and grounding tasks. 
Project Page:\nhttps://github.com/mbzuai-oryx/Video-LLaVA\n","authors":["Shehan Munasinghe","Rusiru Thushara","Muhammad Maaz","Hanoona Abdul Rasheed","Salman Khan","Mubarak Shah","Fahad Khan"],"pdf_url":"https://arxiv.org/pdf/2311.13435v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2311.08936v2","updated":"2023-11-22T14:25:55Z","published":"2023-11-15T13:19:02Z","title":"Confident Naturalness Explanation (CNE): A Framework to Explain and\n Assess Patterns Forming Naturalness","summary":" Protected natural areas are regions that have been minimally affected by\nhuman activities such as urbanization, agriculture, and other human\ninterventions. To better understand and map the naturalness of these areas,\nmachine learning models can be used to analyze satellite imagery. Specifically,\nexplainable machine learning methods show promise in uncovering patterns that\ncontribute to the concept of naturalness within these protected environments.\nAdditionally, addressing the uncertainty inherent in machine learning models is\ncrucial for a comprehensive understanding of this concept. However, existing\napproaches have limitations. They either fail to provide explanations that are\nboth valid and objective or struggle to offer a quantitative metric that\naccurately measures the contribution of specific patterns to naturalness, along\nwith the associated confidence. In this paper, we propose a novel framework\ncalled the Confident Naturalness Explanation (CNE) framework. This framework\ncombines explainable machine learning and uncertainty quantification to assess\nand explain naturalness. We introduce a new quantitative metric that describes\nthe confident contribution of patterns to the concept of naturalness.\nFurthermore, we generate an uncertainty-aware segmentation mask for each input\nsample, highlighting areas where the model lacks knowledge. To demonstrate the\neffectiveness of our framework, we apply it to a study site in Fennoscandia\nusing two open-source satellite datasets.\n","authors":["Ahmed Emam","Mohamed Farag","Ribana Roscher"],"pdf_url":"https://arxiv.org/pdf/2311.08936v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13409v1","updated":"2023-11-22T14:13:27Z","published":"2023-11-22T14:13:27Z","title":"CompenHR: Efficient Full Compensation for High-resolution Projector","summary":" Full projector compensation is a practical task of projector-camera systems.\nIt aims to find a projector input image, named compensation image, such that\nwhen projected it cancels the geometric and photometric distortions due to the\nphysical environment and hardware. State-of-the-art methods use deep learning\nto address this problem and show promising performance for low-resolution\nsetups. However, directly applying deep learning to high-resolution setups is\nimpractical due to the long training time and high memory cost. To address this\nissue, this paper proposes a practical full compensation solution. Firstly, we\ndesign an attention-based grid refinement network to improve geometric\ncorrection quality. Secondly, we integrate a novel sampling scheme into an\nend-to-end compensation network to alleviate computation and introduce\nattention blocks to preserve key features. Finally, we construct a benchmark\ndataset for high-resolution projector full compensation. 
In experiments, our\nmethod demonstrates clear advantages in both efficiency and quality.\n","authors":["Yuxi Wang","Haibin Ling","Bingyao Huang"],"pdf_url":"https://arxiv.org/pdf/2311.13409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13404v1","updated":"2023-11-22T14:00:23Z","published":"2023-11-22T14:00:23Z","title":"Animatable 3D Gaussians for High-fidelity Synthesis of Human Motions","summary":" We present a novel animatable 3D Gaussian model for rendering high-fidelity\nfree-view human motions in real time. Compared to existing NeRF-based methods,\nthe model owns better capability in synthesizing high-frequency details without\nthe jittering problem across video frames. The core of our model is a novel\naugmented 3D Gaussian representation, which attaches each Gaussian with a\nlearnable code. The learnable code serves as a pose-dependent appearance\nembedding for refining the erroneous appearance caused by geometric\ntransformation of Gaussians, based on which an appearance refinement model is\nlearned to produce residual Gaussian properties to match the appearance in\ntarget pose. To force the Gaussians to learn the foreground human only without\nbackground interference, we further design a novel alpha loss to explicitly\nconstrain the Gaussians within the human body. We also propose to jointly\noptimize the human joint parameters to improve the appearance accuracy. The\nanimatable 3D Gaussian model can be learned with shallow MLPs, so new human\nmotions can be synthesized in real time (66 fps on avarage). Experiments show\nthat our model has superior performance over NeRF-based methods.\n","authors":["Keyang Ye","Tianjia Shao","Kun Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.13404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02185v4","updated":"2023-11-22T13:56:48Z","published":"2023-09-05T12:42:26Z","title":"BEVTrack: A Simple and Strong Baseline for 3D Single Object Tracking in\n Bird's-Eye View","summary":" 3D Single Object Tracking (SOT) is a fundamental task of computer vision,\nproving essential for applications like autonomous driving. It remains\nchallenging to localize the target from surroundings due to appearance\nvariations, distractors, and the high sparsity of point clouds. The spatial\ninformation indicating objects' spatial adjacency across consecutive frames is\ncrucial for effective object tracking. However, existing trackers typically\nemploy point-wise representation with irregular formats, leading to\ninsufficient use of this important spatial knowledge. As a result, these\ntrackers usually require elaborate designs and solving multiple subtasks. In\nthis paper, we propose BEVTrack, a simple yet effective baseline that performs\ntracking in Bird's-Eye View (BEV). This representation greatly retains spatial\ninformation owing to its ordered structure and inherently encodes the implicit\nmotion relations of the target as well as distractors. To achieve accurate\nregression for targets with diverse attributes (\\textit{e.g.}, sizes and motion\npatterns), BEVTrack constructs the likelihood function with the learned\nunderlying distributions adapted to different targets, rather than making a\nfixed Laplace or Gaussian assumption as in previous works. This provides\nvaluable priors for tracking and thus further boosts performance. 
While only\nusing a single regression loss with a plain convolutional architecture,\nBEVTrack achieves state-of-the-art performance on three large-scale datasets,\nKITTI, NuScenes, and Waymo Open Dataset while maintaining a high inference\nspeed of about 200 FPS. The code will be released at\nhttps://github.com/xmm-prio/BEVTrack.\n","authors":["Yuxiang Yang","Yingqi Deng","Jing Zhang","Jiahao Nie","Zheng-Jun Zha"],"pdf_url":"https://arxiv.org/pdf/2309.02185v4.pdf","comment":"The code will be released at https://github.com/xmm-prio/BEVTrack"},{"id":"http://arxiv.org/abs/2311.13398v1","updated":"2023-11-22T13:53:04Z","published":"2023-11-22T13:53:04Z","title":"Depth-Regularized Optimization for 3D Gaussian Splatting in Few-Shot\n Images","summary":" In this paper, we present a method to optimize Gaussian splatting with a\nlimited number of images while avoiding overfitting. Representing a 3D scene by\ncombining numerous Gaussian splats has yielded outstanding visual quality.\nHowever, it tends to overfit the training views when only a small number of\nimages are available. To address this issue, we introduce a dense depth map as\na geometry guide to mitigate overfitting. We obtained the depth map using a\npre-trained monocular depth estimation model and aligning the scale and offset\nusing sparse COLMAP feature points. The adjusted depth aids in the color-based\noptimization of 3D Gaussian splatting, mitigating floating artifacts, and\nensuring adherence to geometric constraints. We verify the proposed method on\nthe NeRF-LLFF dataset with varying numbers of few images. Our approach\ndemonstrates robust geometry compared to the original method that relies solely\non images.\n","authors":["Jaeyoung Chung","Jeongtaek Oh","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2311.13398v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.13385v1","updated":"2023-11-22T13:27:36Z","published":"2023-11-22T13:27:36Z","title":"SegVol: Universal and Interactive Volumetric Medical Image Segmentation","summary":" Precise image segmentation provides clinical study with meaningful and\nwell-structured information. Despite the remarkable progress achieved in\nmedical image segmentation, there is still an absence of foundation\nsegmentation model that can segment a wide range of anatomical categories with\neasy user interaction. In this paper, we propose a universal and interactive\nvolumetric medical image segmentation model, named SegVol. By training on 90k\nunlabeled Computed Tomography (CT) volumes and 6k labeled CTs, this foundation\nmodel supports the segmentation of over 200 anatomical categories using\nsemantic and spatial prompts. Extensive experiments verify that SegVol\noutperforms the state of the art by a large margin on multiple segmentation\nbenchmarks. Notably, on three challenging lesion datasets, our method achieves\naround 20% higher Dice score than nnU-Net. The model and data are publicly\navailable at: https://github.com/BAAI-DCAI/SegVol.\n","authors":["Yuxin Du","Fan Bai","Tiejun Huang","Bo Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.13385v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13384v1","updated":"2023-11-22T13:27:34Z","published":"2023-11-22T13:27:34Z","title":"LucidDreamer: Domain-free Generation of 3D Gaussian Splatting Scenes","summary":" With the widespread usage of VR devices and contents, demands for 3D scene\ngeneration techniques become more popular. 
Existing 3D scene generation models,\nhowever, limit the target scene to specific domain, primarily due to their\ntraining strategies using 3D scan dataset that is far from the real-world. To\naddress such limitation, we propose LucidDreamer, a domain-free scene\ngeneration pipeline by fully leveraging the power of existing large-scale\ndiffusion-based generative model. Our LucidDreamer has two alternate steps:\nDreaming and Alignment. First, to generate multi-view consistent images from\ninputs, we set the point cloud as a geometrical guideline for each image\ngeneration. Specifically, we project a portion of point cloud to the desired\nview and provide the projection as a guidance for inpainting using the\ngenerative model. The inpainted images are lifted to 3D space with estimated\ndepth maps, composing a new points. Second, to aggregate the new points into\nthe 3D scene, we propose an aligning algorithm which harmoniously integrates\nthe portions of newly generated 3D scenes. The finally obtained 3D scene serves\nas initial points for optimizing Gaussian splats. LucidDreamer produces\nGaussian splats that are highly-detailed compared to the previous 3D scene\ngeneration methods, with no constraint on domain of the target scene.\n","authors":["Jaeyoung Chung","Suyoung Lee","Hyeongjin Nam","Jaerin Lee","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2311.13384v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13378v1","updated":"2023-11-22T13:19:41Z","published":"2023-11-22T13:19:41Z","title":"Point Projection Mapping System for Tracking, Registering, Labeling and\n Validating Optical Tissue Measurements","summary":" Validation of newly developed optical tissue sensing techniques for tumor\ndetection during cancer surgery requires an accurate correlation with\nhistological results. Additionally, such accurate correlation facilitates\nprecise data labeling for developing high-performance machine-learning tissue\nclassification models. In this paper, a newly developed Point Projection\nMapping system will be introduced, which allows non-destructive tracking of the\nmeasurement locations on tissue specimens. Additionally, a framework for\naccurate registration, validation, and labeling with histopathology results is\nproposed and validated on a case study. The proposed framework provides a more\nrobust and accurate method for tracking and validation of optical tissue\nsensing techniques, which saves time and resources compared to conventional\ntechniques available.\n","authors":["Lianne Feenstra","Stefan D. van der Stel","Marcos Da Silva Guimaraes","Theo J. M Ruers","Behdad Dashtbozorg"],"pdf_url":"https://arxiv.org/pdf/2311.13378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13372v1","updated":"2023-11-22T13:13:19Z","published":"2023-11-22T13:13:19Z","title":"MRGazer: Decoding Eye Gaze Points from Functional Magnetic Resonance\n Imaging in Individual Space","summary":" Eye-tracking research has proven valuable in understanding numerous cognitive\nfunctions. Recently, Frey et al. provided an exciting deep learning method for\nlearning eye movements from fMRI data. However, it needed to co-register fMRI\ninto standard space to obtain eyeballs masks, and thus required additional\ntemplates and was time consuming. To resolve this issue, in this paper, we\npropose a framework named MRGazer for predicting eye gaze points from fMRI in\nindividual space. The MRGazer consisted of eyeballs extraction module and a\nresidual network-based eye gaze prediction. 
Compared to the previous method,\nthe proposed framework skips the fMRI co-registration step, simplifies the\nprocessing protocol and achieves end-to-end eye gaze regression. The proposed\nmethod achieved superior performance in a variety of eye movement tasks than\nthe co-registration-based method, and delivered objective results within a\nshorter time (~ 0.02 Seconds for each volume) than prior method (~0.3 Seconds\nfor each volume).\n","authors":["Xiuwen Wu","Rongjie Hu","Jie Liang","Yanming Wang","Bensheng Qiu","Xiaoxiao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11317v2","updated":"2023-11-22T12:52:41Z","published":"2023-11-19T13:07:06Z","title":"Discrete approximations of Gaussian smoothing and Gaussian derivatives","summary":" This paper develops an in-depth treatment concerning the problem of\napproximating the Gaussian smoothing and Gaussian derivative computations in\nscale-space theory for application on discrete data. With close connections to\nprevious axiomatic treatments of continuous and discrete scale-space theory, we\nconsider three main ways discretizing these scale-space operations in terms of\nexplicit discrete convolutions, based on either (i) sampling the Gaussian\nkernels and the Gaussian derivative kernels, (ii) locally integrating the\nGaussian kernels and the Gaussian derivative kernels over each pixel support\nregion and (iii) basing the scale-space analysis on the discrete analogue of\nthe Gaussian kernel, and then computing derivative approximations by applying\nsmall-support central difference operators to the spatially smoothed image\ndata.\n We study the properties of these three main discretization methods both\ntheoretically and experimentally, and characterize their performance by\nquantitative measures, including the results they give rise to with respect to\nthe task of scale selection, investigated for four different use cases, and\nwith emphasis on the behaviour at fine scales. The results show that the\nsampled Gaussian kernels and derivatives as well as the integrated Gaussian\nkernels and derivatives perform very poorly at very fine scales. At very fine\nscales, the discrete analogue of the Gaussian kernel with its corresponding\ndiscrete derivative approximations performs substantially better. The sampled\nGaussian kernel and the sampled Gaussian derivatives do, on the other hand,\nlead to numerically very good approximations of the corresponding continuous\nresults, when the scale parameter is sufficiently large, in the experiments\npresented in the paper, when the scale parameter is greater than a value of\nabout 1, in units of the grid spacing.\n","authors":["Tony Lindeberg"],"pdf_url":"https://arxiv.org/pdf/2311.11317v2.pdf","comment":"38 pages, 34 figures"},{"id":"http://arxiv.org/abs/2311.13355v1","updated":"2023-11-22T12:47:12Z","published":"2023-11-22T12:47:12Z","title":"Unified Classification and Rejection: A One-versus-All Framework","summary":" Classifying patterns of known classes and rejecting ambiguous and novel (also\ncalled as out-of-distribution (OOD)) inputs are involved in open world pattern\nrecognition. Deep neural network models usually excel in closed-set\nclassification while performing poorly in rejecting OOD. To tackle this\nproblem, numerous methods have been designed to perform open set recognition\n(OSR) or OOD rejection/detection tasks. 
Previous methods mostly take\npost-training score transformation or hybrid models to ensure low scores on OOD\ninputs while separating known classes. In this paper, we attempt to build a\nunified framework for building open set classifiers for both classification and\nOOD rejection. We formulate the open set recognition of $ K $-known-class as a\n$ (K + 1) $-class classification problem with model trained on known-class\nsamples only. By decomposing the $ K $-class problem into $ K $ one-versus-all\n(OVA) binary classification tasks and binding some parameters, we show that\ncombining the scores of OVA classifiers can give $ (K + 1) $-class posterior\nprobabilities, which enables classification and OOD rejection in a unified\nframework. To maintain the closed-set classification accuracy of the OVA\ntrained classifier, we propose a hybrid training strategy combining OVA loss\nand multi-class cross-entropy loss. We implement the OVA framework and hybrid\ntraining strategy on the recently proposed convolutional prototype network.\nExperiments on popular OSR and OOD detection datasets demonstrate that the\nproposed framework, using a single multi-class classifier, yields competitive\nperformance in closed-set classification, OOD detection, and misclassification\ndetection.\n","authors":["Zhen Cheng","Xu-Yao Zhang","Cheng-Lin Liu"],"pdf_url":"https://arxiv.org/pdf/2311.13355v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04225v2","updated":"2023-11-22T12:35:08Z","published":"2023-06-07T08:02:17Z","title":"Efficient Vision Transformer for Human Pose Estimation via Patch\n Selection","summary":" While Convolutional Neural Networks (CNNs) have been widely successful in 2D\nhuman pose estimation, Vision Transformers (ViTs) have emerged as a promising\nalternative to CNNs, boosting state-of-the-art performance. However, the\nquadratic computational complexity of ViTs has limited their applicability for\nprocessing high-resolution images. In this paper, we propose three methods for\nreducing ViT's computational complexity, which are based on selecting and\nprocessing a small number of most informative patches while disregarding\nothers. The first two methods leverage a lightweight pose estimation network to\nguide the patch selection process, while the third method utilizes a set of\nlearnable joint tokens to ensure that the selected patches contain the most\nimportant information about body joints. Experiments across six benchmarks show\nthat our proposed methods achieve a significant reduction in computational\ncomplexity, ranging from 30% to 44%, with only a minimal drop in accuracy\nbetween 0% and 3.5%.\n","authors":["Kaleab A. Kinfu","Rene Vidal"],"pdf_url":"https://arxiv.org/pdf/2306.04225v2.pdf","comment":"BMVC 2023 Oral Paper: https://proceedings.bmvc2023.org/167/"},{"id":"http://arxiv.org/abs/2211.14605v2","updated":"2023-11-22T12:16:28Z","published":"2022-11-26T16:13:32Z","title":"Looking at the posterior: accuracy and uncertainty of neural-network\n predictions","summary":" Bayesian inference can quantify uncertainty in the predictions of neural\nnetworks using posterior distributions for model parameters and network output.\nBy looking at these posterior distributions, one can separate the origin of\nuncertainty into aleatoric and epistemic contributions. One goal of uncertainty\nquantification is to inform on prediction accuracy. 
Here we show that\nprediction accuracy depends on both epistemic and aleatoric uncertainty in an\nintricate fashion that cannot be understood in terms of marginalized\nuncertainty distributions alone. How the accuracy relates to epistemic and\naleatoric uncertainties depends not only on the model architecture, but also on\nthe properties of the dataset. We discuss the significance of these results for\nactive learning and introduce a novel acquisition function that outperforms\ncommon uncertainty-based methods. To arrive at our results, we approximated the\nposteriors using deep ensembles, for fully-connected, convolutional and\nattention-based neural networks.\n","authors":["H. Linander","O. Balabanov","H. Yang","B. Mehlig"],"pdf_url":"https://arxiv.org/pdf/2211.14605v2.pdf","comment":"26 pages, 10 figures, 5 tables"},{"id":"http://arxiv.org/abs/2311.13338v1","updated":"2023-11-22T12:03:33Z","published":"2023-11-22T12:03:33Z","title":"High-Quality Face Caricature via Style Translation","summary":" Caricature is an exaggerated form of artistic portraiture that accentuates\nunique yet subtle characteristics of human faces. Recently, advancements in\ndeep end-to-end techniques have yielded encouraging outcomes in capturing both\nstyle and elevated exaggerations in creating face caricatures. Most of these\napproaches tend to produce cartoon-like results that could be more practical\nfor real-world applications. In this study, we proposed a high-quality,\nunpaired face caricature method that is appropriate for use in the real world\nand uses computer vision techniques and GAN models. We attain the exaggeration\nof facial features and the stylization of appearance through a two-step\nprocess: Face caricature generation and face caricature projection. The face\ncaricature generation step creates new caricature face datasets from real\nimages and trains a generative model using the real and newly created\ncaricature datasets. The Face caricature projection employs an encoder trained\nwith real and caricature faces with the pretrained generator to project real\nand caricature faces. We perform an incremental facial exaggeration from the\nreal image to the caricature faces using the encoder and generator's latent\nspace. Our projection preserves the facial identity, attributes, and\nexpressions from the input image. Also, it accounts for facial occlusions, such\nas reading glasses or sunglasses, to enhance the robustness of our model.\nFurthermore, we conducted a comprehensive comparison of our approach with\nvarious state-of-the-art face caricature methods, highlighting our process's\ndistinctiveness and exceptional realism.\n","authors":["Lamyanba Laishram","Muhammad Shaheryar","Jong Taek Lee","Soon Ki Jung"],"pdf_url":"https://arxiv.org/pdf/2311.13338v1.pdf","comment":"14 pages, 21 figures"},{"id":"http://arxiv.org/abs/2311.13335v1","updated":"2023-11-22T11:55:41Z","published":"2023-11-22T11:55:41Z","title":"Quantum learning and essential cognition under the traction of\n meta-characteristics in an open world","summary":" Artificial intelligence has made significant progress in the Close World\nproblem, being able to accurately recognize old knowledge through training and\nclassification. However, AI faces significant challenges in the Open World\nproblem, as it involves a new and unknown exploration journey. AI is not\ninherently proactive in exploration, and its challenge lies in not knowing how\nto approach and adapt to the unknown world. How do humans acquire knowledge of\nthe unknown world. 
Humans identify new knowledge through intrinsic cognition.\nIn the process of recognizing new colors, the cognitive cues are different from\nknown color features and involve hue, saturation, brightness, and other\ncharacteristics. When AI encounters objects with different features in the new\nworld, it faces another challenge: where are the distinguishing features\nbetween influential features of new and old objects? AI often mistakes a new\nworld's brown bear for a known dog because it has not learned the differences\nin feature distributions between knowledge systems. This is because things in\nthe new and old worlds have different units and dimensions for their features.\nThis paper proposes an open-world model and elemental feature system that\nfocuses on fundamentally recognizing the distribution differences in objective\nfeatures between the new and old worlds. The quantum tunneling effect of\nlearning ability in the new and old worlds is realized through the tractive\nforce of meta-characteristic. The outstanding performance of the model system\nin learning new knowledge (using pedestrian re-identification datasets as an\nexample) demonstrates that AI has acquired the ability to recognize the new\nworld with an accuracy of $96.71\\%$ at most and has gained the capability to\nexplore new knowledge, similar to humans.\n","authors":["Jin Wang","Changlin Song"],"pdf_url":"https://arxiv.org/pdf/2311.13335v1.pdf","comment":"8 pages,5 pages"},{"id":"http://arxiv.org/abs/2311.12437v2","updated":"2023-11-22T11:38:46Z","published":"2023-11-21T08:47:08Z","title":"Learning Site-specific Styles for Multi-institutional Unsupervised\n Cross-modality Domain Adaptation","summary":" Unsupervised cross-modality domain adaptation is a challenging task in\nmedical image analysis, and it becomes more challenging when source and target\ndomain data are collected from multiple institutions. In this paper, we present\nour solution to tackle the multi-institutional unsupervised domain adaptation\nfor the crossMoDA 2023 challenge. First, we perform unpaired image translation\nto translate the source domain images to the target domain, where we design a\ndynamic network to generate synthetic target domain images with controllable,\nsite-specific styles. Afterwards, we train a segmentation model using the\nsynthetic images and further reduce the domain gap by self-training. Our\nsolution achieved the 1st place during both the validation and testing phases\nof the challenge. The code repository is publicly available at\nhttps://github.com/MedICL-VU/crossmoda2023.\n","authors":["Han Liu","Yubo Fan","Zhoubing Xu","Benoit M. Dawant","Ipek Oguz"],"pdf_url":"https://arxiv.org/pdf/2311.12437v2.pdf","comment":"crossMoDA 2023 challenge 1st place solution"},{"id":"http://arxiv.org/abs/2201.04819v2","updated":"2023-11-22T11:32:46Z","published":"2022-01-13T07:25:06Z","title":"Deep Rank-Consistent Pyramid Model for Enhanced Crowd Counting","summary":" Most conventional crowd counting methods utilize a fully-supervised learning\nframework to establish a mapping between scene images and crowd density maps.\nThey usually rely on a large quantity of costly and time-intensive pixel-level\nannotations for training supervision. One way to mitigate the intensive\nlabeling effort and improve counting accuracy is to leverage large amounts of\nunlabeled images. 
This is attributed to the inherent self-structural\ninformation and rank consistency within a single image, offering additional\nqualitative relation supervision during training. Contrary to earlier methods\nthat utilized the rank relations at the original image level, we explore such\nrank-consistency relation within the latent feature spaces. This approach\nenables the incorporation of numerous pyramid partial orders, strengthening the\nmodel representation capability. A notable advantage is that it can also\nincrease the utilization ratio of unlabeled samples. Specifically, we propose a\nDeep Rank-consistEnt pyrAmid Model (DREAM), which makes full use of rank\nconsistency across coarse-to-fine pyramid features in latent spaces for\nenhanced crowd counting with massive unlabeled images. In addition, we have\ncollected a new unlabeled crowd counting dataset, FUDAN-UCC, comprising 4,000\nimages for training purposes. Extensive experiments on four benchmark datasets,\nnamely UCF-QNRF, ShanghaiTech PartA and PartB, and UCF-CC-50, show the\neffectiveness of our method compared with previous semi-supervised methods. The\ncodes are available at https://github.com/bridgeqiqi/DREAM.\n","authors":["Jiaqi Gao","Zhizhong Huang","Yiming Lei","Hongming Shan","James Z. Wang","Fei-Yue Wang","Junping Zhang"],"pdf_url":"https://arxiv.org/pdf/2201.04819v2.pdf","comment":"Accepted by IEEE Transactions on Neural Networks and Learning Systems"},{"id":"http://arxiv.org/abs/2311.13321v1","updated":"2023-11-22T11:24:04Z","published":"2023-11-22T11:24:04Z","title":"Revisiting Supervision for Continual Representation Learning","summary":" In the field of continual learning, models are designed to learn tasks one\nafter the other. While most research has centered on supervised continual\nlearning, recent studies have highlighted the strengths of self-supervised\ncontinual representation learning. The improved transferability of\nrepresentations built with self-supervised methods is often associated with the\nrole played by the multi-layer perceptron projector. In this work, we depart\nfrom this observation and reexamine the role of supervision in continual\nrepresentation learning. We reckon that additional information, such as human\nannotations, should not deteriorate the quality of representations. Our\nfindings show that supervised models when enhanced with a multi-layer\nperceptron head, can outperform self-supervised models in continual\nrepresentation learning.\n","authors":["Daniel Marczak","Sebastian Cygert","Tomasz Trzciński","Bartłomiej Twardowski"],"pdf_url":"https://arxiv.org/pdf/2311.13321v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13319v1","updated":"2023-11-22T11:15:38Z","published":"2023-11-22T11:15:38Z","title":"Deep Learning for Vascular Segmentation and Applications in Phase\n Contrast Tomography Imaging","summary":" Automated blood vessel segmentation is vital for biomedical imaging, as\nvessel changes indicate many pathologies. Still, precise segmentation is\ndifficult due to the complexity of vascular structures, anatomical variations\nacross patients, the scarcity of annotated public datasets, and the quality of\nimages. We present a thorough literature review, highlighting the state of\nmachine learning techniques across diverse organs. Our goal is to provide a\nfoundation on the topic and identify a robust baseline model for application to\nvascular segmentation in a new imaging modality, Hierarchical Phase Contrast\nTomography (HiP CT). 
Introduced in 2020 at the European Synchrotron Radiation\nFacility, HiP CT enables 3D imaging of complete organs at an unprecedented\nresolution of ca. 20mm per voxel, with the capability for localized zooms in\nselected regions down to 1mm per voxel without sectioning. We have created a\ntraining dataset with double annotator validated vascular data from three\nkidneys imaged with HiP CT in the context of the Human Organ Atlas Project.\nFinally, utilising the nnU Net model, we conduct experiments to assess the\nmodels performance on both familiar and unseen samples, employing vessel\nspecific metrics. Our results show that while segmentations yielded reasonably\nhigh scores such as clDice values ranging from 0.82 to 0.88, certain errors\npersisted. Large vessels that collapsed due to the lack of hydrostatic pressure\n(HiP CT is an ex vivo technique) were segmented poorly. Moreover, decreased\nconnectivity in finer vessels and higher segmentation errors at vessel\nboundaries were observed. Such errors obstruct the understanding of the\nstructures by interrupting vascular tree connectivity. Through our review and\noutputs, we aim to set a benchmark for subsequent model evaluations using\nvarious modalities, especially with the HiP CT imaging database.\n","authors":["Ekin Yagis","Shahab Aslani","Yashvardhan Jain","Yang Zhou","Shahrokh Rahmani","Joseph Brunet","Alexandre Bellier","Christopher Werlein","Maximilian Ackermann","Danny Jonigk","Paul Tafforeau","Peter D Lee","Claire Walsh"],"pdf_url":"https://arxiv.org/pdf/2311.13319v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13317v1","updated":"2023-11-22T11:10:45Z","published":"2023-11-22T11:10:45Z","title":"Recognition-Guided Diffusion Model for Scene Text Image Super-Resolution","summary":" Scene Text Image Super-Resolution (STISR) aims to enhance the resolution and\nlegibility of text within low-resolution (LR) images, consequently elevating\nrecognition accuracy in Scene Text Recognition (STR). Previous methods\npredominantly employ discriminative Convolutional Neural Networks (CNNs)\naugmented with diverse forms of text guidance to address this issue.\nNevertheless, they remain deficient when confronted with severely blurred\nimages, due to their insufficient generation capability when little structural\nor semantic information can be extracted from original images. Therefore, we\nintroduce RGDiffSR, a Recognition-Guided Diffusion model for scene text image\nSuper-Resolution, which exhibits great generative diversity and fidelity even\nin challenging scenarios. Moreover, we propose a Recognition-Guided Denoising\nNetwork, to guide the diffusion model generating LR-consistent results through\nsuccinct semantic guidance. Experiments on the TextZoom dataset demonstrate the\nsuperiority of RGDiffSR over prior state-of-the-art methods in both text\nrecognition accuracy and image fidelity.\n","authors":["Yuxuan Zhou","Liangcai Gao","Zhi Tang","Baole Wei"],"pdf_url":"https://arxiv.org/pdf/2311.13317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00846v2","updated":"2023-11-22T11:02:35Z","published":"2023-09-02T07:13:47Z","title":"pSTarC: Pseudo Source Guided Target Clustering for Fully Test-Time\n Adaptation","summary":" Test Time Adaptation (TTA) is a pivotal concept in machine learning, enabling\nmodels to perform well in real-world scenarios, where test data distribution\ndiffers from training. 
In this work, we propose a novel approach called pseudo\nSource guided Target Clustering (pSTarC) addressing the relatively unexplored\narea of TTA under real-world domain shifts. This method draws inspiration from\ntarget clustering techniques and exploits the source classifier for generating\npseudo-source samples. The test samples are strategically aligned with these\npseudo-source samples, facilitating their clustering and thereby enhancing TTA\nperformance. pSTarC operates solely within the fully test-time adaptation\nprotocol, removing the need for actual source data. Experimental validation on\na variety of domain shift datasets, namely VisDA, Office-Home, DomainNet-126,\nCIFAR-100C verifies pSTarC's effectiveness. This method exhibits significant\nimprovements in prediction accuracy along with efficient computational\nrequirements. Furthermore, we also demonstrate the universality of the pSTarC\nframework by showing its effectiveness for the continuous TTA framework. The\nsource code for our method is available at https://manogna-s.github.io/pstarc\n","authors":["Manogna Sreenivas","Goirik Chakrabarty","Soma Biswas"],"pdf_url":"https://arxiv.org/pdf/2309.00846v2.pdf","comment":"Accepted in WACV 2024"},{"id":"http://arxiv.org/abs/2311.13307v1","updated":"2023-11-22T10:55:36Z","published":"2023-11-22T10:55:36Z","title":"Rethinking Radiology Report Generation via Causal Reasoning and\n Counterfactual Augmentation","summary":" Radiology Report Generation (RRG) draws attention as an interaction between\nvision and language fields. Previous works inherited the ideology of\nvision-to-language generation tasks,aiming to generate paragraphs with high\nconsistency as reports. However, one unique characteristic of RRG, the\nindependence between diseases, was neglected, leading to the injection of the\nspurious confounder, i.e., the disease co-occurrence. Unfortunately, this\nconfounder confuses the process of report generation worse because of the\nbiased RRG data distribution. In this paper, to rethink this issue thoroughly,\nwe reason about its causes and effects from a novel perspective of statistics\nand causality, where the Joint Vision Coupling and the Conditional Sentence\nCoherence Coupling are two aspects prone to implicitly decrease the accuracy of\nreports. Then, a counterfactual augmentation strategy that contains the\nCounterfactual Sample Synthesis and the Counterfactual Report Reconstruction\nsub-methods is proposed to break these two aspects of spurious effects.\nExperimental results and further analyses on two widely used datasets justify\nour reasoning and proposed methods.\n","authors":["Xiao Song","Jiafan Liu","Yun Li","Wenbin Lei","Ruxin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13307v1.pdf","comment":"10 pages,5 figures"},{"id":"http://arxiv.org/abs/2311.13297v1","updated":"2023-11-22T10:27:19Z","published":"2023-11-22T10:27:19Z","title":"Retargeting Visual Data with Deformation Fields","summary":" Seam carving is an image editing method that enable content-aware resizing,\nincluding operations like removing objects. However, the seam-finding strategy\nbased on dynamic programming or graph-cut limits its applications to broader\nvisual data formats and degrees of freedom for editing. Our observation is that\ndescribing the editing and retargeting of images more generally by a\ndisplacement field yields a generalisation of content-aware deformations. 
We\npropose to learn a deformation with a neural network that keeps the output\nplausible while trying to deform it only in places with low information\ncontent. This technique applies to different kinds of visual data, including\nimages, 3D scenes given as neural radiance fields, or even polygon meshes.\nExperiments conducted on different visual data show that our method achieves\nbetter content-aware retargeting compared to previous methods.\n","authors":["Tim Elsner","Julia Berger","Tong Wu","Victor Czech","Lin Gao","Leif Kobbelt"],"pdf_url":"https://arxiv.org/pdf/2311.13297v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13289v2","updated":"2023-11-22T10:14:04Z","published":"2023-09-23T07:08:57Z","title":"USL-Net: Uncertainty Self-Learning Network for Unsupervised Skin Lesion\n Segmentation","summary":" Unsupervised skin lesion segmentation offers several benefits, including\nconserving expert human resources, reducing discrepancies due to subjective\nhuman labeling, and adapting to novel environments. However, segmenting\ndermoscopic images without manual labeling guidance presents significant\nchallenges due to dermoscopic image artifacts such as hair noise, blister\nnoise, and subtle edge differences. To address these challenges, we introduce\nan innovative Uncertainty Self-Learning Network (USL-Net) designed for skin\nlesion segmentation. The USL-Net can effectively segment a range of lesions,\neliminating the need for manual labeling guidance. Initially, features are\nextracted using contrastive learning, followed by the generation of Class\nActivation Maps (CAMs) as saliency maps using these features. The different CAM\nlocations correspond to the importance of the lesion region based on their\nsaliency. High-saliency regions in the map serve as pseudo-labels for lesion\nregions while low-saliency regions represent the background. However,\nintermediate regions can be hard to classify, often due to their proximity to\nlesion edges or interference from hair or blisters. Rather than risk potential\npseudo-labeling errors or learning confusion by forcefully classifying these\nregions, we consider them as uncertainty regions, exempting them from\npseudo-labeling and allowing the network to self-learn. Further, we employ\nconnectivity detection and centrality detection to refine foreground\npseudo-labels and reduce noise-induced errors. The application of cycle\nrefining enhances performance further. Our method underwent thorough\nexperimental validation on the ISIC-2017, ISIC-2018, and PH2 datasets,\ndemonstrating that its performance is on par with weakly supervised and\nsupervised methods, and exceeds that of other existing unsupervised methods.\n","authors":["Xiaofan Li","Bo Peng","Jie Hu","Changyou Ma","Daipeng Yang","Zhuyang Xie"],"pdf_url":"https://arxiv.org/pdf/2309.13289v2.pdf","comment":"14 pages, 9 figures, 71 references"},{"id":"http://arxiv.org/abs/2308.10631v3","updated":"2023-11-22T09:53:36Z","published":"2023-08-21T11:06:43Z","title":"PsyMo: A Dataset for Estimating Self-Reported Psychological Traits from\n Gait","summary":" Psychological trait estimation from external factors such as movement and\nappearance is a challenging and long-standing problem in psychology, and is\nprincipally based on the psychological theory of embodiment. To date, attempts\nto tackle this problem have utilized private small-scale datasets with\nintrusive body-attached sensors. 
Potential applications of an automated system\nfor psychological trait estimation include estimation of occupational fatigue\nand psychology, and marketing and advertisement. In this work, we propose PsyMo\n(Psychological traits from Motion), a novel, multi-purpose and multi-modal\ndataset for exploring psychological cues manifested in walking patterns. We\ngathered walking sequences from 312 subjects in 7 different walking variations\nand 6 camera angles. In conjunction with walking sequences, participants filled\nin 6 psychological questionnaires, totalling 17 psychometric attributes related\nto personality, self-esteem, fatigue, aggressiveness and mental health. We\npropose two evaluation protocols for psychological trait estimation. Alongside\nthe estimation of self-reported psychological traits from gait, the dataset can\nbe used as a drop-in replacement to benchmark methods for gait recognition. We\nanonymize all cues related to the identity of the subjects and publicly release\nonly silhouettes, 2D / 3D human skeletons and 3D SMPL human meshes.\n","authors":["Adrian Cosma","Emilian Radoi"],"pdf_url":"https://arxiv.org/pdf/2308.10631v3.pdf","comment":"Accepted at 2024 IEEE/CVF Winter Conference on Applications of\n Computer Vision (WACV)"},{"id":"http://arxiv.org/abs/2311.13267v1","updated":"2023-11-22T09:37:33Z","published":"2023-11-22T09:37:33Z","title":"FedFN: Feature Normalization for Alleviating Data Heterogeneity Problem\n in Federated Learning","summary":" Federated Learning (FL) is a collaborative method for training models while\npreserving data privacy in decentralized settings. However, FL encounters\nchallenges related to data heterogeneity, which can result in performance\ndegradation. In our study, we observe that as data heterogeneity increases,\nfeature representation in the FedAVG model deteriorates more significantly\ncompared to classifier weight. Additionally, we observe that as data\nheterogeneity increases, the gap between higher feature norms for observed\nclasses, obtained from local models, and feature norms of unobserved classes\nwidens, in contrast to the behavior of classifier weight norms. This widening\ngap extends to encompass the feature norm disparities between local and the\nglobal models. To address these issues, we introduce Federated Averaging with\nFeature Normalization Update (FedFN), a straightforward learning method. We\ndemonstrate the superior performance of FedFN through extensive experiments,\neven when applied to pretrained ResNet18. Subsequently, we confirm the\napplicability of FedFN to foundation models.\n","authors":["Seongyoon Kim","Gihun Lee","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2311.13267v1.pdf","comment":"NeurIPS Workshop: \"Federated Learning in the Age of Foundation\n Models\" 2023"},{"id":"http://arxiv.org/abs/2311.13263v1","updated":"2023-11-22T09:27:46Z","published":"2023-11-22T09:27:46Z","title":"CMFDFormer: Transformer-based Copy-Move Forgery Detection with Continual\n Learning","summary":" Copy-move forgery detection aims at detecting duplicated regions in a\nsuspected forged image, and deep learning based copy-move forgery detection\nmethods are in the ascendant. These deep learning based methods heavily rely on\nsynthetic training data, and the performance will degrade when facing new\ntasks. 
In this paper, we propose a Transformer-style copy-move forgery\ndetection network named as CMFDFormer, and provide a novel PCSD (Pooled Cube\nand Strip Distillation) continual learning framework to help CMFDFormer handle\nnew tasks. CMFDFormer consists of a MiT (Mix Transformer) backbone network and\na PHD (Pluggable Hybrid Decoder) mask prediction network. The MiT backbone\nnetwork is a Transformer-style network which is adopted on the basis of\ncomprehensive analyses with CNN-style and MLP-style backbones. The PHD network\nis constructed based on self-correlation computation, hierarchical feature\nintegration, a multi-scale cycle fully-connected block and a mask\nreconstruction block. The PHD network is applicable to feature extractors of\ndifferent styles for hierarchical multi-scale information extraction, achieving\ncomparable performance. Last but not least, we propose a PCSD continual\nlearning framework to improve the forgery detectability and avoid catastrophic\nforgetting when handling new tasks. Our continual learning framework restricts\nintermediate features from the PHD network, and takes advantage of both cube\npooling and strip pooling. Extensive experiments on publicly available datasets\ndemonstrate the good performance of CMFDFormer and the effectiveness of the\nPCSD continual learning framework.\n","authors":["Yaqi Liu","Chao Xia","Song Xiao","Qingxiao Guan","Wenqian Dong","Yifan Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2311.13263v1.pdf","comment":"12pages,6 figures"},{"id":"http://arxiv.org/abs/2311.13261v1","updated":"2023-11-22T09:25:08Z","published":"2023-11-22T09:25:08Z","title":"Immunohistochemistry guided segmentation of benign epithelial cells, in\n situ lesions, and invasive epithelial cells in breast cancer slides","summary":" Digital pathology enables automatic analysis of histopathological sections\nusing artificial intelligence (AI). Automatic evaluation could improve\ndiagnostic efficiency and help find associations between morphological features\nand clinical outcome. For development of such prediction models, identifying\ninvasive epithelial cells, and separating these from benign epithelial cells\nand in situ lesions would be the first step. In this study, we aimed to develop\nan AI model for segmentation of epithelial cells in sections from breast\ncancer. We generated epithelial ground truth masks by restaining hematoxylin\nand eosin (HE) sections with cytokeratin (CK) AE1/AE3, and by pathologists'\nannotations. HE/CK image pairs were used to train a convolutional neural\nnetwork, and data augmentation was used to make the model more robust. Tissue\nmicroarrays (TMAs) from 839 patients, and whole slide images from two patients\nwere used for training and evaluation of the models. The sections were derived\nfrom four cohorts of breast cancer patients. TMAs from 21 patients from a fifth\ncohort was used as a second test set. In quantitative evaluation, a mean Dice\nscore of 0.70, 0.79, and 0.75 for invasive epithelial cells, benign epithelial\ncells, and in situ lesions, respectively, were achieved. In qualitative scoring\n(0-5) by pathologists, results were best for all epithelium and invasive\nepithelium, with scores of 4.7 and 4.4. Scores for benign epithelium and in\nsitu lesions were 3.7 and 2.0. The proposed model segmented epithelial cells in\nHE stained breast cancer slides well, but further work is needed for accurate\ndivision between the classes. 
Immunohistochemistry, together with pathologists'\nannotations, enabled the creation of accurate ground truths. The model is made\nfreely available in FastPathology and the code is available at\nhttps://github.com/AICAN-Research/breast-epithelium-segmentation\n","authors":["Maren Høibø","André Pedersen","Vibeke Grotnes Dale","Sissel Marie Berget","Borgny Ytterhus","Cecilia Lindskog","Elisabeth Wik","Lars A. Akslen","Ingerid Reinertsen","Erik Smistad","Marit Valla"],"pdf_url":"https://arxiv.org/pdf/2311.13261v1.pdf","comment":"19 pages, 6 figures. Submitted to a scientific journal"},{"id":"http://arxiv.org/abs/2311.13258v1","updated":"2023-11-22T09:23:34Z","published":"2023-11-22T09:23:34Z","title":"ViStruct: Visual Structural Knowledge Extraction via Curriculum Guided\n Code-Vision Representation","summary":" State-of-the-art vision-language models (VLMs) still have limited performance\nin structural knowledge extraction, such as relations between objects. In this\nwork, we present ViStruct, a training framework to learn VLMs for effective\nvisual structural knowledge extraction. Two novel designs are incorporated.\nFirst, we propose to leverage the inherent structure of programming language to\ndepict visual structural information. This approach enables explicit and\nconsistent representation of visual structural information of multiple\ngranularities, such as concepts, relations, and events, in a well-organized\nstructured format. Second, we introduce curriculum-based learning for VLMs to\nprogressively comprehend visual structures, from fundamental visual concepts to\nintricate event structures. Our intuition is that lower-level knowledge may\ncontribute to complex visual structure understanding. Furthermore, we compile\nand release a collection of datasets tailored for visual structural knowledge\nextraction. We adopt a weakly-supervised approach to directly generate visual\nevent structures from captions for ViStruct training, capitalizing on abundant\nimage-caption pairs from the web. In experiments, we evaluate ViStruct on\nvisual structure prediction tasks, demonstrating its effectiveness in improving\nthe understanding of visual structures. The code is public at\n\\url{https://github.com/Yangyi-Chen/vi-struct}.\n","authors":["Yangyi Chen","Xingyao Wang","Manling Li","Derek Hoiem","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2311.13258v1.pdf","comment":"Accepted to EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.13254v1","updated":"2023-11-22T09:18:49Z","published":"2023-11-22T09:18:49Z","title":"DA-STC: Domain Adaptive Video Semantic Segmentation via Spatio-Temporal\n Consistency","summary":" Video semantic segmentation is a pivotal aspect of video representation\nlearning. However, significant domain shifts present a challenge in effectively\nlearning invariant spatio-temporal features across the labeled source domain\nand unlabeled target domain for video semantic segmentation. To solve the\nchallenge, we propose a novel DA-STC method for domain adaptive video semantic\nsegmentation, which incorporates a bidirectional multi-level spatio-temporal\nfusion module and a category-aware spatio-temporal feature alignment module to\nfacilitate consistent learning for domain-invariant features. Firstly, we\nperform bidirectional spatio-temporal fusion at the image sequence level and\nshallow feature level, leading to the construction of two fused intermediate\nvideo domains. 
This prompts the video semantic segmentation model to\nconsistently learn spatio-temporal features of shared patch sequences which are\ninfluenced by domain-specific contexts, thereby mitigating the feature gap\nbetween the source and target domain. Secondly, we propose a category-aware\nfeature alignment module to promote the consistency of spatio-temporal\nfeatures, facilitating adaptation to the target domain. Specifically, we\nadaptively aggregate the domain-specific deep features of each category along\nspatio-temporal dimensions, which are further constrained to achieve\ncross-domain intra-class feature alignment and inter-class feature separation.\nExtensive experiments demonstrate the effectiveness of our method, which\nachieves state-of-the-art mIOUs on multiple challenging benchmarks.\nFurthermore, we extend the proposed DA-STC to the image domain, where it also\nexhibits superior performance for domain adaptive semantic segmentation. The\nsource code and models will be made available at\n\\url{https://github.com/ZHE-SAPI/DA-STC}.\n","authors":["Zhe Zhang","Gaochang Wu","Jing Zhang","Chunhua Shen","Dacheng Tao","Tianyou Chai"],"pdf_url":"https://arxiv.org/pdf/2311.13254v1.pdf","comment":"18 pages,9 figures"},{"id":"http://arxiv.org/abs/2311.13250v1","updated":"2023-11-22T09:12:50Z","published":"2023-11-22T09:12:50Z","title":"Towards Hetero-Client Federated Multi-Task Learning","summary":" Federated Learning (FL) enables joint training across distributed clients\nusing their local data privately. Federated Multi-Task Learning (FMTL) builds\non FL to handle multiple tasks, assuming model congruity that identical model\narchitecture is deployed in each client. To relax this assumption and thus\nextend real-world applicability, we introduce a novel problem setting,\nHetero-Client Federated Multi-Task Learning (HC-FMTL), to accommodate diverse\ntask setups. The main challenge of HC-FMTL is the model incongruity issue that\ninvalidates conventional aggregation methods. It also escalates the\ndifficulties in accurate model aggregation to deal with data and task\nheterogeneity inherent in FMTL. To address these challenges, we propose the\nFedHCA$^2$ framework, which allows for federated training of personalized\nmodels by modeling relationships among heterogeneous clients. Drawing on our\ntheoretical insights into the difference between multi-task and federated\noptimization, we propose the Hyper Conflict-Averse Aggregation scheme to\nmitigate conflicts during encoder updates. Additionally, inspired by task\ninteraction in MTL, the Hyper Cross Attention Aggregation scheme uses\nlayer-wise cross attention to enhance decoder interactions while alleviating\nmodel incongruity. Moreover, we employ learnable Hyper Aggregation Weights for\neach client to customize personalized parameter updates. Extensive experiments\ndemonstrate the superior performance of FedHCA$^2$ in various HC-FMTL scenarios\ncompared to representative methods. Our code will be made publicly available.\n","authors":["Yuxiang Lu","Suizhi Huang","Yuwen Yang","Shalayiding Sirejiding","Yue Ding","Hongtao Lu"],"pdf_url":"https://arxiv.org/pdf/2311.13250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12655v2","updated":"2023-11-22T09:00:02Z","published":"2023-11-21T14:57:24Z","title":"Hand-Eye Calibration","summary":" Whenever a sensor is mounted on a robot hand it is important to know the\nrelationship between the sensor and the hand. 
The problem of determining this\nrelationship is referred to as hand-eye calibration, which is important in at\nleast two types of tasks: (i) map sensor centered measurements into the robot\nworkspace and (ii) allow the robot to precisely move the sensor. In the past\nsome solutions were proposed in the particular case of a camera. With almost no\nexception, all existing solutions attempt to solve the homogeneous matrix\nequation AX=XB. First we show that there are two possible formulations of the\nhand-eye calibration problem. One formulation is the classical one that we just\nmentioned. A second formulation takes the form of the following homogeneous\nmatrix equation: MY=M'YB. The advantage of the latter is that the extrinsic and\nintrinsic camera parameters need not be made explicit. Indeed, this formulation\ndirectly uses the 3 by 4 perspective matrices (M and M') associated with two\npositions of the camera. Moreover, this formulation together with the classical\none cover a wider range of camera-based sensors to be calibrated with respect\nto the robot hand. Second, we develop a common mathematical framework to solve\nfor the hand-eye calibration problem using either of the two formulations. We\npresent two methods, (i) a rotation then translation and (ii) a non-linear\nsolver for rotation and translation. Third, we perform a stability analysis\nboth for our two methods and for the classical linear method of Tsai and Lenz\n(1989). In the light of this comparison, the non-linear optimization method,\nthat solves for rotation and translation simultaneously, seems to be the most\nrobust one with respect to noise and to measurement errors.\n","authors":["Radu Horaud","Fadi Dornaika"],"pdf_url":"https://arxiv.org/pdf/2311.12655v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15670v2","updated":"2023-11-22T08:49:44Z","published":"2023-06-27T17:59:46Z","title":"Symphonize 3D Semantic Scene Completion with Contextual Instance Queries","summary":" `3D Semantic Scene Completion (SSC) has emerged as a nascent and pivotal\nundertaking in autonomous driving, aiming to predict voxel occupancy within\nvolumetric scenes. However, prevailing methodologies primarily focus on\nvoxel-wise feature aggregation, while neglecting instance semantics and scene\ncontext. In this paper, we present a novel paradigm termed Symphonies\n(Scene-from-Insts), that delves into the integration of instance queries to\norchestrate 2D-to-3D reconstruction and 3D scene modeling. Leveraging our\nproposed Serial Instance-Propagated Attentions, Symphonies dynamically encodes\ninstance-centric semantics, facilitating intricate interactions between\nimage-based and volumetric domains. Simultaneously, Symphonies enables holistic\nscene comprehension by capturing context through the efficient fusion of\ninstance queries, alleviating geometric ambiguity such as occlusion and\nperspective errors through contextual scene reasoning. Experimental results\ndemonstrate that Symphonies achieves state-of-the-art performance on\nchallenging benchmarks SemanticKITTI and SSCBench-KITTI-360, yielding\nremarkable mIoU scores of 15.04 and 18.58, respectively. These results showcase\nthe paradigm's promising advancements. The code is available at\nhttps://github.com/hustvl/Symphonies.\n","authors":["Haoyi Jiang","Tianheng Cheng","Naiyu Gao","Haoyang Zhang","Tianwei Lin","Wenyu Liu","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2306.15670v2.pdf","comment":"Technical report. 
Code and models at:\n https://github.com/hustvl/Symphonies"},{"id":"http://arxiv.org/abs/2311.13234v1","updated":"2023-11-22T08:45:01Z","published":"2023-11-22T08:45:01Z","title":"TSegFormer: 3D Tooth Segmentation in Intraoral Scans with Geometry\n Guided Transformer","summary":" Optical Intraoral Scanners (IOS) are widely used in digital dentistry to\nprovide detailed 3D information of dental crowns and the gingiva. Accurate 3D\ntooth segmentation in IOSs is critical for various dental applications, while\nprevious methods are error-prone at complicated boundaries and exhibit\nunsatisfactory results across patients. In this paper, we propose TSegFormer\nwhich captures both local and global dependencies among different teeth and the\ngingiva in the IOS point clouds with a multi-task 3D transformer architecture.\nMoreover, we design a geometry-guided loss based on a novel point curvature to\nrefine boundaries in an end-to-end manner, avoiding time-consuming\npost-processing to reach clinically applicable segmentation. In addition, we\ncreate a dataset with 16,000 IOSs, the largest ever IOS dataset to the best of\nour knowledge. The experimental results demonstrate that our TSegFormer\nconsistently surpasses existing state-of-the-art baselines. The superiority of\nTSegFormer is corroborated by extensive analysis, visualizations and real-world\nclinical applicability tests. Our code is available at\nhttps://github.com/huiminxiong/TSegFormer.\n","authors":["Huimin Xiong","Kunle Li","Kaiyuan Tan","Yang Feng","Joey Tianyi Zhou","Jin Hao","Haochao Ying","Jian Wu","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2311.13234v1.pdf","comment":"MICCAI 2023, STAR(Student Travel) award. 11 pages, 3 figures, 5\n tables. arXiv admin note: text overlap with arXiv:2210.16627"},{"id":"http://arxiv.org/abs/2311.13231v1","updated":"2023-11-22T08:42:46Z","published":"2023-11-22T08:42:46Z","title":"Using Human Feedback to Fine-tune Diffusion Models without Any Reward\n Model","summary":" Using reinforcement learning with human feedback (RLHF) has shown significant\npromise in fine-tuning diffusion models. Previous methods start by training a\nreward model that aligns with human preferences, then leverage RL techniques to\nfine-tune the underlying models. However, crafting an efficient reward model\ndemands extensive datasets, optimal architecture, and manual hyperparameter\ntuning, making the process both time and cost-intensive. The direct preference\noptimization (DPO) method, effective in fine-tuning large language models,\neliminates the necessity for a reward model. However, the extensive GPU memory\nrequirement of the diffusion model's denoising process hinders the direct\napplication of the DPO method. To address this issue, we introduce the Direct\nPreference for Denoising Diffusion Policy Optimization (D3PO) method to\ndirectly fine-tune diffusion models. The theoretical analysis demonstrates that\nalthough D3PO omits training a reward model, it effectively functions as the\noptimal reward model trained using human feedback data to guide the learning\nprocess. This approach requires no training of a reward model, proving to be\nmore direct, cost-effective, and minimizing computational overhead. In\nexperiments, our method uses the relative scale of objectives as a proxy for\nhuman preference, delivering comparable results to methods using ground-truth\nrewards. 
Moreover, D3PO demonstrates the ability to reduce image distortion\nrates and generate safer images, overcoming challenges lacking robust reward\nmodels.\n","authors":["Kai Yang","Jian Tao","Jiafei Lyu","Chunjiang Ge","Jiaxin Chen","Qimai Li","Weihan Shen","Xiaolong Zhu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2311.13231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13222v1","updated":"2023-11-22T08:25:15Z","published":"2023-11-22T08:25:15Z","title":"Towards Detecting, Recognizing, and Parsing the Address Information from\n Bangla Signboard: A Deep Learning-based Approach","summary":" Retrieving textual information from natural scene images is an active\nresearch area in the field of computer vision with numerous practical\napplications. Detecting text regions and extracting text from signboards is a\nchallenging problem due to special characteristics like reflecting lights,\nuneven illumination, or shadows found in real-life natural scene images. With\nthe advent of deep learning-based methods, different sophisticated techniques\nhave been proposed for text detection and text recognition from the natural\nscene. Though a significant amount of effort has been devoted to extracting\nnatural scene text for resourceful languages like English, little has been done\nfor low-resource languages like Bangla. In this research work, we have proposed\nan end-to-end system with deep learning-based models for efficiently detecting,\nrecognizing, correcting, and parsing address information from Bangla\nsignboards. We have created manually annotated datasets and synthetic datasets\nto train signboard detection, address text detection, address text recognition,\naddress text correction, and address text parser models. We have conducted a\ncomparative study among different CTC-based and Encoder-Decoder model\narchitectures for Bangla address text recognition. Moreover, we have designed a\nnovel address text correction model using a sequence-to-sequence\ntransformer-based network to improve the performance of Bangla address text\nrecognition model by post-correction. Finally, we have developed a Bangla\naddress text parser using the state-of-the-art transformer-based pre-trained\nlanguage model.\n","authors":["Hasan Murad","Mohammed Eunus Ali"],"pdf_url":"https://arxiv.org/pdf/2311.13222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03943v2","updated":"2023-11-22T07:52:06Z","published":"2023-11-07T12:36:20Z","title":"CLIP Guided Image-perceptive Prompt Learning for Image Enhancement","summary":" Image enhancement is a significant research area in the fields of computer\nvision and image processing. In recent years, many learning-based methods for\nimage enhancement have been developed, where the Look-up-table (LUT) has proven\nto be an effective tool. In this paper, we delve into the potential of\nContrastive Language-Image Pre-Training (CLIP) Guided Prompt Learning,\nproposing a simple structure called CLIP-LUT for image enhancement. We found\nthat the prior knowledge of CLIP can effectively discern the quality of\ndegraded images, which can provide reliable guidance. To be specific, We\ninitially learn image-perceptive prompts to distinguish between original and\ntarget images using CLIP model, in the meanwhile, we introduce a very simple\nnetwork by incorporating a simple baseline to predict the weights of three\ndifferent LUT as enhancement network. The obtained prompts are used to steer\nthe enhancement network like a loss function and improve the performance of\nmodel. 
We demonstrate that by simply combining a straightforward method with\nCLIP, we can obtain satisfactory results.\n","authors":["Weiwen Chen","Qiuhong Ke","Zinuo Li"],"pdf_url":"https://arxiv.org/pdf/2311.03943v2.pdf","comment":"A trial work to the image enhancement"},{"id":"http://arxiv.org/abs/2311.13209v1","updated":"2023-11-22T07:47:39Z","published":"2023-11-22T07:47:39Z","title":"Test-time Adaptive Vision-and-Language Navigation","summary":" Vision-and-Language Navigation (VLN) has witnessed significant advancements\nin recent years, largely attributed to meticulously curated datasets and\nproficiently trained models. Nevertheless, when tested in diverse environments,\nthe trained models inevitably encounter significant shifts in data\ndistribution, highlighting that relying solely on pre-trained and fixed\nnavigation models is insufficient. To enhance models' generalization ability,\ntest-time adaptation (TTA) demonstrates significant potential in the computer\nvision field by leveraging unlabeled test samples for model updates. However,\nsimply applying existing TTA methods to the VLN task cannot well handle the\nadaptability-stability dilemma of VLN models, i.e., frequent updates can result\nin drastic changes in model parameters, while occasional updates can make the\nmodels ill-equipped to handle dynamically changing environments. Therefore, we\npropose a Fast-Slow Test-Time Adaptation (FSTTA) approach for VLN by performing\ndecomposition-accumulation analysis for both gradients and parameters in a\nunified framework. Specifically, in the fast update phase, gradients generated\nduring the recent multi-step navigation process are decomposed into components\nwith varying levels of consistency. Then, these components are adaptively\naccumulated to pinpoint a concordant direction for fast model adaptation. In\nthe slow update phase, historically recorded parameters are gathered, and a\nsimilar decomposition-accumulation analysis is conducted to revert the model to\na stable state. Extensive experiments show that our method obtains impressive\nperformance gains on four popular benchmarks.\n","authors":["Junyu Gao","Xuan Yao","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2311.13209v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2305.16213v2","updated":"2023-11-22T07:34:38Z","published":"2023-05-25T16:19:18Z","title":"ProlificDreamer: High-Fidelity and Diverse Text-to-3D Generation with\n Variational Score Distillation","summary":" Score distillation sampling (SDS) has shown great promise in text-to-3D\ngeneration by distilling pretrained large-scale text-to-image diffusion models,\nbut suffers from over-saturation, over-smoothing, and low-diversity problems.\nIn this work, we propose to model the 3D parameter as a random variable instead\nof a constant as in SDS and present variational score distillation (VSD), a\nprincipled particle-based variational framework to explain and address the\naforementioned issues in text-to-3D generation. We show that SDS is a special\ncase of VSD and leads to poor samples with both small and large CFG weights. In\ncomparison, VSD works well with various CFG weights as ancestral sampling from\ndiffusion models and simultaneously improves the diversity and sample quality\nwith a common CFG weight (i.e., $7.5$). We further present various improvements\nin the design space for text-to-3D such as distillation time schedule and\ndensity initialization, which are orthogonal to the distillation algorithm yet\nnot well explored. 
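Editor's note: the CLIP-LUT entry above uses CLIP's prior knowledge to judge degraded versus enhanced images. A rough illustration of that idea with fixed text prompts (assuming the OpenAI `clip` package; the prompt wording and file name are hypothetical, and the paper instead learns image-perceptive prompts):

```python
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

prompts = clip.tokenize(["a high quality, well exposed photo",
                         "a dark, low quality, degraded photo"]).to(device)

def quality_score(path):
    image = preprocess(Image.open(path)).unsqueeze(0).to(device)
    with torch.no_grad():
        img_feat = model.encode_image(image)
        txt_feat = model.encode_text(prompts)
        img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
        txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
        probs = (100.0 * img_feat @ txt_feat.T).softmax(dim=-1)
    # Probability mass on the "high quality" prompt acts as a guidance signal.
    return probs[0, 0].item()

print(quality_score("enhanced.png"))  # hypothetical file
```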
Our overall approach, dubbed ProlificDreamer, can generate\nhigh rendering resolution (i.e., $512\\times512$) and high-fidelity NeRF with\nrich structure and complex effects (e.g., smoke and drops). Further,\ninitialized from NeRF, meshes fine-tuned by VSD are meticulously detailed and\nphoto-realistic. Project page and codes:\nhttps://ml.cs.tsinghua.edu.cn/prolificdreamer/\n","authors":["Zhengyi Wang","Cheng Lu","Yikai Wang","Fan Bao","Chongxuan Li","Hang Su","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.16213v2.pdf","comment":"NeurIPS 2023 (Spotlight)"},{"id":"http://arxiv.org/abs/2311.13200v1","updated":"2023-11-22T07:07:55Z","published":"2023-11-22T07:07:55Z","title":"Self-guided Few-shot Semantic Segmentation for Remote Sensing Imagery\n Based on Large Vision Models","summary":" The Segment Anything Model (SAM) exhibits remarkable versatility and\nzero-shot learning abilities, owing largely to its extensive training data\n(SA-1B). Recognizing SAM's dependency on manual guidance given its\ncategory-agnostic nature, we identified unexplored potential within few-shot\nsemantic segmentation tasks for remote sensing imagery. This research\nintroduces a structured framework designed for the automation of few-shot\nsemantic segmentation. It utilizes the SAM model and facilitates a more\nefficient generation of semantically discernible segmentation outcomes. Central\nto our methodology is a novel automatic prompt learning approach, leveraging\nprior guided masks to produce coarse pixel-wise prompts for SAM. Extensive\nexperiments on the DLRSD datasets underline the superiority of our approach,\noutperforming other available few-shot methodologies.\n","authors":["Xiyu Qi","Yifan Wu","Yongqiang Mao","Wenhui Zhang","Yidan Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.13200v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13199v1","updated":"2023-11-22T07:06:38Z","published":"2023-11-22T07:06:38Z","title":"DRIFu: Differentiable Rendering and Implicit Function-based Single-View\n 3D Reconstruction","summary":" The Differentiable Rendering and Implicit Function-based model (DRIFu) draws\nits roots from the Pixel-aligned Implicit Function (PIFU), a pioneering 3D\ndigitization technique initially designed for clothed human bodies. PIFU excels\nin capturing nuanced body shape variations within a low-dimensional space and\nhas been extensively trained on human 3D scans. However, the application of\nPIFU to live animals poses significant challenges, primarily due to the\ninherent difficulty in obtaining the cooperation of animals for 3D scanning. In\nresponse to this challenge, we introduce the DRIFu model, specifically tailored\nfor animal digitization. To train DRIFu, we employ a curated set of synthetic\n3D animal models, encompassing diverse shapes, sizes, and even accounting for\nvariations such as baby birds. Our innovative alignment tools play a pivotal\nrole in mapping these diverse synthetic animal models onto a unified template,\nfacilitating precise predictions of animal shape and texture. Crucially, our\ntemplate alignment strategy establishes a shared shape space, allowing for the\nseamless sampling of new animal shapes, posing them realistically, animating\nthem, and aligning them with real-world data. This groundbreaking approach\nrevolutionizes our capacity to comprehensively understand and represent avian\nforms. 
For further details and access to the project, the project website can\nbe found at https://github.com/kuangzijian/drifu-for-animals\n","authors":["Zijian Kuang","Lihang Ying","Shi Jin"],"pdf_url":"https://arxiv.org/pdf/2311.13199v1.pdf","comment":"arXiv admin note: text overlap with arXiv:1905.05172 by other authors"},{"id":"http://arxiv.org/abs/2311.13198v1","updated":"2023-11-22T07:05:54Z","published":"2023-11-22T07:05:54Z","title":"DoubleAUG: Single-domain Generalized Object Detector in Urban via Color\n Perturbation and Dual-style Memory","summary":" Object detection in urban scenarios is crucial for autonomous driving in\nintelligent traffic systems. However, unlike conventional object detection\ntasks, urban-scene images vary greatly in style. For example, images taken on\nsunny days differ significantly from those taken on rainy days. Therefore,\nmodels trained on sunny day images may not generalize well to rainy day images.\nIn this paper, we aim to solve the single-domain generalizable object detection\ntask in urban scenarios, meaning that a model trained on images from one\nweather condition should be able to perform well on images from any other\nweather conditions. To address this challenge, we propose a novel Double\nAUGmentation (DoubleAUG) method that includes image- and feature-level\naugmentation schemes. In the image-level augmentation, we consider the\nvariation in color information across different weather conditions and propose\na Color Perturbation (CP) method that randomly exchanges the RGB channels to\ngenerate various images. In the feature-level augmentation, we propose to\nutilize a Dual-Style Memory (DSM) to explore the diverse style information on\nthe entire dataset, further enhancing the model's generalization capability.\nExtensive experiments demonstrate that our proposed method outperforms\nstate-of-the-art methods. Furthermore, ablation studies confirm the\neffectiveness of each module in our proposed method. Moreover, our method is\nplug-and-play and can be integrated into existing methods to further improve\nmodel performance.\n","authors":["Lei Qi","Peng Dong","Tan Xiong","Hui Xue","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2311.13198v1.pdf","comment":"Accepted by ACM Transactions on Multimedia Computing, Communications,\n and Applications"},{"id":"http://arxiv.org/abs/2311.13194v1","updated":"2023-11-22T06:46:37Z","published":"2023-11-22T06:46:37Z","title":"Towards Improving Document Understanding: An Exploration on\n Text-Grounding via MLLMs","summary":" In the field of document understanding, significant advances have been made\nin the fine-tuning of Multimodal Large Language Models (MLLMs) with\ninstruction-following data. Nevertheless, the potential of text-grounding\ncapability within text-rich scenarios remains underexplored. In this paper, we\npresent a text-grounding document understanding model, termed TGDoc, which\naddresses this deficiency by enhancing MLLMs with the ability to discern the\nspatial positioning of text within images. Empirical evidence suggests that\ntext-grounding improves the model's interpretation of textual content, thereby\nelevating its proficiency in comprehending text-rich images. Specifically, we\ncompile a dataset containing 99K PowerPoint presentations sourced from the\ninternet. We formulate instruction tuning tasks including text detection,\nrecognition, and spotting to facilitate the cohesive alignment between the\nvisual encoder and large language model. 
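Editor's note: the DoubleAUG entry above describes a Color Perturbation step that randomly exchanges RGB channels to mimic color shifts across weather conditions. A minimal NumPy sketch of that operation (a generic rendition, not the authors' code):

```python
import numpy as np

def color_perturbation(image, rng=None):
    """image: HxWx3 array; returns a copy with its color channels shuffled."""
    rng = rng or np.random.default_rng()
    perm = rng.permutation(3)          # e.g. RGB -> BRG
    return image[..., perm]

img = np.random.randint(0, 256, size=(64, 64, 3), dtype=np.uint8)
aug = color_perturbation(img)
print(aug.shape)  # (64, 64, 3)
```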
Moreover, we curate a collection of\ntext-rich images and prompt the text-only GPT-4 to generate 12K high-quality\nconversations, featuring textual locations within text-rich scenarios. By\nintegrating text location data into the instructions, TGDoc is adept at\ndiscerning text locations during the visual question process. Extensive\nexperiments demonstrate that our method achieves state-of-the-art performance\nacross multiple text-rich benchmarks, validating the effectiveness of our\nmethod.\n","authors":["Yonghui Wang","Wengang Zhou","Hao Feng","Keyi Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2311.13194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12198v2","updated":"2023-11-22T06:46:18Z","published":"2023-11-20T21:34:52Z","title":"PhysGaussian: Physics-Integrated 3D Gaussians for Generative Dynamics","summary":" We introduce PhysGaussian, a new method that seamlessly integrates physically\ngrounded Newtonian dynamics within 3D Gaussians to achieve high-quality novel\nmotion synthesis. Employing a custom Material Point Method (MPM), our approach\nenriches 3D Gaussian kernels with physically meaningful kinematic deformation\nand mechanical stress attributes, all evolved in line with continuum mechanics\nprinciples. A defining characteristic of our method is the seamless integration\nbetween physical simulation and visual rendering: both components utilize the\nsame 3D Gaussian kernels as their discrete representations. This negates the\nnecessity for triangle/tetrahedron meshing, marching cubes, \"cage meshes,\" or\nany other geometry embedding, highlighting the principle of \"what you see is\nwhat you simulate (WS$^2$).\" Our method demonstrates exceptional versatility\nacross a wide variety of materials--including elastic entities, metals,\nnon-Newtonian fluids, and granular materials--showcasing its strong\ncapabilities in creating diverse visual content with novel viewpoints and\nmovements. Our project page is at: https://xpandora.github.io/PhysGaussian/\n","authors":["Tianyi Xie","Zeshun Zong","Yuxing Qiu","Xuan Li","Yutao Feng","Yin Yang","Chenfanfu Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.12198v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13187v1","updated":"2023-11-22T06:28:30Z","published":"2023-11-22T06:28:30Z","title":"NeISF: Neural Incident Stokes Field for Geometry and Material Estimation","summary":" Multi-view inverse rendering is the problem of estimating the scene\nparameters such as shapes, materials, or illuminations from a sequence of\nimages captured under different viewpoints. Many approaches, however, assume\nsingle light bounce and thus fail to recover challenging scenarios like\ninter-reflections. On the other hand, simply extending those methods to\nconsider multi-bounced light requires more assumptions to alleviate the\nambiguity. To address this problem, we propose Neural Incident Stokes Fields\n(NeISF), a multi-view inverse rendering framework that reduces ambiguities\nusing polarization cues. The primary motivation for using polarization cues is\nthat it is the accumulation of multi-bounced light, providing rich information\nabout geometry and material. 
Based on this knowledge, the proposed incident\nStokes field efficiently models the accumulated polarization effect with the\naid of an original physically-based differentiable polarimetric renderer.\nLastly, experimental results show that our method outperforms the existing\nworks in synthetic and real scenarios.\n","authors":["Chenhao Li","Taishi Ono","Takeshi Uemori","Hajime Mihara","Alexander Gatto","Hajime Nagahara","Yuseke Moriuchi"],"pdf_url":"https://arxiv.org/pdf/2311.13187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13186v1","updated":"2023-11-22T06:26:24Z","published":"2023-11-22T06:26:24Z","title":"Applications of Spiking Neural Networks in Visual Place Recognition","summary":" In robotics, Spiking Neural Networks (SNNs) are increasingly recognized for\ntheir largely-unrealized potential energy efficiency and low latency\nparticularly when implemented on neuromorphic hardware. Our paper highlights\nthree advancements for SNNs in Visual Place Recognition (VPR). First, we\npropose Modular SNNs, where each SNN represents a set of non-overlapping\ngeographically distinct places, enabling scalable networks for large\nenvironments. Secondly, we present Ensembles of Modular SNNs, where multiple\nnetworks represent the same place, significantly enhancing accuracy compared to\nsingle-network models. Our SNNs are compact and small, comprising only 1500\nneurons and 474k synapses, which makes them ideally suited for ensembling due\nto this small size. Lastly, we investigate the role of sequence matching in\nSNN-based VPR, a technique where consecutive images are used to refine place\nrecognition. We analyze the responsiveness of SNNs to ensembling and sequence\nmatching compared to other VPR techniques. Our contributions highlight the\nviability of SNNs for VPR, offering scalable and robust solutions, paving the\nway for their application in various energy-sensitive robotic tasks.\n","authors":["Somayeh Hussaini","Michael Milford","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2311.13186v1.pdf","comment":"17 pages, 8 figures, under review"},{"id":"http://arxiv.org/abs/2311.13182v1","updated":"2023-11-22T06:13:39Z","published":"2023-11-22T06:13:39Z","title":"Differentiable Radio Frequency Ray Tracing for Millimeter-Wave Sensing","summary":" Millimeter wave (mmWave) sensing is an emerging technology with applications\nin 3D object characterization and environment mapping. However, realizing\nprecise 3D reconstruction from sparse mmWave signals remains challenging.\nExisting methods rely on data-driven learning, constrained by dataset\navailability and difficulty in generalization. We propose DiffSBR, a\ndifferentiable framework for mmWave-based 3D reconstruction. DiffSBR\nincorporates a differentiable ray tracing engine to simulate radar point clouds\nfrom virtual 3D models. A gradient-based optimizer refines the model parameters\nto minimize the discrepancy between simulated and real point clouds.\nExperiments using various radar hardware validate DiffSBR's capability for\nfine-grained 3D reconstruction, even for novel objects unseen by the radar\npreviously. 
By integrating physics-based simulation with gradient optimization,\nDiffSBR transcends the limitations of data-driven approaches and pioneers a new\nparadigm for mmWave sensing.\n","authors":["Xingyu Chen","Xinyu Zhang","Qiyue Xia","Xinmin Fang","Chris Xiaoxuan Lu","Zhengxiong Li"],"pdf_url":"https://arxiv.org/pdf/2311.13182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12467v2","updated":"2023-11-22T06:01:46Z","published":"2023-11-21T09:27:30Z","title":"GLAD: Global-Local View Alignment and Background Debiasing for\n Unsupervised Video Domain Adaptation with Large Domain Gap","summary":" In this work, we tackle the challenging problem of unsupervised video domain\nadaptation (UVDA) for action recognition. We specifically focus on scenarios\nwith a substantial domain gap, in contrast to existing works primarily deal\nwith small domain gaps between labeled source domains and unlabeled target\ndomains. To establish a more realistic setting, we introduce a novel UVDA\nscenario, denoted as Kinetics->BABEL, with a more considerable domain gap in\nterms of both temporal dynamics and background shifts. To tackle the temporal\nshift, i.e., action duration difference between the source and target domains,\nwe propose a global-local view alignment approach. To mitigate the background\nshift, we propose to learn temporal order sensitive representations by temporal\norder learning and background invariant representations by background\naugmentation. We empirically validate that the proposed method shows\nsignificant improvement over the existing methods on the Kinetics->BABEL\ndataset with a large domain gap. The code is available at\nhttps://github.com/KHUVLL/GLAD.\n","authors":["Hyogun Lee","Kyungho Bae","Seong Jong Ha","Yumin Ko","Gyeong-Moon Park","Jinwoo Choi"],"pdf_url":"https://arxiv.org/pdf/2311.12467v2.pdf","comment":"This is an accepted WACV 2024 paper. Our code is available at\n https://github.com/KHUVLL/GLAD"},{"id":"http://arxiv.org/abs/2311.13177v1","updated":"2023-11-22T05:44:51Z","published":"2023-11-22T05:44:51Z","title":"Volumetric Reconstruction Resolves Off-Resonance Artifacts in Static and\n Dynamic PROPELLER MRI","summary":" Off-resonance artifacts in magnetic resonance imaging (MRI) are visual\ndistortions that occur when the actual resonant frequencies of spins within the\nimaging volume differ from the expected frequencies used to encode spatial\ninformation. These discrepancies can be caused by a variety of factors,\nincluding magnetic field inhomogeneities, chemical shifts, or susceptibility\ndifferences within the tissues. Such artifacts can manifest as blurring,\nghosting, or misregistration of the reconstructed image, and they often\ncompromise its diagnostic quality. We propose to resolve these artifacts by\nlifting the 2D MRI reconstruction problem to 3D, introducing an additional\n\"spectral\" dimension to model this off-resonance. Our approach is inspired by\nrecent progress in modeling radiance fields, and is capable of reconstructing\nboth static and dynamic MR images as well as separating fat and water, which is\nof independent clinical interest. We demonstrate our approach in the context of\nPROPELLER (Periodically Rotated Overlapping ParallEL Lines with Enhanced\nReconstruction) MRI acquisitions, which are popular for their robustness to\nmotion artifacts. 
Our method operates in a few minutes on a single GPU, and to\nour knowledge is the first to correct for chemical shift in gradient echo\nPROPELLER MRI reconstruction without additional measurements or pretraining\ndata.\n","authors":["Annesha Ghosh","Gordon Wetzstein","Mert Pilanci","Sara Fridovich-Keil"],"pdf_url":"https://arxiv.org/pdf/2311.13177v1.pdf","comment":"Code is available at\n https://github.com/sarafridov/volumetric-propeller"},{"id":"http://arxiv.org/abs/2311.13172v1","updated":"2023-11-22T05:31:06Z","published":"2023-11-22T05:31:06Z","title":"Learning to Complement with Multiple Humans (LECOMH): Integrating\n Multi-rater and Noisy-Label Learning into Human-AI Collaboration","summary":" The advent of learning with noisy labels (LNL), multi-rater learning, and\nhuman-AI collaboration has revolutionised the development of robust\nclassifiers, enabling them to address the challenges posed by different types\nof data imperfections and complex decision processes commonly encountered in\nreal-world applications. While each of these methodologies has individually\nmade significant strides in addressing their unique challenges, the development\nof techniques that can simultaneously tackle these three problems remains\nunderexplored. This paper addresses this research gap by integrating\nnoisy-label learning, multi-rater learning, and human-AI collaboration with new\nbenchmarks and the innovative Learning to Complement with Multiple Humans\n(LECOMH) approach. LECOMH optimises the level of human collaboration during\ntesting, aiming to optimise classification accuracy while minimising\ncollaboration costs that vary from 0 to M, where M is the maximum number of\nhuman collaborators. We quantitatively compare LECOMH with leading human-AI\ncollaboration methods using our proposed benchmarks. LECOMH consistently\noutperforms the competition, with accuracy improving as collaboration costs\nincrease. Notably, LECOMH is the only method enhancing human labeller\nperformance across all benchmarks.\n","authors":["Zheng Zhang","Kevin Wells","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2311.13172v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2311.13168v1","updated":"2023-11-22T05:24:35Z","published":"2023-11-22T05:24:35Z","title":"3D Face Style Transfer with a Hybrid Solution of NeRF and Mesh\n Rasterization","summary":" Style transfer for human face has been widely researched in recent years.\nMajority of the existing approaches work in 2D image domain and have 3D\ninconsistency issue when applied on different viewpoints of the same face. In\nthis paper, we tackle the problem of 3D face style transfer which aims at\ngenerating stylized novel views of a 3D human face with multi-view consistency.\nWe propose to use a neural radiance field (NeRF) to represent 3D human face and\ncombine it with 2D style transfer to stylize the 3D face. We find that directly\ntraining a NeRF on stylized images from 2D style transfer brings in 3D\ninconsistency issue and causes blurriness. On the other hand, training a NeRF\njointly with 2D style transfer objectives shows poor convergence due to the\nidentity and head pose gap between style image and content image. It also poses\nchallenge in training time and memory due to the need of volume rendering for\nfull image to apply style transfer loss functions. We therefore propose a\nhybrid framework of NeRF and mesh rasterization to combine the benefits of high\nfidelity geometry reconstruction of NeRF and fast rendering speed of mesh. 
Our\nframework consists of three stages: 1. Training a NeRF model on input face\nimages to learn the 3D geometry; 2. Extracting a mesh from the trained NeRF\nmodel and optimizing it with style transfer objectives via differentiable\nrasterization; 3. Training a new color network in NeRF conditioned on a style\nembedding to enable arbitrary style transfer to the 3D face. Experiment results\nshow that our approach generates high quality face style transfer with great 3D\nconsistency, while also enabling a flexible style control.\n","authors":["Jianwei Feng","Prateek Singhal"],"pdf_url":"https://arxiv.org/pdf/2311.13168v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.07647v3","updated":"2023-11-22T05:20:22Z","published":"2023-04-15T22:24:05Z","title":"LASER: A Neuro-Symbolic Framework for Learning Spatial-Temporal Scene\n Graphs with Weak Supervision","summary":" We propose LASER, a neuro-symbolic approach to learn semantic video\nrepresentations that capture rich spatial and temporal properties in video data\nby leveraging high-level logic specifications. In particular, we formulate the\nproblem in terms of alignment between raw videos and spatio-temporal logic\nspecifications. The alignment algorithm leverages a differentiable symbolic\nreasoner and a combination of contrastive, temporal, and semantics losses. It\neffectively and efficiently trains low-level perception models to extract\nfine-grained video representation in the form of a spatio-temporal scene graph\nthat conforms to the desired high-level specification. In doing so, we explore\na novel methodology that weakly supervises the learning of video semantic\nrepresentations through logic specifications. We evaluate our method on two\ndatasets with rich spatial and temporal specifications:\n20BN-Something-Something and MUGEN. We demonstrate that our method learns\nbetter fine-grained video semantics than existing baselines.\n","authors":["Jiani Huang","Ziyang Li","Mayur Naik","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2304.07647v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08897v3","updated":"2023-11-22T05:15:38Z","published":"2023-10-13T06:58:52Z","title":"Self supervised convolutional kernel based handcrafted feature\n harmonization: Enhanced left ventricle hypertension disease phenotyping on\n echocardiography","summary":" Radiomics, a medical imaging technique, extracts quantitative handcrafted\nfeatures from images to predict diseases. Harmonization in those features\nensures consistent feature extraction across various imaging devices and\nprotocols. Methods for harmonization include standardized imaging protocols,\nstatistical adjustments, and evaluating feature robustness. Myocardial diseases\nsuch as Left Ventricular Hypertrophy (LVH) and Hypertensive Heart Disease (HHD)\nare diagnosed via echocardiography, but variable imaging settings pose\nchallenges. Harmonization techniques are crucial for applying handcrafted\nfeatures in disease diagnosis in such scenario. Self-supervised learning (SSL)\nenhances data understanding within limited datasets and adapts to diverse data\nsettings. ConvNeXt-V2 integrates convolutional layers into SSL, displaying\nsuperior performance in various tasks. This study focuses on convolutional\nfilters within SSL, using them as preprocessing to convert images into feature\nmaps for handcrafted feature harmonization. 
Our proposed method excelled in\nharmonization evaluation and exhibited superior LVH classification performance\ncompared to existing methods.\n","authors":["Jina Lee","Youngtaek Hong","Dawun Jeong","Yeonggul Jang","Jaeik Jeon","Sihyeon Jeong","Taekgeun Jung","Yeonyee E. Yoon","Inki Moon","Seung-Ah Lee","Hyuk-Jae Chang"],"pdf_url":"https://arxiv.org/pdf/2310.08897v3.pdf","comment":"11 pages, 7 figures"},{"id":"http://arxiv.org/abs/2309.05254v2","updated":"2023-11-22T05:00:56Z","published":"2023-09-11T06:18:05Z","title":"Towards Better Data Exploitation in Self-Supervised Monocular Depth\n Estimation","summary":" Depth estimation plays an important role in the robotic perception system.\nSelf-supervised monocular paradigm has gained significant attention since it\ncan free training from the reliance on depth annotations. Despite recent\nadvancements, existing self-supervised methods still underutilize the available\ntraining data, limiting their generalization ability. In this paper, we take\ntwo data augmentation techniques, namely Resizing-Cropping and\nSplitting-Permuting, to fully exploit the potential of training datasets.\nSpecifically, the original image and the generated two augmented images are fed\ninto the training pipeline simultaneously and we leverage them to conduct\nself-distillation. Additionally, we introduce the detail-enhanced DepthNet with\nan extra full-scale branch in the encoder and a grid decoder to enhance the\nrestoration of fine details in depth maps. Experimental results demonstrate our\nmethod can achieve state-of-the-art performance on the KITTI benchmark, with\nboth raw ground truth and improved ground truth. Moreover, our models also show\nsuperior generalization performance when transferring to Make3D and NYUv2\ndatasets. Our codes are available at https://github.com/Sauf4896/BDEdepth.\n","authors":["Jinfeng Liu","Lingtong Kong","Jie Yang","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2309.05254v2.pdf","comment":"8 pages, 6 figures, accepted by IEEE Robotics and Automation Letters\n (RA-L, 2023)"},{"id":"http://arxiv.org/abs/2311.13152v1","updated":"2023-11-22T04:31:09Z","published":"2023-11-22T04:31:09Z","title":"Test-Time Augmentation for 3D Point Cloud Classification and\n Segmentation","summary":" Data augmentation is a powerful technique to enhance the performance of a\ndeep learning task but has received less attention in 3D deep learning. It is\nwell known that when 3D shapes are sparsely represented with low point density,\nthe performance of the downstream tasks drops significantly. This work explores\ntest-time augmentation (TTA) for 3D point clouds. We are inspired by the recent\nrevolution of learning implicit representation and point cloud upsampling,\nwhich can produce high-quality 3D surface reconstruction and\nproximity-to-surface, respectively. Our idea is to leverage the implicit field\nreconstruction or point cloud upsampling techniques as a systematic way to\naugment point cloud data. Mainly, we test both strategies by sampling points\nfrom the reconstructed results and using the sampled point cloud as test-time\naugmented data. We show that both strategies are effective in improving\naccuracy. 
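Editor's note: the self-supervised depth entry above lists a Splitting-Permuting augmentation. One plausible reading of that idea, sketched loosely (the authors' exact splitting scheme may differ; this simply cuts the image into a grid and shuffles the tiles):

```python
import random
import numpy as np

def split_permute(image, grid=(2, 2), rng=random):
    """Cut an HxWxC image into grid tiles, shuffle them, and reassemble."""
    h, w = image.shape[:2]
    gh, gw = grid
    th, tw = h // gh, w // gw
    tiles = [image[r * th:(r + 1) * th, c * tw:(c + 1) * tw]
             for r in range(gh) for c in range(gw)]
    rng.shuffle(tiles)
    rows = [np.concatenate(tiles[r * gw:(r + 1) * gw], axis=1) for r in range(gh)]
    return np.concatenate(rows, axis=0)

img = np.random.rand(128, 128, 3)
print(split_permute(img).shape)  # (128, 128, 3)
```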
We observed that point cloud upsampling for test-time augmentation\ncan lead to more significant performance improvement on downstream tasks such\nas object classification and segmentation on the ModelNet40, ShapeNet,\nScanObjectNN, and SemanticKITTI datasets, especially for sparse point clouds.\n","authors":["Tuan-Anh Vu","Srinjay Sarkar","Zhiyuan Zhang","Binh-Son Hua","Sai-Kit Yeung"],"pdf_url":"https://arxiv.org/pdf/2311.13152v1.pdf","comment":"This paper is accepted in 3DV 2024"},{"id":"http://arxiv.org/abs/2308.09936v2","updated":"2023-11-22T04:29:33Z","published":"2023-08-19T07:53:43Z","title":"BLIVA: A Simple Multimodal LLM for Better Handling of Text-Rich Visual\n Questions","summary":" Vision Language Models (VLMs), which extend Large Language Models (LLM) by\nincorporating visual understanding capability, have demonstrated significant\nadvancements in addressing open-ended visual question-answering (VQA) tasks.\nHowever, these models cannot accurately interpret images infused with text, a\ncommon occurrence in real-world scenarios. Standard procedures for extracting\ninformation from images often involve learning a fixed set of query embeddings.\nThese embeddings are designed to encapsulate image contexts and are later used\nas soft prompt inputs in LLMs. Yet, this process is limited to the token count,\npotentially curtailing the recognition of scenes with text-rich context. To\nimprove upon them, the present study introduces BLIVA: an augmented version of\nInstructBLIP with Visual Assistant. BLIVA incorporates the query embeddings\nfrom InstructBLIP and also directly projects encoded patch embeddings into the\nLLM, a technique inspired by LLaVA. This approach assists the model to capture\nintricate details potentially missed during the query decoding process.\nEmpirical evidence demonstrates that our model, BLIVA, significantly enhances\nperformance in processing text-rich VQA benchmarks (up to 17.76% in OCR-VQA\nbenchmark) and in undertaking general (not particularly text-rich) VQA\nbenchmarks (up to 7.9% in Visual Spatial Reasoning benchmark), comparing to our\nbaseline InstructBLIP. BLIVA demonstrates significant capability in decoding\nreal-world images, irrespective of text presence. To demonstrate the broad\nindustry applications enabled by BLIVA, we evaluate the model using a new\ndataset comprising YouTube thumbnails paired with question-answer sets across\n11 diverse categories. For researchers interested in further exploration, our\ncode and models are freely accessible at https://github.com/mlpc-ucsd/BLIVA.\n","authors":["Wenbo Hu","Yifan Xu","Yi Li","Weiyue Li","Zeyuan Chen","Zhuowen Tu"],"pdf_url":"https://arxiv.org/pdf/2308.09936v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13144v1","updated":"2023-11-22T04:14:42Z","published":"2023-11-22T04:14:42Z","title":"Single Image Compressed Sensing MRI via a Self-Supervised Deep Denoising\n Approach","summary":" Popular methods in compressed sensing (CS) are dependent on deep learning\n(DL), where large amounts of data are used to train non-linear reconstruction\nmodels. However, ensuring generalisability over and access to multiple datasets\nis challenging to realise for real-world applications. 
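Editor's note: the point-cloud test-time augmentation entry above averages predictions over augmented copies of the input cloud. A generic TTA loop in that spirit (the `classifier` and `augment` callables are hypothetical stand-ins; the paper obtains augmented clouds from implicit-field reconstruction or upsampling rather than jitter):

```python
import torch

def tta_predict(classifier, points, augment, n_views=8):
    """points: (N, 3) point cloud; returns class probabilities averaged over views."""
    probs = []
    with torch.no_grad():
        for _ in range(n_views):
            aug = augment(points)                  # e.g. resampled / upsampled cloud
            logits = classifier(aug.unsqueeze(0))  # (1, num_classes)
            probs.append(logits.softmax(dim=-1))
    return torch.stack(probs).mean(dim=0)

# Toy usage: a dummy linear "classifier" and jitter as the augmentation.
W = torch.randn(3, 10)
dummy_classifier = lambda pc: pc.mean(dim=1) @ W
jitter = lambda pc: pc + 0.01 * torch.randn_like(pc)
print(tta_predict(dummy_classifier, torch.randn(1024, 3), jitter).shape)  # (1, 10)
```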
To address these\nconcerns, this paper proposes a single image, self-supervised (SS) CS-MRI\nframework that enables a joint deep and sparse regularisation of CS artefacts.\nThe approach effectively dampens structured CS artefacts, which can be\ndifficult to remove assuming sparse reconstruction, or relying solely on the\ninductive biases of CNN to produce noise-free images. Image quality is thereby\nimproved compared to either approach alone. Metrics are evaluated using\nCartesian 1D masks on a brain and knee dataset, with PSNR improving by 2-4dB on\naverage.\n","authors":["Marlon Bran Lorenzana","Feng Liu","Shekhar S. Chandra"],"pdf_url":"https://arxiv.org/pdf/2311.13144v1.pdf","comment":"5 pages, 4 figures, 2 tables, conference"},{"id":"http://arxiv.org/abs/2311.12068v2","updated":"2023-11-22T04:13:38Z","published":"2023-11-19T17:28:28Z","title":"Enhancing Novel Object Detection via Cooperative Foundational Models","summary":" In this work, we address the challenging and emergent problem of novel object\ndetection (NOD), focusing on the accurate detection of both known and novel\nobject categories during inference. Traditional object detection algorithms are\ninherently closed-set, limiting their capability to handle NOD. We present a\nnovel approach to transform existing closed-set detectors into open-set\ndetectors. This transformation is achieved by leveraging the complementary\nstrengths of pre-trained foundational models, specifically CLIP and SAM,\nthrough our cooperative mechanism. Furthermore, by integrating this mechanism\nwith state-of-the-art open-set detectors such as GDINO, we establish new\nbenchmarks in object detection performance. Our method achieves 17.42 mAP in\nnovel object detection and 42.08 mAP for known objects on the challenging LVIS\ndataset. Adapting our approach to the COCO OVD split, we surpass the current\nstate-of-the-art by a margin of 7.2 $ \\text{AP}_{50} $ for novel classes. Our\ncode is available at\nhttps://github.com/rohit901/cooperative-foundational-models .\n","authors":["Rohit Bharadwaj","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2311.12068v2.pdf","comment":"Code: https://github.com/rohit901/cooperative-foundational-models"},{"id":"http://arxiv.org/abs/2311.13141v1","updated":"2023-11-22T04:06:39Z","published":"2023-11-22T04:06:39Z","title":"Diffusion360: Seamless 360 Degree Panoramic Image Generation based on\n Diffusion Models","summary":" This is a technical report on the 360-degree panoramic image generation task\nbased on diffusion models. Unlike ordinary 2D images, 360-degree panoramic\nimages capture the entire $360^\\circ\\times 180^\\circ$ field of view. So the\nrightmost and the leftmost sides of the 360 panoramic image should be\ncontinued, which is the main challenge in this field. However, the current\ndiffusion pipeline is not appropriate for generating such a seamless 360-degree\npanoramic image. To this end, we propose a circular blending strategy on both\nthe denoising and VAE decoding stages to maintain the geometry continuity.\nBased on this, we present two models for \\textbf{Text-to-360-panoramas} and\n\\textbf{Single-Image-to-360-panoramas} tasks. 
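Editor's note: the Diffusion360 entry above uses circular blending so the left and right borders of a panorama meet seamlessly. As a simplified, framework-agnostic stand-in (the paper applies blending on latents during denoising and VAE decoding; this is only the classic cross-fade trick for making an image wrap horizontally):

```python
import torch

def make_horizontally_tileable(img, band=32):
    """img: (C, H, W) tensor. Cross-fade the left edge band into the right edge
    band and drop the duplicated left band, so the result wraps smoothly."""
    left = img[..., :band]
    right = img[..., -band:]
    alpha = torch.linspace(0.0, 1.0, band, device=img.device)  # 0 -> 1 ramp
    blended = (1.0 - alpha) * right + alpha * left
    return torch.cat([img[..., band:-band], blended], dim=-1)

x = torch.rand(3, 256, 512)
print(make_horizontally_tileable(x).shape)  # torch.Size([3, 256, 480])
```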
The code has been released as an\nopen-source project at\n\\href{https://github.com/ArcherFMY/SD-T2I-360PanoImage}{https://github.com/ArcherFMY/SD-T2I-360PanoImage}\nand\n\\href{https://www.modelscope.cn/models/damo/cv_diffusion_text-to-360panorama-image_generation/summary}{ModelScope}\n","authors":["Mengyang Feng","Jinlin Liu","Miaomiao Cui","Xuansong Xie"],"pdf_url":"https://arxiv.org/pdf/2311.13141v1.pdf","comment":"2 pages, 8 figures, Tech. Report"},{"id":"http://arxiv.org/abs/2311.13134v1","updated":"2023-11-22T03:41:13Z","published":"2023-11-22T03:41:13Z","title":"Lightweight High-Speed Photography Built on Coded Exposure and Implicit\n Neural Representation of Videos","summary":" The compact cameras recording high-speed scenes with high resolution are\nhighly demanded, but the required high bandwidth often leads to bulky, heavy\nsystems, which limits their applications on low-capacity platforms. Adopting a\ncoded exposure setup to encode a frame sequence into a blurry snapshot and\nretrieve the latent sharp video afterward can serve as a lightweight solution.\nHowever, restoring motion from blur is quite challenging due to the high\nill-posedness of motion blur decomposition, intrinsic ambiguity in motion\ndirection, and diverse motions in natural videos. In this work, by leveraging\nclassical coded exposure imaging technique and emerging implicit neural\nrepresentation for videos, we tactfully embed the motion direction cues into\nthe blurry image during the imaging process and develop a novel self-recursive\nneural network to sequentially retrieve the latent video sequence from the\nblurry image utilizing the embedded motion direction cues. To validate the\neffectiveness and efficiency of the proposed framework, we conduct extensive\nexperiments on benchmark datasets and real-captured blurry images. The results\ndemonstrate that our proposed framework significantly outperforms existing\nmethods in quality and flexibility. The code for our work is available at\nhttps://github.com/zhihongz/BDINR\n","authors":["Zhihong Zhang","Runzhao Yang","Jinli Suo","Yuxiao Cheng","Qionghai Dai"],"pdf_url":"https://arxiv.org/pdf/2311.13134v1.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2309.08273v2","updated":"2023-11-22T03:36:22Z","published":"2023-09-15T09:34:05Z","title":"Unsupervised Disentangling of Facial Representations with 3D-aware\n Latent Diffusion Models","summary":" Unsupervised learning of facial representations has gained increasing\nattention for face understanding ability without heavily relying on large-scale\nannotated datasets. However, it remains unsolved due to the coupling of facial\nidentities, expressions, and external factors like pose and light. Prior\nmethods primarily focus on 2D factors and pixel-level consistency, leading to\nincomplete disentangling and suboptimal performance in downstream tasks. In\nthis paper, we propose LatentFace, a novel unsupervised disentangling framework\nfor facial expression and identity representation. We suggest the disentangling\nproblem should be performed in latent space and propose the solution using a\n3D-aware latent diffusion model. First, we introduce a 3D-aware autoencoder to\nencode face images into 3D latent embeddings. Second, we propose a novel\nrepresentation diffusion model (RDM) to disentangle 3D latent into facial\nidentity and expression. 
Consequently, our method achieves state-of-the-art\nperformance in facial expression recognition and face verification among\nunsupervised facial representation learning models. Codes are available at\n\\url{https://github.com/ryanhe312/LatentFace}.\n","authors":["Ruian He","Zhen Xing","Weimin Tan","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2309.08273v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13128v1","updated":"2023-11-22T03:33:00Z","published":"2023-11-22T03:33:00Z","title":"P2RBox: A Single Point is All You Need for Oriented Object Detection","summary":" Oriented object detection, a specialized subfield in computer vision, finds\napplications across diverse scenarios, excelling particularly when dealing with\nobjects of arbitrary orientations. Conversely, point annotation, which treats\nobjects as single points, offers a cost-effective alternative to rotated and\nhorizontal bounding boxes but sacrifices performance due to the loss of size\nand orientation information. In this study, we introduce the P2RBox network,\nwhich leverages point annotations and a mask generator to create mask\nproposals, followed by filtration through our Inspector Module and Constrainer\nModule. This process selects high-quality masks, which are subsequently\nconverted into rotated box annotations for training a fully supervised\ndetector. Specifically, we've thoughtfully crafted an Inspector Module rooted\nin multi-instance learning principles to evaluate the semantic score of masks.\nWe've also proposed a more robust mask quality assessment in conjunction with\nthe Constrainer Module. Furthermore, we've introduced a Symmetry Axis\nEstimation (SAE) Module inspired by the spectral theorem for symmetric matrices\nto transform the top-performing mask proposal into rotated bounding boxes.\nP2RBox performs well with three fully supervised rotated object detectors:\nRetinaNet, Rotated FCOS, and Oriented R-CNN. By combining with Oriented R-CNN,\nP2RBox achieves 62.26% on DOTA-v1.0 test dataset. As far as we know, this is\nthe first attempt at training an oriented object detector with point\nsupervision.\n","authors":["Guangming Cao","Xuehui Yu","Wenwen Yu","Xumeng Han","Xue Yang","Guorong Li","Jianbin Jiao","Zhenjun Han"],"pdf_url":"https://arxiv.org/pdf/2311.13128v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13127v1","updated":"2023-11-22T03:31:31Z","published":"2023-11-22T03:31:31Z","title":"Toward Robust Imperceptible Perturbation against Unauthorized\n Text-to-image Diffusion-based Synthesis","summary":" Text-to-image diffusion models allow seamless generation of personalized\nimages from scant reference photos. Yet, these tools, in the wrong hands, can\nfabricate misleading or harmful content, endangering individuals. To address\nthis problem, existing poisoning-based approaches perturb user images in an\nimperceptible way to render them \"unlearnable\" from malicious uses. We identify\ntwo limitations of these defending approaches: i) sub-optimal due to the\nhand-crafted heuristics for solving the intractable bilevel optimization and\nii) lack of robustness against simple data transformations like Gaussian\nfiltering. To solve these challenges, we propose MetaCloak, which solves the\nbi-level poisoning problem with a meta-learning framework with an additional\ntransformation sampling process to craft transferable and robust perturbation.\nSpecifically, we employ a pool of surrogate diffusion models to craft\ntransferable and model-agnostic perturbation. 
Furthermore, by incorporating an\nadditional transformation process, we design a simple denoising-error\nmaximization loss that is sufficient for causing transformation-robust semantic\ndistortion and degradation in a personalized generation. Extensive experiments\non the VGGFace2 and CelebA-HQ datasets show that MetaCloak outperforms existing\napproaches. Notably, MetaCloak can successfully fool online training services\nlike Replicate, in a black-box manner, demonstrating the effectiveness of\nMetaCloak in real-world scenarios. Our code is available at\nhttps://github.com/liuyixin-louis/MetaCloak.\n","authors":["Yixin Liu","Chenrui Fan","Yutong Dai","Xun Chen","Pan Zhou","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2311.13127v1.pdf","comment":"26 pages, 15 figures, 8 tables"},{"id":"http://arxiv.org/abs/2311.13125v1","updated":"2023-11-22T03:26:07Z","published":"2023-11-22T03:26:07Z","title":"DAE-Net: Deforming Auto-Encoder for fine-grained shape co-segmentation","summary":" We present an unsupervised 3D shape co-segmentation method which learns a set\nof deformable part templates from a shape collection. To accommodate structural\nvariations in the collection, our network composes each shape by a selected\nsubset of template parts which are affine-transformed. To maximize the\nexpressive power of the part templates, we introduce a per-part deformation\nnetwork to enable the modeling of diverse parts with substantial geometry\nvariations, while imposing constraints on the deformation capacity to ensure\nfidelity to the originally represented parts. We also propose a training scheme\nto effectively overcome local minima. Architecturally, our network is a\nbranched autoencoder, with a CNN encoder taking a voxel shape as input and\nproducing per-part transformation matrices, latent codes, and part existence\nscores, and the decoder outputting point occupancies to define the\nreconstruction loss. Our network, coined DAE-Net for Deforming Auto-Encoder,\ncan achieve unsupervised 3D shape co-segmentation that yields fine-grained,\ncompact, and meaningful parts that are consistent across diverse shapes. We\nconduct extensive experiments on the ShapeNet Part dataset, DFAUST, and an\nanimal subset of Objaverse to show superior performance over prior methods.\n","authors":["Zhiqin Chen","Qimin Chen","Hang Zhou","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.13125v1.pdf","comment":"Code: https://github.com/czq142857/DAE-Net"},{"id":"http://arxiv.org/abs/2311.12161v2","updated":"2023-11-22T03:23:17Z","published":"2023-11-20T20:27:42Z","title":"ChemScraper: Graphics Extraction, Molecular Diagram Parsing, and\n Annotated Data Generation for PDF Images","summary":" Existing visual parsers for molecule diagrams translate pixel-based raster\nimages such as PNGs to chemical structure representations (e.g., SMILES).\nHowever, PDFs created by word processors including LaTeX and Word provide\nexplicit locations and shapes for characters, lines, and polygons. We extract\nsymbols from born-digital PDF molecule images and then apply simple graph\ntransformations to capture both visual and chemical structure in editable\nChemDraw files (CDXML). Our fast ( PDF $\\rightarrow$ visual graph $\\rightarrow$\nchemical graph ) pipeline does not require GPUs, Optical Character Recognition\n(OCR) or vectorization. We evaluate on standard benchmarks using SMILES\nstrings, along with a novel evaluation that provides graph-based metrics and\nerror compilation using LgEval. 
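Editor's note: the MetaCloak entry above crafts an imperceptible perturbation by maximizing a denoising-error loss. A heavily simplified projected-gradient sketch of that general recipe (the `surrogate_denoiser` is a hypothetical stand-in; the paper additionally meta-learns over a pool of diffusion models and sampled transformations):

```python
import torch
import torch.nn.functional as F

def craft_perturbation(image, surrogate_denoiser, eps=8/255, step=1/255, iters=20):
    """Return a copy of `image` with a bounded perturbation that increases the
    surrogate denoiser's reconstruction error (gradient ascent within an L-inf ball)."""
    delta = torch.zeros_like(image, requires_grad=True)
    for _ in range(iters):
        noisy = (image + delta).clamp(0, 1)
        loss = F.mse_loss(surrogate_denoiser(noisy), image)
        loss.backward()
        with torch.no_grad():
            delta += step * delta.grad.sign()   # ascend on denoising error
            delta.clamp_(-eps, eps)             # keep the change imperceptible
            delta.grad.zero_()
    return (image + delta).detach().clamp(0, 1)

# Toy usage with a small convolution standing in for a real denoiser.
img = torch.rand(1, 3, 64, 64)
denoiser = torch.nn.Conv2d(3, 3, 3, padding=1)
protected = craft_perturbation(img, denoiser)
print((protected - img).abs().max().item())
```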
The geometric information in born-digital PDFs\nproduces a highly accurate parser, motivating generating training data for\nvisual parsers that recognize from raster images, with extracted graphics,\nvisual structure, and chemical structure as annotations. To do this we render\nSMILES strings in Indigo, parse molecule structure, and then validate\nrecognized structure to select correct files.\n","authors":["Ayush Kumar Shah","Bryan Manrique Amador","Abhisek Dey","Ming Creekmore","Blake Ocampo","Scott Denmark","Richard Zanibbi"],"pdf_url":"https://arxiv.org/pdf/2311.12161v2.pdf","comment":"20 pages without references, 10 figures, 3 Tables, submitted to\n International Journal on Document Analysis and Recognition (IJDAR)"},{"id":"http://arxiv.org/abs/2306.04889v2","updated":"2023-11-22T03:02:46Z","published":"2023-06-08T02:35:30Z","title":"ShaDDR: Interactive Example-Based Geometry and Texture Generation via 3D\n Shape Detailization and Differentiable Rendering","summary":" We present ShaDDR, an example-based deep generative neural network which\nproduces a high-resolution textured 3D shape through geometry detailization and\nconditional texture generation applied to an input coarse voxel shape. Trained\non a small set of detailed and textured exemplar shapes, our method learns to\ndetailize the geometry via multi-resolution voxel upsampling and generate\ntextures on voxel surfaces via differentiable rendering against exemplar\ntexture images from a few views. The generation is interactive, taking less\nthan 1 second to produce a 3D model with voxel resolutions up to 512^3. The\ngenerated shape preserves the overall structure of the input coarse voxel\nmodel, while the style of the generated geometric details and textures can be\nmanipulated through learned latent codes. In the experiments, we show that our\nmethod can generate higher-resolution shapes with plausible and improved\ngeometric details and clean textures compared to prior works. Furthermore, we\nshowcase the ability of our method to learn geometric details and textures from\nshapes reconstructed from real-world photos. In addition, we have developed an\ninteractive modeling application to demonstrate the generalizability of our\nmethod to various user inputs and the controllability it offers, allowing users\nto interactively sculpt a coarse voxel shape to define the overall structure of\nthe detailized 3D shape. Code and data are available at\nhttps://github.com/qiminchen/ShaDDR.\n","authors":["Qimin Chen","Zhiqin Chen","Hang Zhou","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.04889v2.pdf","comment":"Accepted to SIGGRAPH Asia 2023 conference track. Code:\n https://github.com/qiminchen/ShaDDR"},{"id":"http://arxiv.org/abs/2311.13120v1","updated":"2023-11-22T02:46:57Z","published":"2023-11-22T02:46:57Z","title":"Multi-modal In-Context Learning Makes an Ego-evolving Scene Text\n Recognizer","summary":" Scene text recognition (STR) in the wild frequently encounters challenges\nwhen coping with domain variations, font diversity, shape deformations, etc. A\nstraightforward solution is performing model fine-tuning tailored to a specific\nscenario, but it is computationally intensive and requires multiple model\ncopies for various scenarios. Recent studies indicate that large language\nmodels (LLMs) can learn from a few demonstration examples in a training-free\nmanner, termed \"In-Context Learning\" (ICL). Nevertheless, applying LLMs as a\ntext recognizer is unacceptably resource-consuming. 
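Editor's note: the ChemScraper entry above generates training data by rendering SMILES strings to molecule images. A minimal stand-in using RDKit rather than the Indigo toolkit named in the abstract (toy SMILES strings and output paths are illustrative only):

```python
from rdkit import Chem
from rdkit.Chem import Draw

smiles_list = ["CCO", "c1ccccc1", "CC(=O)Oc1ccccc1C(=O)O"]  # toy examples
for i, smi in enumerate(smiles_list):
    mol = Chem.MolFromSmiles(smi)
    if mol is not None:                       # skip unparsable strings
        Draw.MolToFile(mol, f"mol_{i}.png", size=(300, 300))
```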
Moreover, our pilot\nexperiments on LLMs show that ICL fails in STR, mainly attributed to the\ninsufficient incorporation of contextual information from diverse samples in\nthe training stage. To this end, we introduce E$^2$STR, a STR model trained\nwith context-rich scene text sequences, where the sequences are generated via\nour proposed in-context training strategy. E$^2$STR demonstrates that a\nregular-sized model is sufficient to achieve effective ICL capabilities in STR.\nExtensive experiments show that E$^2$STR exhibits remarkable training-free\nadaptation in various scenarios and outperforms even the fine-tuned\nstate-of-the-art approaches on public benchmarks.\n","authors":["Zhen Zhao","Can Huang","Binghong Wu","Chunhui Lin","Hao Liu","Zhizhong Zhang","Xin Tan","Jingqun Tang","Yuan Xie"],"pdf_url":"https://arxiv.org/pdf/2311.13120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08326v2","updated":"2023-11-22T02:43:01Z","published":"2023-06-14T07:58:14Z","title":"Early Detection of Late Blight Tomato Disease using Histogram Oriented\n Gradient based Support Vector Machine","summary":" The tomato is one of the most important fruits on earth. It plays an\nimportant and useful role in the agricultural production of any country. This\nresearch propose a novel smart technique for early detection of late blight\ndiseases in tomatoes. This work improve the dataset with an increase in images\nfrom the field (the Plant Village dataset) and proposed a hybrid algorithm\ncomposed of support vector machines (SVM) and histogram-oriented gradients\n(HOG) for real-time detection of late blight tomato disease. To propose a\nHOG-based SVM model for early detection of late blight tomato leaf disease. To\ncheck the performance of the proposed model in terms of MSE, accuracy,\nprecision, and recall as compared to Decision Tree and KNN. The integration of\nadvanced technology in agriculture has the potential to revolutionize the\nindustry, making it more efficient, sustainable, and profitable. This research\nwork on the early detection of tomato diseases contributes to the growing\nimportance of smart farming, the need for climate-smart agriculture, the rising\nneed to more efficiently utilize natural resources, and the demand for higher\ncrop yields. The proposed hybrid algorithm of SVM and HOG has significant\npotential for the early detection of late blight disease in tomato plants. The\nperformance of the proposed model against decision tree and KNN algorithms and\nthe results may assist in selecting the best algorithm for future applications.\nThe research work can help farmers make data-driven decisions to optimize crop\nyield and quality while also reducing the environmental impact of farming\npractices.\n","authors":["M. Ishaq","M. 
Waqas"],"pdf_url":"https://arxiv.org/pdf/2306.08326v2.pdf","comment":"The article titled \"Early Detection of Late Blight Tomato Disease\n using Histogram Oriented Gradient based Support Vector Machine\" need to be\n withdrawn there are other contributors in the improvement of this article"},{"id":"http://arxiv.org/abs/2311.13110v1","updated":"2023-11-22T02:23:32Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v1.pdf","comment":"This paper integrates the works arXiv:2306.01129 and\n arXiv:2308.16271, as well as this under-review work:\n https://openreview.net/forum?id=PvyOYleymy into a complete story. In this\n paper, we improve the writing and organization, and also add conceptual,\n empirical, and theoretical improvements over the previous work"},{"id":"http://arxiv.org/abs/2311.12144v2","updated":"2023-11-22T02:19:41Z","published":"2023-11-20T19:45:27Z","title":"Applications of Large Scale Foundation Models for Autonomous Driving","summary":" Since DARPA Grand Challenges (rural) in 2004/05 and Urban Challenges in 2007,\nautonomous driving has been the most active field of AI applications. Recently\npowered by large language models (LLMs), chat systems, such as chatGPT and\nPaLM, emerge and rapidly become a promising direction to achieve artificial\ngeneral intelligence (AGI) in natural language processing (NLP). 
It is natural to\nthink that we could employ these abilities to reformulate autonomous\ndriving. By combining LLMs with foundation models, it is possible to utilize\nhuman knowledge, commonsense, and reasoning to rebuild autonomous driving\nsystems and move beyond the current long-tailed AI dilemma. In this paper, we investigate\nthe techniques of foundation models and LLMs applied to autonomous driving,\ncategorized into simulation, world models, data annotation, and planning or E2E\nsolutions.\n","authors":["Yu Huang","Yue Chen","Zhu Li"],"pdf_url":"https://arxiv.org/pdf/2311.12144v2.pdf","comment":"23 pages. arXiv admin note: text overlap with arXiv:2304.03589,\n arXiv:2111.05849, arXiv:2306.03000, arXiv:2301.02691, arXiv:2309.16292,\n arXiv:2309.17080, arXiv:2309.10228, arXiv:2310.01415 by other authors"},{"id":"http://arxiv.org/abs/2309.00168v2","updated":"2023-11-22T02:16:48Z","published":"2023-08-31T23:17:44Z","title":"Pose-Graph Attentional Graph Neural Network for Lidar Place Recognition","summary":" This paper proposes a pose-graph attentional graph neural network, called\nP-GAT, which compares (key)nodes between sequential and non-sequential\nsub-graphs for place recognition tasks as opposed to a common frame-to-frame\nretrieval problem formulation currently implemented in SOTA place recognition\nmethods. P-GAT uses the maximum spatial and temporal information between\nneighbour cloud descriptors -- generated by an existing encoder -- utilising\nthe concept of pose-graph SLAM. Leveraging intra- and inter-attention and graph\nneural networks, P-GAT relates point clouds captured in nearby locations in\nEuclidean space and their embeddings in feature space. Experimental results on\nlarge-scale publicly available datasets demonstrate the effectiveness of\nour approach in scenes lacking distinct features and when training and testing\nenvironments have different distributions (domain adaptation). Further, an\nexhaustive comparison with the state-of-the-art shows improvements in\nperformance. Code is available at\nhttps://github.com/csiro-robotics/P-GAT.\n","authors":["Milad Ramezani","Liang Wang","Joshua Knights","Zhibin Li","Pauline Pounds","Peyman Moghadam"],"pdf_url":"https://arxiv.org/pdf/2309.00168v2.pdf","comment":"10 pages, 5 figures, 5 tables"},{"id":"http://arxiv.org/abs/2311.12603v2","updated":"2023-11-22T02:15:51Z","published":"2023-11-21T13:43:16Z","title":"Surgical Temporal Action-aware Network with Sequence Regularization for\n Phase Recognition","summary":" To assist surgeons in the operating theatre, surgical phase recognition is\ncritical for developing computer-assisted surgical systems, which requires\ncomprehensive understanding of surgical videos. Although existing studies have made\ngreat progress, there are still two significant limitations worthy of\nimprovement. First, due to the compromise of resource consumption, frame-wise\nvisual features are extracted by 2D networks and disregard spatial and temporal\nknowledge of surgical actions, which hinders subsequent inter-frame modeling\nfor phase prediction. Second, these works simply utilize ordinary\nclassification loss with one-hot phase labels to optimize the phase\npredictions, and cannot fully explore surgical videos under inadequate\nsupervision. To overcome these two limitations, we propose a Surgical Temporal\nAction-aware Network with sequence Regularization, named STAR-Net, to recognize\nsurgical phases more accurately from input videos. 
Specifically, we propose an\nefficient multi-scale surgical temporal action (MS-STA) module, which\nintegrates visual features with spatial and temporal knowledge of surgical\nactions at the cost of 2D networks. Moreover, we devise the dual-classifier\nsequence regularization (DSR) to facilitate the training of STAR-Net by the\nsequence guidance of an auxiliary classifier with a smaller capacity. Our\nSTAR-Net with MS-STA and DSR can exploit visual features of surgical actions\nwith effective regularization, thereby leading to the superior performance of\nsurgical phase recognition. Extensive experiments on a large-scale gastrectomy\nsurgery dataset and the public Cholec80 benchmark prove that our STAR-Net\nsignificantly outperforms state-of-the-arts of surgical phase recognition.\n","authors":["Zhen Chen","Yuhao Zhai","Jun Zhang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12603v2.pdf","comment":"Accepted by 2023 IEEE International Conference on Bioinformatics and\n Biomedicine (BIBM 2023)"},{"id":"http://arxiv.org/abs/2311.13100v1","updated":"2023-11-22T01:59:19Z","published":"2023-11-22T01:59:19Z","title":"Automated Measurement of Pericoronary Adipose Tissue Attenuation and\n Volume in CT Angiography","summary":" Pericoronary adipose tissue (PCAT) is the deposition of fat in the vicinity\nof the coronary arteries. It is an indicator of coronary inflammation and\nassociated with coronary artery disease. Non-invasive coronary CT angiography\n(CCTA) is presently used to obtain measures of the thickness, volume, and\nattenuation of fat deposition. However, prior works solely focus on measuring\nPCAT using semi-automated approaches at the right coronary artery (RCA) over\nthe left coronary artery (LCA). In this pilot work, we developed a fully\nautomated approach for the measurement of PCAT mean attenuation and volume in\nthe region around both coronary arteries. First, we used a large subset of\npatients from the public ImageCAS dataset (n = 735) to train a 3D full\nresolution nnUNet to segment LCA and RCA. Then, we automatically measured PCAT\nin the surrounding arterial regions. We evaluated our method on a held-out test\nset of patients (n = 183) from the same dataset. A mean Dice score of 83% and\nPCAT attenuation of -73.81 $\\pm$ 12.69 HU was calculated for the RCA, while a\nmean Dice score of 81% and PCAT attenuation of -77.51 $\\pm$ 7.94 HU was\ncomputed for the LCA. To the best of our knowledge, we are the first to develop\na fully automated method to measure PCAT attenuation and volume at both the RCA\nand LCA. Our work underscores how automated PCAT measurement holds promise as a\nbiomarker for identification of inflammation and cardiac disease.\n","authors":["Andrew M. Nguyen","Tejas Sudharshan Mathai","Liangchen Liu","Jianfei Liu","Ronald M. Summers"],"pdf_url":"https://arxiv.org/pdf/2311.13100v1.pdf","comment":"5 pages, 4 figures, IEE ISBI2024 conference"},{"id":"http://arxiv.org/abs/2311.13099v1","updated":"2023-11-22T01:58:26Z","published":"2023-11-22T01:58:26Z","title":"PIE-NeRF: Physics-based Interactive Elastodynamics with NeRF","summary":" We show that physics-based simulations can be seamlessly integrated with NeRF\nto generate high-quality elastodynamics of real-world objects. Unlike existing\nmethods, we discretize nonlinear hyperelasticity in a meshless way, obviating\nthe necessity for intermediate auxiliary shape proxies like a tetrahedral mesh\nor voxel grid. 
A quadratic generalized moving least square (Q-GMLS) is employed\nto capture nonlinear dynamics and large deformation on the implicit model. Such\nmeshless integration enables versatile simulations of complex and codimensional\nshapes. We adaptively place the least-square kernels according to the NeRF\ndensity field to significantly reduce the complexity of the nonlinear\nsimulation. As a result, physically realistic animations can be conveniently\nsynthesized using our method for a wide range of hyperelastic materials at an\ninteractive rate. For more information, please visit our project page at\nhttps://fytalon.github.io/pienerf/.\n","authors":["Yutao Feng","Yintong Shang","Xuan Li","Tianjia Shao","Chenfanfu Jiang","Yin Yang"],"pdf_url":"https://arxiv.org/pdf/2311.13099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13091v1","updated":"2023-11-22T01:43:57Z","published":"2023-11-22T01:43:57Z","title":"Stable Unlearnable Example: Enhancing the Robustness of Unlearnable\n Examples via Stable Error-Minimizing Noise","summary":" The open source of large amounts of image data promotes the development of\ndeep learning techniques. Along with this comes the privacy risk of these\nopen-source image datasets being exploited by unauthorized third parties to\ntrain deep learning models for commercial or illegal purposes. To avoid the\nabuse of public data, a poisoning-based technique, the unlearnable example, is\nproposed to significantly degrade the generalization performance of models by\nadding a kind of imperceptible noise to the data. To further enhance its\nrobustness against adversarial training, existing works leverage iterative\nadversarial training on both the defensive noise and the surrogate model.\nHowever, it still remains unknown whether the robustness of unlearnable\nexamples primarily comes from the effect of enhancement in the surrogate model\nor the defensive noise. Observing that simply removing the adversarial noise on\nthe training process of the defensive noise can improve the performance of\nrobust unlearnable examples, we identify that solely the surrogate model's\nrobustness contributes to the performance. Furthermore, we found a negative\ncorrelation exists between the robustness of defensive noise and the protection\nperformance, indicating defensive noise's instability issue. Motivated by this,\nto further boost the robust unlearnable example, we introduce stable\nerror-minimizing noise (SEM), which trains the defensive noise against random\nperturbation instead of the time-consuming adversarial perturbation to improve\nthe stability of defensive noise. Through extensive experiments, we demonstrate\nthat SEM achieves a new state-of-the-art performance on CIFAR-10, CIFAR-100,\nand ImageNet Subset in terms of both effectiveness and efficiency. 
The code is\navailable at https://github.com/liuyixin-louis/Stable-Unlearnable-Example.\n","authors":["Yixin Liu","Kaidi Xu","Xun Chen","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2311.13091v1.pdf","comment":"14 pages, 11 figures, 13 tables"},{"id":"http://arxiv.org/abs/2311.13090v1","updated":"2023-11-22T01:42:23Z","published":"2023-11-22T01:42:23Z","title":"On the Limitation of Diffusion Models for Synthesizing Training Datasets","summary":" Synthetic samples from diffusion models are promising for leveraging in\ntraining discriminative models as replications of real training datasets.\nHowever, we found that the synthetic datasets degrade classification\nperformance over real datasets even when using state-of-the-art diffusion\nmodels. This means that modern diffusion models do not perfectly represent the\ndata distribution for the purpose of replicating datasets for training\ndiscriminative tasks. This paper investigates the gap between synthetic and\nreal samples by analyzing the synthetic samples reconstructed from real samples\nthrough the diffusion and reverse process. By varying the time steps starting\nthe reverse process in the reconstruction, we can control the trade-off between\nthe information in the original real data and the information added by\ndiffusion models. Through assessing the reconstructed samples and trained\nmodels, we found that the synthetic data are concentrated in modes of the\ntraining data distribution as the reverse step increases, and thus, they are\ndifficult to cover the outer edges of the distribution. Our findings imply that\nmodern diffusion models are insufficient to replicate training data\ndistribution perfectly, and there is room for the improvement of generative\nmodeling in the replication of training datasets.\n","authors":["Shin'ya Yamaguchi","Takuma Fukuda"],"pdf_url":"https://arxiv.org/pdf/2311.13090v1.pdf","comment":"NeurIPS 2023 SyntheticData4ML Workshop"},{"id":"http://arxiv.org/abs/2309.04153v2","updated":"2023-11-22T01:36:14Z","published":"2023-09-08T06:37:25Z","title":"Mapping EEG Signals to Visual Stimuli: A Deep Learning Approach to Match\n vs. Mismatch Classification","summary":" Existing approaches to modeling associations between visual stimuli and brain\nresponses are facing difficulties in handling between-subject variance and\nmodel generalization. Inspired by the recent progress in modeling speech-brain\nresponse, we propose in this work a \"match-vs-mismatch\" deep learning model to\nclassify whether a video clip induces excitatory responses in recorded EEG\nsignals and learn associations between the visual content and corresponding\nneural recordings. Using an exclusive experimental dataset, we demonstrate that\nthe proposed model is able to achieve the highest accuracy on unseen subjects\nas compared to other baseline models. Furthermore, we analyze the inter-subject\nnoise using a subject-level silhouette score in the embedding space and show\nthat the developed model is able to mitigate inter-subject noise and\nsignificantly reduce the silhouette score. Moreover, we examine the Grad-CAM\nactivation score and show that the brain regions associated with language\nprocessing contribute most to the model predictions, followed by regions\nassociated with visual processing. 
These results have the potential to\nfacilitate the development of neural recording-based video reconstruction and\nits related applications.\n","authors":["Yiqian Yang","Zhengqiao Zhao","Qian Wang","Yan Yang","Jingdong Chen"],"pdf_url":"https://arxiv.org/pdf/2309.04153v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13073v1","updated":"2023-11-22T00:26:15Z","published":"2023-11-22T00:26:15Z","title":"FusionFrames: Efficient Architectural Aspects for Text-to-Video\n Generation Pipeline","summary":" Multimedia generation approaches occupy a prominent place in artificial\nintelligence research. Text-to-image models achieved high-quality results over\nthe last few years. However, video synthesis methods recently started to\ndevelop. This paper presents a new two-stage latent diffusion text-to-video\ngeneration architecture based on the text-to-image diffusion model. The first\nstage concerns keyframes synthesis to figure the storyline of a video, while\nthe second one is devoted to interpolation frames generation to make movements\nof the scene and objects smooth. We compare several temporal conditioning\napproaches for keyframes generation. The results show the advantage of using\nseparate temporal blocks over temporal layers in terms of metrics reflecting\nvideo generation quality aspects and human preference. The design of our\ninterpolation model significantly reduces computational costs compared to other\nmasked frame interpolation approaches. Furthermore, we evaluate different\nconfigurations of MoVQ-based video decoding scheme to improve consistency and\nachieve higher PSNR, SSIM, MSE, and LPIPS scores. Finally, we compare our\npipeline with existing solutions and achieve top-2 scores overall and top-1\namong open-source solutions: CLIPSIM = 0.2976 and FVD = 433.054. Project page:\nhttps://ai-forever.github.io/kandinsky-video/\n","authors":["Vladimir Arkhipkin","Zein Shaheen","Viacheslav Vasilev","Elizaveta Dakhova","Andrey Kuznetsov","Denis Dimitrov"],"pdf_url":"https://arxiv.org/pdf/2311.13073v1.pdf","comment":"Project page: https://ai-forever.github.io/kandinsky-video/"},{"id":"http://arxiv.org/abs/2311.13069v1","updated":"2023-11-22T00:03:16Z","published":"2023-11-22T00:03:16Z","title":"FuseNet: Self-Supervised Dual-Path Network for Medical Image\n Segmentation","summary":" Semantic segmentation, a crucial task in computer vision, often relies on\nlabor-intensive and costly annotated datasets for training. In response to this\nchallenge, we introduce FuseNet, a dual-stream framework for self-supervised\nsemantic segmentation that eliminates the need for manual annotation. FuseNet\nleverages the shared semantic dependencies between the original and augmented\nimages to create a clustering space, effectively assigning pixels to\nsemantically related clusters, and ultimately generating the segmentation map.\nAdditionally, FuseNet incorporates a cross-modal fusion technique that extends\nthe principles of CLIP by replacing textual data with augmented images. This\napproach enables the model to learn complex visual representations, enhancing\nrobustness against variations similar to CLIP's text invariance. To further\nimprove edge alignment and spatial consistency between neighboring pixels, we\nintroduce an edge refinement loss. This loss function considers edge\ninformation to enhance spatial coherence, facilitating the grouping of nearby\npixels with similar visual features. 
Extensive experiments on skin lesion and\nlung segmentation datasets demonstrate the effectiveness of our method.\n\\href{https://github.com/xmindflow/FuseNet}{Codebase.}\n","authors":["Amirhossein Kazerouni","Sanaz Karimijafarbigloo","Reza Azad","Yury Velichko","Ulas Bagci","Dorit Merhof"],"pdf_url":"https://arxiv.org/pdf/2311.13069v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2311.13565v1","updated":"2023-11-22T18:22:56Z","published":"2023-11-22T18:22:56Z","title":"Drilling Down into the Discourse Structure with LLMs for Long Document\n Question Answering","summary":" We address the task of evidence retrieval for long document question\nanswering, which involves locating relevant paragraphs within a document to\nanswer a question. We aim to assess the applicability of large language models\n(LLMs) in the task of zero-shot long document evidence retrieval, owing to\ntheir unprecedented performance across various NLP tasks. However, currently\nthe LLMs can consume limited context lengths as input, thus providing document\nchunks as inputs might overlook the global context while missing out on\ncapturing the inter-segment dependencies. Moreover, directly feeding the large\ninput sets can incur significant computational costs, particularly when\nprocessing the entire document (and potentially incurring monetary expenses\nwith enterprise APIs like OpenAI's GPT variants). To address these challenges,\nwe propose a suite of techniques that exploit the discourse structure commonly\nfound in documents. By utilizing this structure, we create a condensed\nrepresentation of the document, enabling a more comprehensive understanding and\nanalysis of relationships between different parts. We retain $99.6\\%$ of the\nbest zero-shot approach's performance, while processing only $26\\%$ of the\ntotal tokens used by the best approach in the information seeking evidence\nretrieval setup. We also show how our approach can be combined with\n\\textit{self-ask} reasoning agent to achieve best zero-shot performance in\ncomplex multi-hop question answering, just $\\approx 4\\%$ short of zero-shot\nperformance using gold evidence.\n","authors":["Inderjeet Nair","Shwetha Somasundaram","Apoorv Saxena","Koustava Goswami"],"pdf_url":"https://arxiv.org/pdf/2311.13565v1.pdf","comment":"Accepted to the Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.13534v1","updated":"2023-11-22T17:14:54Z","published":"2023-11-22T17:14:54Z","title":"LM-Cocktail: Resilient Tuning of Language Models via Model Merging","summary":" The pre-trained language models are continually fine-tuned to better support\ndownstream applications. However, this operation may result in significant\nperformance degeneration on general tasks beyond the targeted domain. To\novercome this problem, we propose a novel method which enables the fine-tuned\nmodel to stay resilient in general perspectives. Our method is conducted in the\nform of model merging (namely LM-Cocktail), where the fine-tuned language model\nis merged with the pre-trained base model or the peer models from other domains\nthrough weighted average. Despite simplicity, LM-Cocktail is surprisingly\neffective: the resulted model is able to achieve a strong empirical performance\nin the whole scope of general tasks while preserving a superior capacity in its\ntargeted domain. We conduct comprehensive experiments with LLama and BGE model\non popular benchmarks, including FLAN, MMLU, MTEB, whose results validate the\nefficacy of our proposed method. 
The code and checkpoints are available at\nhttps://github.com/FlagOpen/FlagEmbedding.\n","authors":["Shitao Xiao","Zheng Liu","Peitian Zhang","Xingrun Xing"],"pdf_url":"https://arxiv.org/pdf/2311.13534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13494v1","updated":"2023-11-22T16:12:41Z","published":"2023-11-22T16:12:41Z","title":"A Comparative Analysis of Supportive Navigation on Movie Recommenders","summary":" This literature review covers the research and thought process that went into\nmaking a solution for the infinite scrolling problem faced in streaming\nservices such as Netflix. Using the data collected, we have come to the\nconclusion that an alternate layout can somewhat alleviate the problems it\ntakes in navigating a list of movies. We also found out by a comparative\nanalysis that some layouts, the circular one in particular, is advantageous in\ncertain settings making it an ideal candidate for a movie recommender system.\n","authors":["Mohammad Sualeh Ali","Muhammed Maaz Tariq","Alina Ahmed","Abdul Razaque Soomro","Danysh Syed"],"pdf_url":"https://arxiv.org/pdf/2311.13494v1.pdf","comment":"This was an extensive survey and prototyping we did to purpose and\n alternative user interface for movie recommender systems like Netflix"},{"id":"http://arxiv.org/abs/2311.13350v1","updated":"2023-11-22T12:39:28Z","published":"2023-11-22T12:39:28Z","title":"Fact-based Court Judgment Prediction","summary":" This extended abstract extends the research presented in \"ILDC for CJPE:\nIndian Legal Documents Corpus for Court Judgment Prediction and Explanation\"\n\\cite{malik-etal-2021-ildc}, focusing on fact-based judgment prediction within\nthe context of Indian legal documents. We introduce two distinct problem\nvariations: one based solely on facts, and another combining facts with rulings\nfrom lower courts (RLC). Our research aims to enhance early-phase case outcome\nprediction, offering significant benefits to legal professionals and the\ngeneral public. The results, however, indicated a performance decline compared\nto the original ILDC for CJPE study, even after implementing various weightage\nschemes in our DELSumm algorithm. Additionally, using only facts for legal\njudgment prediction with different transformer models yielded results inferior\nto the state-of-the-art outcomes reported in the \"ILDC for CJPE\" study.\n","authors":["Shubham Kumar Nigam","Aniket Deroy"],"pdf_url":"https://arxiv.org/pdf/2311.13350v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13277v1","updated":"2023-11-22T09:53:57Z","published":"2023-11-22T09:53:57Z","title":"Hierarchical Matrix Factorization for Interpretable Collaborative\n Filtering","summary":" Matrix factorization (MF) is a simple collaborative filtering technique that\nachieves superior recommendation accuracy by decomposing the user-item rating\nmatrix into user and item latent matrices. This approach relies on learning\nfrom user-item interactions, which may not effectively capture the underlying\nshared dependencies between users or items. Therefore, there is scope to\nexplicitly capture shared dependencies to further improve recommendation\naccuracy and the interpretability of learning results by summarizing user-item\ninteractions. Based on these insights, we propose \"Hierarchical Matrix\nFactorization\" (HMF), which incorporates clustering concepts to capture the\nhierarchy, where leaf nodes and other nodes correspond to users/items and\nclusters, respectively. 
Central to our approach, called hierarchical\nembeddings, is the additional decomposition of the user and item latent\nmatrices (embeddings) into probabilistic connection matrices, which link the\nhierarchy, and a root cluster latent matrix. Thus, each node is represented by\nthe weighted average of the embeddings of its parent clusters. The embeddings\nare differentiable, allowing simultaneous learning of interactions and\nclustering using a single gradient descent method. Furthermore, the obtained\ncluster-specific interactions naturally summarize user-item interactions and\nprovide interpretability. Experimental results on rating and ranking\npredictions demonstrated the competitiveness of HMF over vanilla and\nhierarchical MF methods, especially its robustness in sparse interactions.\nAdditionally, it was confirmed that the clustering integration of HMF has the\npotential for faster learning convergence and mitigation of overfitting\ncompared to MF, and also provides interpretability through a cluster-centered\ncase study.\n","authors":["Kai Sugahara","Kazushi Okamoto"],"pdf_url":"https://arxiv.org/pdf/2311.13277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13121v1","updated":"2023-11-22T02:49:14Z","published":"2023-11-22T02:49:14Z","title":"GENET: Unleashing the Power of Side Information for Recommendation via\n Hypergraph Pre-training","summary":" Recommendation with side information has drawn significant research interest\ndue to its potential to mitigate user feedback sparsity. However, existing\nmodels struggle with generalization across diverse domains and types of side\ninformation. In particular, three challenges have not been addressed, and they\nare (1) the diverse formats of side information, including text sequences. (2)\nThe diverse semantics of side information that describes items and users from\nmulti-level in a context different from recommendation systems. (3) The diverse\ncorrelations in side information to measure similarity over multiple objects\nbeyond pairwise relations. In this paper, we introduce GENET (Generalized\nhypErgraph pretraiNing on sidE informaTion), which pre-trains user and item\nrepresentations on feedback-irrelevant side information and fine-tunes the\nrepresentations on user feedback data. GENET leverages pre-training as a means\nto prevent side information from overshadowing critical ID features and\nfeedback signals. It employs a hypergraph framework to accommodate various\ntypes of diverse side information. During pre-training, GENET integrates tasks\nfor hyperlink prediction and self-supervised contrast to capture fine-grained\nsemantics at both local and global levels. Additionally, it introduces a unique\nstrategy to enhance pre-training robustness by perturbing positive samples\nwhile maintaining high-order relations. 
Extensive experiments demonstrate that\nGENET exhibits strong generalization capabilities, outperforming the SOTA\nmethod by up to 38% in TOP-N recommendation and Sequential recommendation tasks\non various datasets with different side information.\n","authors":["Yang Li","Qi'ao Zhao","Chen Lin","Zhenjie Zhang","Xiaomin Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.13121v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13626v1","updated":"2023-11-22T17:36:46Z","published":"2023-11-22T17:36:46Z","title":"Physics-driven generative adversarial networks empower single-pixel\n infrared hyperspectral imaging","summary":" A physics-driven generative adversarial network (GAN) was established here\nfor single-pixel hyperspectral imaging (HSI) in the infrared spectrum, to\neliminate the extensive data training work required by traditional data-driven\nmodel. Within the GAN framework, the physical process of single-pixel imaging\n(SPI) was integrated into the generator, and the actual and estimated\none-dimensional (1D) bucket signals were employed as constraints in the\nobjective function to update the network's parameters and optimize the\ngenerator with the assistance of the discriminator. In comparison to\nsingle-pixel infrared HSI methods based on compressed sensing and\nphysics-driven convolution neural networks, our physics-driven GAN-based\nsingle-pixel infrared HSI can achieve higher imaging performance but with fewer\nmeasurements. We believe that this physics-driven GAN will promote practical\napplications of computational imaging, especially various SPI-based techniques.\n","authors":["Dong-Yin Wang","Shu-Hang Bie","Xi-Hao Chen","Wen-Kai Yu"],"pdf_url":"https://arxiv.org/pdf/2311.13626v1.pdf","comment":"14 pages, 8 figures"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2311.13601v1","updated":"2023-11-22T18:59:48Z","published":"2023-11-22T18:59:48Z","title":"Visual In-Context Prompting","summary":" In-context prompting in large language models (LLMs) has become a prevalent\napproach to improve zero-shot capabilities, but this idea is less explored in\nthe vision domain. Existing visual prompting methods focus on referring\nsegmentation to segment the most relevant object, falling short of addressing\nmany generic vision tasks like open-set segmentation and detection. In this\npaper, we introduce a universal visual in-context prompting framework for both\ntasks. In particular, we build on top of an encoder-decoder architecture, and\ndevelop a versatile prompt encoder to support a variety of prompts like\nstrokes, boxes, and points. We further enhance it to take an arbitrary number\nof reference image segments as the context. Our extensive explorations show\nthat the proposed visual in-context prompting elicits extraordinary referring\nand generic segmentation capabilities to refer and detect, yielding competitive\nperformance to close-set in-domain datasets and showing promising results on\nmany open-set segmentation datasets. By joint training on COCO and SA-1B, our\nmodel achieves $57.7$ PQ on COCO and $23.2$ PQ on ADE20K. 
Code will be\navailable at https://github.com/UX-Decoder/DINOv.\n","authors":["Feng Li","Qing Jiang","Hao Zhang","Tianhe Ren","Shilong Liu","Xueyan Zou","Huaizhe Xu","Hongyang Li","Chunyuan Li","Jianwei Yang","Lei Zhang","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2311.13601v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2311.13600v1","updated":"2023-11-22T18:59:36Z","published":"2023-11-22T18:59:36Z","title":"ZipLoRA: Any Subject in Any Style by Effectively Merging LoRAs","summary":" Methods for finetuning generative models for concept-driven personalization\ngenerally achieve strong results for subject-driven or style-driven generation.\nRecently, low-rank adaptations (LoRA) have been proposed as a\nparameter-efficient way of achieving concept-driven personalization. While\nrecent work explores the combination of separate LoRAs to achieve joint\ngeneration of learned styles and subjects, existing techniques do not reliably\naddress the problem; they often compromise either subject fidelity or style\nfidelity. We propose ZipLoRA, a method to cheaply and effectively merge\nindependently trained style and subject LoRAs in order to achieve generation of\nany user-provided subject in any user-provided style. Experiments on a wide\nrange of subject and style combinations show that ZipLoRA can generate\ncompelling results with meaningful improvements over baselines in subject and\nstyle fidelity while preserving the ability to recontextualize. Project page:\nhttps://ziplora.github.io\n","authors":["Viraj Shah","Nataniel Ruiz","Forrester Cole","Erika Lu","Svetlana Lazebnik","Yuanzhen Li","Varun Jampani"],"pdf_url":"https://arxiv.org/pdf/2311.13600v1.pdf","comment":"Project page: https://ziplora.github.io"},{"id":"http://arxiv.org/abs/2311.13595v1","updated":"2023-11-22T18:55:27Z","published":"2023-11-22T18:55:27Z","title":"Covariance alignment: from maximum likelihood estimation to\n Gromov-Wasserstein","summary":" Feature alignment methods are used in many scientific disciplines for data\npooling, annotation, and comparison. As an instance of a permutation learning\nproblem, feature alignment presents significant statistical and computational\nchallenges. In this work, we propose the covariance alignment model to study\nand compare various alignment methods and establish a minimax lower bound for\ncovariance alignment that has a non-standard dimension scaling because of the\npresence of a nuisance parameter. This lower bound is in fact minimax optimal\nand is achieved by a natural quasi MLE. However, this estimator involves a\nsearch over all permutations which is computationally infeasible even when the\nproblem has moderate size. To overcome this limitation, we show that the\ncelebrated Gromov-Wasserstein algorithm from optimal transport which is more\namenable to fast implementation even on large-scale problems is also minimax\noptimal. 
These results give the first statistical justification for the\ndeployment of the Gromov-Wasserstein algorithm in practice.\n","authors":["Yanjun Han","Philippe Rigollet","George Stepaniants"],"pdf_url":"https://arxiv.org/pdf/2311.13595v1.pdf","comment":"41 pages, 2 figures"},{"id":"http://arxiv.org/abs/2311.13594v1","updated":"2023-11-22T18:55:25Z","published":"2023-11-22T18:55:25Z","title":"Labeling Neural Representations with Inverse Recognition","summary":" Deep Neural Networks (DNNs) demonstrated remarkable capabilities in learning\ncomplex hierarchical data representations, but the nature of these\nrepresentations remains largely unknown. Existing global explainability\nmethods, such as Network Dissection, face limitations such as reliance on\nsegmentation masks, lack of statistical significance testing, and high\ncomputational demands. We propose Inverse Recognition (INVERT), a scalable\napproach for connecting learned representations with human-understandable\nconcepts by leveraging their capacity to discriminate between these concepts.\nIn contrast to prior work, INVERT is capable of handling diverse types of\nneurons, exhibits less computational complexity, and does not rely on the\navailability of segmentation masks. Moreover, INVERT provides an interpretable\nmetric assessing the alignment between the representation and its corresponding\nexplanation and delivering a measure of statistical significance, emphasizing\nits utility and credibility. We demonstrate the applicability of INVERT in\nvarious scenarios, including the identification of representations affected by\nspurious correlations, and the interpretation of the hierarchical structure of\ndecision-making within the models.\n","authors":["Kirill Bykov","Laura Kopf","Shinichi Nakajima","Marius Kloft","Marina M. -C. Höhne"],"pdf_url":"https://arxiv.org/pdf/2311.13594v1.pdf","comment":"24 pages, 16 figures"},{"id":"http://arxiv.org/abs/2311.13589v1","updated":"2023-11-22T18:50:06Z","published":"2023-11-22T18:50:06Z","title":"Risk-sensitive Markov Decision Process and Learning under General\n Utility Functions","summary":" Reinforcement Learning (RL) has gained substantial attention across diverse\napplication domains and theoretical investigations. Existing literature on RL\ntheory largely focuses on risk-neutral settings where the decision-maker learns\nto maximize the expected cumulative reward. However, in practical scenarios\nsuch as portfolio management and e-commerce recommendations, decision-makers\noften persist in heterogeneous risk preferences subject to outcome\nuncertainties, which can not be well-captured by the risk-neural framework.\nIncorporating these preferences can be approached through utility theory, yet\nthe development of risk-sensitive RL under general utility functions remains an\nopen question for theoretical exploration.\n In this paper, we consider a scenario where the decision-maker seeks to\noptimize a general utility function of the cumulative reward in the framework\nof a Markov decision process (MDP). To facilitate the Dynamic Programming\nPrinciple and Bellman equation, we enlarge the state space with an additional\ndimension that accounts for the cumulative reward. We propose a discretized\napproximation scheme to the MDP under enlarged state space, which is tractable\nand key for algorithmic design. 
We then propose a modified value iteration\nalgorithm that employs an epsilon-covering over the space of cumulative reward.\nWhen a simulator is accessible, our algorithm efficiently learns a near-optimal\npolicy with guaranteed sample complexity. In the absence of a simulator, our\nalgorithm, designed with an upper-confidence-bound exploration approach,\nidentifies a near-optimal policy while ensuring a guaranteed regret bound. For\nboth algorithms, we match the theoretical lower bounds for the risk-neutral\nsetting.\n","authors":["Zhengqi Wu","Renyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2311.13589v1.pdf","comment":"36 pages"},{"id":"http://arxiv.org/abs/2311.13587v1","updated":"2023-11-22T18:46:05Z","published":"2023-11-22T18:46:05Z","title":"A Survey of Serverless Machine Learning Model Inference","summary":" Recent developments in Generative AI, Computer Vision, and Natural Language\nProcessing have led to an increased integration of AI models into various\nproducts. This widespread adoption of AI requires significant efforts in\ndeploying these models in production environments. When hosting machine\nlearning models for real-time predictions, it is important to meet defined\nService Level Objectives (SLOs), ensuring reliability, minimal downtime, and\noptimizing operational costs of the underlying infrastructure. Large machine\nlearning models often demand GPU resources for efficient inference to meet\nSLOs. In the context of these trends, there is growing interest in hosting AI\nmodels in a serverless architecture while still providing GPU access for\ninference tasks. This survey aims to summarize and categorize the emerging\nchallenges and optimization opportunities for large-scale deep learning serving\nsystems. By providing a novel taxonomy and summarizing recent trends, we hope\nthat this survey could shed light on new optimization perspectives and motivate\nnovel works in large-scale deep learning serving systems.\n","authors":["Kamil Kojs"],"pdf_url":"https://arxiv.org/pdf/2311.13587v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2311.13584v1","updated":"2023-11-22T18:40:45Z","published":"2023-11-22T18:40:45Z","title":"On diffusion-based generative models and their error bounds: The\n log-concave case with full convergence estimates","summary":" We provide full theoretical guarantees for the convergence behaviour of\ndiffusion-based generative models under the assumption of strongly logconcave\ndata distributions while our approximating class of functions used for score\nestimation is made of Lipschitz continuous functions. We demonstrate via a\nmotivating example, sampling from a Gaussian distribution with unknown mean,\nthe powerfulness of our approach. In this case, explicit estimates are provided\nfor the associated optimization problem, i.e. score approximation, while these\nare combined with the corresponding sampling estimates. As a result, we obtain\nthe best known upper bound estimates in terms of key quantities of interest,\nsuch as the dimension and rates of convergence, for the Wasserstein-2 distance\nbetween the data distribution (Gaussian with unknown mean) and our sampling\nalgorithm.\n Beyond the motivating example and in order to allow for the use of a diverse\nrange of stochastic optimizers, we present our results using an $L^2$-accurate\nscore estimation assumption, which crucially is formed under an expectation\nwith respect to the stochastic optimizer and our novel auxiliary process that\nuses only known information. 
This approach yields the best known convergence\nrate for our sampling algorithm.\n","authors":["Stefano Bruno","Ying Zhang","Dong-Young Lim","Ömer Deniz Akyildiz","Sotirios Sabanis"],"pdf_url":"https://arxiv.org/pdf/2311.13584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13583v1","updated":"2023-11-22T18:40:18Z","published":"2023-11-22T18:40:18Z","title":"Adaptive Sampling for Deep Learning via Efficient Nonparametric Proxies","summary":" Data sampling is an effective method to improve the training speed of neural\nnetworks, with recent results demonstrating that it can even break the neural\nscaling laws. These results critically rely on high-quality scores to estimate\nthe importance of an input to the network. We observe that there are two\ndominant strategies: static sampling, where the scores are determined before\ntraining, and dynamic sampling, where the scores can depend on the model\nweights. Static algorithms are computationally inexpensive but less effective\nthan their dynamic counterparts, which can cause end-to-end slowdown due to\ntheir need to explicitly compute losses. To address this problem, we propose a\nnovel sampling distribution based on nonparametric kernel regression that\nlearns an effective importance score as the neural network trains. However,\nnonparametric regression models are too computationally expensive to accelerate\nend-to-end training. Therefore, we develop an efficient sketch-based\napproximation to the Nadaraya-Watson estimator. Using recent techniques from\nhigh-dimensional statistics and randomized algorithms, we prove that our\nNadaraya-Watson sketch approximates the estimator with exponential convergence\nguarantees. Our sampling algorithm outperforms the baseline in terms of\nwall-clock time and accuracy on four datasets.\n","authors":["Shabnam Daghaghi","Benjamin Coleman","Benito Geordie","Anshumali Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2311.13583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13580v1","updated":"2023-11-22T18:34:49Z","published":"2023-11-22T18:34:49Z","title":"$σ$-PCA: a unified neural model for linear and nonlinear principal\n component analysis","summary":" Linear principal component analysis (PCA), nonlinear PCA, and linear\nindependent component analysis (ICA) -- those are three methods with\nsingle-layer autoencoder formulations for learning linear transformations from\ndata. Linear PCA learns orthogonal transformations (rotations) that orient axes\nto maximise variance, but it suffers from a subspace rotational indeterminacy:\nit fails to find a unique rotation for axes that share the same variance. Both\nnonlinear PCA and linear ICA reduce the subspace indeterminacy from rotational\nto permutational by maximising statistical independence under the assumption of\nunit variance. The main difference between them is that nonlinear PCA only\nlearns rotations while linear ICA learns not just rotations but any linear\ntransformation with unit variance. The relationship between all three can be\nunderstood by the singular value decomposition of the linear ICA transformation\ninto a sequence of rotation, scale, rotation. Linear PCA learns the first\nrotation; nonlinear PCA learns the second. The scale is simply the inverse of\nthe standard deviations. The problem is that, in contrast to linear PCA,\nconventional nonlinear PCA cannot be used directly on the data to learn the\nfirst rotation, the first being special as it reduces dimensionality and orders\nby variances. 
In this paper, we have identified the cause, and as a solution we\npropose $\\sigma$-PCA: a unified neural model for linear and nonlinear PCA as\nsingle-layer autoencoders. One of its key ingredients: modelling not just the\nrotation but also the scale -- the variances. This model bridges the disparity\nbetween linear and nonlinear PCA. And so, like linear PCA, it can learn a\nsemi-orthogonal transformation that reduces dimensionality and orders by\nvariances, but, unlike linear PCA, it does not suffer from rotational\nindeterminacy.\n","authors":["Fahdi Kanavati","Lucy Katsnith","Masayuki Tsuneki"],"pdf_url":"https://arxiv.org/pdf/2311.13580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19274v2","updated":"2023-11-22T18:27:15Z","published":"2023-10-30T05:13:58Z","title":"Prediction of Effective Elastic Moduli of Rocks using Graph Neural\n Networks","summary":" This study presents a Graph Neural Networks (GNNs)-based approach for\npredicting the effective elastic moduli of rocks from their digital CT-scan\nimages. We use the Mapper algorithm to transform 3D digital rock images into\ngraph datasets, encapsulating essential geometrical information. These graphs,\nafter training, prove effective in predicting elastic moduli. Our GNN model\nshows robust predictive capabilities across various graph sizes derived from\nvarious subcube dimensions. Not only does it perform well on the test dataset,\nbut it also maintains high prediction accuracy for unseen rocks and unexplored\nsubcube sizes. Comparative analysis with Convolutional Neural Networks (CNNs)\nreveals the superior performance of GNNs in predicting unseen rock properties.\nMoreover, the graph representation of microstructures significantly reduces GPU\nmemory requirements (compared to the grid representation for CNNs), enabling\ngreater flexibility in the batch size selection. This work demonstrates the\npotential of GNN models in enhancing the prediction accuracy of rock properties\nand boosting the efficiency of digital rock analysis.\n","authors":["Jaehong Chung","Rasool Ahmad","WaiChing Sun","Wei Cai","Tapan Mukerji"],"pdf_url":"https://arxiv.org/pdf/2310.19274v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13552v1","updated":"2023-11-22T17:50:00Z","published":"2023-11-22T17:50:00Z","title":"A Unified Framework for Trace-induced Quantum Kernels","summary":" Quantum kernel methods are promising candidates for achieving a practical\nquantum advantage for certain machine learning tasks. Similar to classical\nmachine learning, an exact form of a quantum kernel is expected to have a great\nimpact on the model performance. In this work we combine all trace-induced\nquantum kernels, including the commonly-used global fidelity and local\nprojected quantum kernels, into a common framework. We show how generalized\ntrace-induced quantum kernels can be constructed as combinations of the\nfundamental building blocks we coin \"Lego\" kernels, which impose an inductive\nbias on the resulting quantum models. We relate the expressive power and\ngeneralization ability to the number of non-zero weight Lego kernels and\npropose a systematic approach to increase the complexity of a quantum kernel\nmodel, leading to a new form of the local projected kernels that require fewer\nquantum resources in terms of the number of quantum gates and measurement\nshots. We show numerically that models based on local projected kernels can\nachieve comparable performance to the global fidelity quantum kernel. 
Our work\nunifies existing quantum kernels and provides a systematic framework to compare\ntheir properties.\n","authors":["Beng Yee Gan","Daniel Leykam","Supanut Thanasilp"],"pdf_url":"https://arxiv.org/pdf/2311.13552v1.pdf","comment":"12 + 15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.13548v1","updated":"2023-11-22T17:44:18Z","published":"2023-11-22T17:44:18Z","title":"Efficient Numerical Integration in Reproducing Kernel Hilbert Spaces via\n Leverage Scores Sampling","summary":" In this work we consider the problem of numerical integration, i.e.,\napproximating integrals with respect to a target probability measure using only\npointwise evaluations of the integrand. We focus on the setting in which the\ntarget distribution is only accessible through a set of $n$ i.i.d.\nobservations, and the integrand belongs to a reproducing kernel Hilbert space.\nWe propose an efficient procedure which exploits a small i.i.d. random subset\nof $m \\epsilon$, where $\\|\\mathbf p-\\mathbf q\\|_{A_k}$ denotes\nthe generalized ${A}_k$ distance between $\\mathbf p$ and $\\mathbf q$ --\nmeasuring the maximum discrepancy between the distributions over any collection\nof $k$ disjoint, axis-aligned rectangles. Our main result is the first\ncloseness tester for this problem with {\\em sub-learning} sample complexity in\nany fixed dimension and a nearly-matching sample complexity lower bound.\n In more detail, we provide a computationally efficient closeness tester with\nsample complexity $O\\left((k^{6/7}/ \\mathrm{poly}_d(\\epsilon))\n\\log^d(k)\\right)$. On the lower bound side, we establish a qualitatively\nmatching sample complexity lower bound of\n$\\Omega(k^{6/7}/\\mathrm{poly}(\\epsilon))$, even for $d=2$. These sample\ncomplexity bounds are surprising because the sample complexity of the problem\nin the univariate setting is $\\Theta(k^{4/5}/\\mathrm{poly}(\\epsilon))$. This\nhas the interesting consequence that the jump from one to two dimensions leads\nto a substantial increase in sample complexity, while increases beyond that do\nnot.\n As a corollary of our general $A_k$ tester, we obtain $d_{\\mathrm\nTV}$-closeness testers for pairs of $k$-histograms on $\\mathbb R^d$ over a\ncommon unknown partition, and pairs of uniform distributions supported on the\nunion of $k$ unknown disjoint axis-aligned rectangles.\n Both our algorithm and our lower bound make essential use of tools from\nRamsey theory.\n","authors":["Ilias Diakonikolas","Daniel M. Kane","Sihan Liu"],"pdf_url":"https://arxiv.org/pdf/2311.13154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09936v2","updated":"2023-11-22T04:29:33Z","published":"2023-08-19T07:53:43Z","title":"BLIVA: A Simple Multimodal LLM for Better Handling of Text-Rich Visual\n Questions","summary":" Vision Language Models (VLMs), which extend Large Language Models (LLM) by\nincorporating visual understanding capability, have demonstrated significant\nadvancements in addressing open-ended visual question-answering (VQA) tasks.\nHowever, these models cannot accurately interpret images infused with text, a\ncommon occurrence in real-world scenarios. Standard procedures for extracting\ninformation from images often involve learning a fixed set of query embeddings.\nThese embeddings are designed to encapsulate image contexts and are later used\nas soft prompt inputs in LLMs. Yet, this process is limited to the token count,\npotentially curtailing the recognition of scenes with text-rich context. 
To\nimprove upon them, the present study introduces BLIVA: an augmented version of\nInstructBLIP with Visual Assistant. BLIVA incorporates the query embeddings\nfrom InstructBLIP and also directly projects encoded patch embeddings into the\nLLM, a technique inspired by LLaVA. This approach assists the model to capture\nintricate details potentially missed during the query decoding process.\nEmpirical evidence demonstrates that our model, BLIVA, significantly enhances\nperformance in processing text-rich VQA benchmarks (up to 17.76% in OCR-VQA\nbenchmark) and in undertaking general (not particularly text-rich) VQA\nbenchmarks (up to 7.9% in Visual Spatial Reasoning benchmark), comparing to our\nbaseline InstructBLIP. BLIVA demonstrates significant capability in decoding\nreal-world images, irrespective of text presence. To demonstrate the broad\nindustry applications enabled by BLIVA, we evaluate the model using a new\ndataset comprising YouTube thumbnails paired with question-answer sets across\n11 diverse categories. For researchers interested in further exploration, our\ncode and models are freely accessible at https://github.com/mlpc-ucsd/BLIVA.\n","authors":["Wenbo Hu","Yifan Xu","Yi Li","Weiyue Li","Zeyuan Chen","Zhuowen Tu"],"pdf_url":"https://arxiv.org/pdf/2308.09936v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13147v1","updated":"2023-11-22T04:18:23Z","published":"2023-11-22T04:18:23Z","title":"Optimal Transport with Cyclic Symmetry","summary":" We propose novel fast algorithms for optimal transport (OT) utilizing a\ncyclic symmetry structure of input data. Such OT with cyclic symmetry appears\nuniversally in various real-world examples: image processing, urban planning,\nand graph processing. Our main idea is to reduce OT to a small optimization\nproblem that has significantly fewer variables by utilizing cyclic symmetry and\nvarious optimization techniques. On the basis of this reduction, our algorithms\nsolve the small optimization problem instead of the original OT. As a result,\nour algorithms obtain the optimal solution and the objective function value of\nthe original OT faster than solving the original OT directly. In this paper,\nour focus is on two crucial OT formulations: the linear programming OT (LOT)\nand the strongly convex-regularized OT, which includes the well-known\nentropy-regularized OT (EROT). Experiments show the effectiveness of our\nalgorithms for LOT and EROT in synthetic/real-world data that has a\nstrict/approximate cyclic symmetry structure. Through theoretical and\nexperimental results, this paper successfully introduces the concept of\nsymmetry into the OT research field for the first time.\n","authors":["Shoichiro Takeda","Yasunori Akagi","Naoki Marumo","Kenta Niwa"],"pdf_url":"https://arxiv.org/pdf/2311.13147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12068v2","updated":"2023-11-22T04:13:38Z","published":"2023-11-19T17:28:28Z","title":"Enhancing Novel Object Detection via Cooperative Foundational Models","summary":" In this work, we address the challenging and emergent problem of novel object\ndetection (NOD), focusing on the accurate detection of both known and novel\nobject categories during inference. Traditional object detection algorithms are\ninherently closed-set, limiting their capability to handle NOD. We present a\nnovel approach to transform existing closed-set detectors into open-set\ndetectors. 
This transformation is achieved by leveraging the complementary\nstrengths of pre-trained foundational models, specifically CLIP and SAM,\nthrough our cooperative mechanism. Furthermore, by integrating this mechanism\nwith state-of-the-art open-set detectors such as GDINO, we establish new\nbenchmarks in object detection performance. Our method achieves 17.42 mAP in\nnovel object detection and 42.08 mAP for known objects on the challenging LVIS\ndataset. Adapting our approach to the COCO OVD split, we surpass the current\nstate-of-the-art by a margin of 7.2 $ \\text{AP}_{50} $ for novel classes. Our\ncode is available at\nhttps://github.com/rohit901/cooperative-foundational-models .\n","authors":["Rohit Bharadwaj","Muzammal Naseer","Salman Khan","Fahad Shahbaz Khan"],"pdf_url":"https://arxiv.org/pdf/2311.12068v2.pdf","comment":"Code: https://github.com/rohit901/cooperative-foundational-models"},{"id":"http://arxiv.org/abs/2309.11983v2","updated":"2023-11-22T04:10:53Z","published":"2023-09-21T11:39:33Z","title":"Variational Connectionist Temporal Classification for Order-Preserving\n Sequence Modeling","summary":" Connectionist temporal classification (CTC) is commonly adopted for sequence\nmodeling tasks like speech recognition, where it is necessary to preserve order\nbetween the input and target sequences. However, CTC is only applied to\ndeterministic sequence models, where the latent space is discontinuous and\nsparse, which in turn makes them less capable of handling data variability when\ncompared to variational models. In this paper, we integrate CTC with a\nvariational model and derive loss functions that can be used to train more\ngeneralizable sequence models that preserve order. Specifically, we derive two\nversions of the novel variational CTC based on two reasonable assumptions, the\nfirst being that the variational latent variables at each time step are\nconditionally independent; and the second being that these latent variables are\nMarkovian. We show that both loss functions allow direct optimization of the\nvariational lower bound for the model log-likelihood, and present\ncomputationally tractable forms for implementing them.\n","authors":["Zheng Nan","Ting Dang","Vidhyasaharan Sethu","Beena Ahmed"],"pdf_url":"https://arxiv.org/pdf/2309.11983v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.02645v4","updated":"2023-11-22T03:43:55Z","published":"2022-05-05T13:44:24Z","title":"Discovering stochastic dynamical equations from biological time series\n data","summary":" Stochastic differential equations (SDEs) are an important framework to model\ndynamics with randomness, as is common in most biological systems. The inverse\nproblem of integrating these models with empirical data remains a major\nchallenge. Here, we present a software package, PyDaDDy (Python Library for\nData Driven Dynamics) that takes time series data as an input and outputs an\ninterpretable SDE. We achieve this by combining traditional approaches from\nstochastic calculus literature with state-of-the-art equation discovery\ntechniques. We validate our approach on synthetic datasets, and demonstrate the\ngenerality and applicability of the method on two real-world datasets of vastly\ndifferent spatiotemporal scales: (i) collective movement of fish school where\nstochasticity plays a crucial role, and (ii) confined migration of a single\ncell, primarily following a relaxed oscillation. 
We make the method available\nas an easy-to-use, open-source Python package, PyDaddy (Python Library for Data\nDriven Dynamics).\n","authors":["Arshed Nabeel","Ashwin Karichannavar","Shuaib Palathingal","Jitesh Jhawar","David B. Brückner","Danny Raj M.","Vishwesha Guttal"],"pdf_url":"https://arxiv.org/pdf/2205.02645v4.pdf","comment":"15 pages (+ 9 page appendix), 6 figures (+ 8 appendix figures).\n Updates: v3: Significantly reorganized the paper and added a section analysis\n of a cell migration dataset. v4: Update arXiv title to match the updated\n title of the manuscript"},{"id":"http://arxiv.org/abs/2311.13133v1","updated":"2023-11-22T03:37:01Z","published":"2023-11-22T03:37:01Z","title":"LIMIT: Less Is More for Instruction Tuning Across Evaluation Paradigms","summary":" Large Language Models are traditionally finetuned on large instruction\ndatasets. However recent studies suggest that small, high-quality datasets can\nsuffice for general purpose instruction following. This lack of consensus\nsurrounding finetuning best practices is in part due to rapidly diverging\napproaches to LLM evaluation. In this study, we ask whether a small amount of\ndiverse finetuning samples can improve performance on both traditional\nperplexity-based NLP benchmarks, and on open-ended, model-based evaluation. We\nfinetune open-source MPT-7B and MPT-30B models on instruction finetuning\ndatasets of various sizes ranging from 1k to 60k samples. We find that subsets\nof 1k-6k instruction finetuning samples are sufficient to achieve good\nperformance on both (1) traditional NLP benchmarks and (2) model-based\nevaluation. Finally, we show that mixing textbook-style and open-ended QA\nfinetuning datasets optimizes performance on both evaluation paradigms.\n","authors":["Aditi Jha","Sam Havens","Jeremey Dohmann","Alex Trott","Jacob Portes"],"pdf_url":"https://arxiv.org/pdf/2311.13133v1.pdf","comment":"36 pages, 12 figures, NeurIPS 2023 Workshop on Instruction Tuning and\n Instruction Following"},{"id":"http://arxiv.org/abs/2306.08280v2","updated":"2023-11-22T03:22:18Z","published":"2023-06-14T06:35:10Z","title":"Differentially Private Wireless Federated Learning Using Orthogonal\n Sequences","summary":" We propose a privacy-preserving uplink over-the-air computation (AirComp)\nmethod, termed FLORAS, for single-input single-output (SISO) wireless federated\nlearning (FL) systems. From the perspective of communication designs, FLORAS\neliminates the requirement of channel state information at the transmitters\n(CSIT) by leveraging the properties of orthogonal sequences. From the privacy\nperspective, we prove that FLORAS offers both item-level and client-level\ndifferential privacy (DP) guarantees. Moreover, by properly adjusting the\nsystem parameters, FLORAS can flexibly achieve different DP levels at no\nadditional cost. A new FL convergence bound is derived which, combined with the\nprivacy guarantees, allows for a smooth tradeoff between the achieved\nconvergence rate and differential privacy levels. Experimental results\ndemonstrate the advantages of FLORAS compared with the baseline AirComp method,\nand validate that the analytical results can guide the design of\nprivacy-preserving FL with different tradeoff requirements on the model\nconvergence and privacy levels.\n","authors":["Xizixiang Wei","Tianhao Wang","Ruiquan Huang","Cong Shen","Jing Yang","H. 
Vincent Poor"],"pdf_url":"https://arxiv.org/pdf/2306.08280v2.pdf","comment":"33 pages, 5 figures"},{"id":"http://arxiv.org/abs/2306.04889v2","updated":"2023-11-22T03:02:46Z","published":"2023-06-08T02:35:30Z","title":"ShaDDR: Interactive Example-Based Geometry and Texture Generation via 3D\n Shape Detailization and Differentiable Rendering","summary":" We present ShaDDR, an example-based deep generative neural network which\nproduces a high-resolution textured 3D shape through geometry detailization and\nconditional texture generation applied to an input coarse voxel shape. Trained\non a small set of detailed and textured exemplar shapes, our method learns to\ndetailize the geometry via multi-resolution voxel upsampling and generate\ntextures on voxel surfaces via differentiable rendering against exemplar\ntexture images from a few views. The generation is interactive, taking less\nthan 1 second to produce a 3D model with voxel resolutions up to 512^3. The\ngenerated shape preserves the overall structure of the input coarse voxel\nmodel, while the style of the generated geometric details and textures can be\nmanipulated through learned latent codes. In the experiments, we show that our\nmethod can generate higher-resolution shapes with plausible and improved\ngeometric details and clean textures compared to prior works. Furthermore, we\nshowcase the ability of our method to learn geometric details and textures from\nshapes reconstructed from real-world photos. In addition, we have developed an\ninteractive modeling application to demonstrate the generalizability of our\nmethod to various user inputs and the controllability it offers, allowing users\nto interactively sculpt a coarse voxel shape to define the overall structure of\nthe detailized 3D shape. Code and data are available at\nhttps://github.com/qiminchen/ShaDDR.\n","authors":["Qimin Chen","Zhiqin Chen","Hang Zhou","Hao Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.04889v2.pdf","comment":"Accepted to SIGGRAPH Asia 2023 conference track. Code:\n https://github.com/qiminchen/ShaDDR"},{"id":"http://arxiv.org/abs/2311.10863v3","updated":"2023-11-22T03:01:59Z","published":"2023-11-17T20:51:24Z","title":"Verified Compositional Neuro-Symbolic Control for Stochastic Systems\n with Temporal Logic Tasks","summary":" Several methods have been proposed recently to learn neural network (NN)\ncontrollers for autonomous agents, with unknown and stochastic dynamics, tasked\nwith complex missions captured by Linear Temporal Logic (LTL). Due to the\nsample-inefficiency of the majority of these works, compositional learning\nmethods have been proposed decomposing the LTL specification into smaller\nsub-tasks. Then, separate controllers are learned and composed to satisfy the\noriginal task. A key challenge within these approaches is that they often lack\nsafety guarantees or the provided guarantees are impractical. This paper aims\nto address this challenge. Particularly, we consider autonomous systems with\nunknown and stochastic dynamics and LTL-encoded tasks. We assume that the\nsystem is equipped with a finite set of base skills modeled by trained NN\nfeedback controllers. Our goal is to check if there exists a temporal\ncomposition of the trained NN controllers - and if so, to compute it - that\nwill yield a composite system behavior that satisfies the assigned LTL task\nwith probability one. 
We propose a new approach that relies on a novel\nintegration of automata theory and data-driven reachability analysis tools for\nNN-controlled stochastic systems. The resulting neuro-symbolic controller\nallows the agent to generate safe behaviors for unseen complex temporal logic\ntasks in a zero-shot fashion by leveraging its base skills. We show correctness\nof the proposed method and we provide conditions under which it is complete. To\nthe best of our knowledge, this is the first work that designs verified\ntemporal compositions of NN controllers for unknown and stochastic systems.\nFinally, we provide extensive numerical simulations and hardware experiments on\nrobot navigation tasks to demonstrate the proposed method.\n","authors":["Jun Wang","Haojun Chen","Zihe Sun","Yiannis Kantaros"],"pdf_url":"https://arxiv.org/pdf/2311.10863v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2209.06130"},{"id":"http://arxiv.org/abs/2311.13118v1","updated":"2023-11-22T02:45:01Z","published":"2023-11-22T02:45:01Z","title":"Combatting Human Trafficking in the Cyberspace: A Natural Language\n Processing-Based Methodology to Analyze the Language in Online Advertisements","summary":" This project tackles the pressing issue of human trafficking in online C2C\nmarketplaces through advanced Natural Language Processing (NLP) techniques. We\nintroduce a novel methodology for generating pseudo-labeled datasets with\nminimal supervision, serving as a rich resource for training state-of-the-art\nNLP models. Focusing on tasks like Human Trafficking Risk Prediction (HTRP) and\nOrganized Activity Detection (OAD), we employ cutting-edge Transformer models\nfor analysis. A key contribution is the implementation of an interpretability\nframework using Integrated Gradients, providing explainable insights crucial\nfor law enforcement. This work not only fills a critical gap in the literature\nbut also offers a scalable, machine learning-driven approach to combat human\nexploitation online. It serves as a foundation for future research and\npractical applications, emphasizing the role of machine learning in addressing\ncomplex social issues.\n","authors":["Alejandro Rodriguez Perez","Pablo Rivas"],"pdf_url":"https://arxiv.org/pdf/2311.13118v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08131v2","updated":"2023-11-22T02:35:37Z","published":"2022-12-15T20:36:10Z","title":"Bridging the Gap Between Offline and Online Reinforcement Learning\n Evaluation Methodologies","summary":" Reinforcement learning (RL) has shown great promise with algorithms learning\nin environments with large state and action spaces purely from scalar reward\nsignals. A crucial challenge for current deep RL algorithms is that they\nrequire a tremendous amount of environment interactions for learning. This can\nbe infeasible in situations where such interactions are expensive; such as in\nrobotics. Offline RL algorithms try to address this issue by bootstrapping the\nlearning process from existing logged data without needing to interact with the\nenvironment from the very beginning. While online RL algorithms are typically\nevaluated as a function of the number of environment interactions, there exists\nno single established protocol for evaluating offline RL methods.In this paper,\nwe propose a sequential approach to evaluate offline RL algorithms as a\nfunction of the training set size and thus by their data efficiency. 
Sequential\nevaluation provides valuable insights into the data efficiency of the learning\nprocess and the robustness of algorithms to distribution changes in the dataset\nwhile also harmonizing the visualization of the offline and online learning\nphases. Our approach is generally applicable and easy to implement. We compare\nseveral existing offline RL algorithms using this approach and present insights\nfrom a variety of tasks and offline datasets.\n","authors":["Shivakanth Sujit","Pedro H. M. Braga","Jorg Bornschein","Samira Ebrahimi Kahou"],"pdf_url":"https://arxiv.org/pdf/2212.08131v2.pdf","comment":"TMLR 2023"},{"id":"http://arxiv.org/abs/2305.16854v3","updated":"2023-11-22T02:29:13Z","published":"2023-05-26T12:04:59Z","title":"Channel and Gradient-Importance Aware Device Scheduling for Over-the-Air\n Federated Learning","summary":" Federated learning (FL) is a popular privacy-preserving distributed training\nscheme, where multiple devices collaborate to train machine learning models by\nuploading local model updates. To improve communication efficiency,\nover-the-air computation (AirComp) has been applied to FL, which leverages\nanalog modulation to harness the superposition property of radio waves such\nthat numerous devices can upload their model updates concurrently for\naggregation. However, the uplink channel noise incurs considerable model\naggregation distortion, which is critically determined by the device scheduling\nand compromises the learned model performance. In this paper, we propose a\nprobabilistic device scheduling framework for over-the-air FL, named PO-FL, to\nmitigate the negative impact of channel noise, where each device is scheduled\naccording to a certain probability and its model update is reweighted using\nthis probability in aggregation. We prove the unbiasedness of this aggregation\nscheme and demonstrate the convergence of PO-FL on both convex and non-convex\nloss functions. Our convergence bounds unveil that the device scheduling\naffects the learning performance through the communication distortion and\nglobal update variance. Based on the convergence analysis, we further develop a\nchannel and gradient-importance aware algorithm to optimize the device\nscheduling probabilities in PO-FL. Extensive simulation results show that the\nproposed PO-FL framework with channel and gradient-importance awareness\nachieves faster convergence and produces better models than baseline methods.\n","authors":["Yuchang Sun","Zehong lin","Yuyi Mao","Shi Jin","Jun Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.16854v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13110v1","updated":"2023-11-22T02:23:32Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. 
From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v1.pdf","comment":"This paper integrates the works arXiv:2306.01129 and\n arXiv:2308.16271, as well as this under-review work:\n https://openreview.net/forum?id=PvyOYleymy into a complete story. In this\n paper, we improve the writing and organization, and also add conceptual,\n empirical, and theoretical improvements over the previous work"},{"id":"http://arxiv.org/abs/2311.13102v1","updated":"2023-11-22T02:04:35Z","published":"2023-11-22T02:04:35Z","title":"Detecting out-of-distribution text using topological features of\n transformer-based language models","summary":" We attempt to detect out-of-distribution (OOD) text samples though applying\nTopological Data Analysis (TDA) to attention maps in transformer-based language\nmodels. We evaluate our proposed TDA-based approach for out-of-distribution\ndetection on BERT, a transformer-based language model, and compare the to a\nmore traditional OOD approach based on BERT CLS embeddings. We found that our\nTDA approach outperforms the CLS embedding approach at distinguishing\nin-distribution data (politics and entertainment news articles from HuffPost)\nfrom far out-of-domain samples (IMDB reviews), but its effectiveness\ndeteriorates with near out-of-domain (CNN/Dailymail) or same-domain (business\nnews articles from HuffPost) datasets.\n","authors":["Andres Pollano","Anupam Chaudhuri","Anj Simmons"],"pdf_url":"https://arxiv.org/pdf/2311.13102v1.pdf","comment":"12 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2311.13099v1","updated":"2023-11-22T01:58:26Z","published":"2023-11-22T01:58:26Z","title":"PIE-NeRF: Physics-based Interactive Elastodynamics with NeRF","summary":" We show that physics-based simulations can be seamlessly integrated with NeRF\nto generate high-quality elastodynamics of real-world objects. 
Unlike existing\nmethods, we discretize nonlinear hyperelasticity in a meshless way, obviating\nthe necessity for intermediate auxiliary shape proxies like a tetrahedral mesh\nor voxel grid. A quadratic generalized moving least square (Q-GMLS) is employed\nto capture nonlinear dynamics and large deformation on the implicit model. Such\nmeshless integration enables versatile simulations of complex and codimensional\nshapes. We adaptively place the least-square kernels according to the NeRF\ndensity field to significantly reduce the complexity of the nonlinear\nsimulation. As a result, physically realistic animations can be conveniently\nsynthesized using our method for a wide range of hyperelastic materials at an\ninteractive rate. For more information, please visit our project page at\nhttps://fytalon.github.io/pienerf/.\n","authors":["Yutao Feng","Yintong Shang","Xuan Li","Tianjia Shao","Chenfanfu Jiang","Yin Yang"],"pdf_url":"https://arxiv.org/pdf/2311.13099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13094v1","updated":"2023-11-22T01:50:43Z","published":"2023-11-22T01:50:43Z","title":"Newton-CG methods for nonconvex unconstrained optimization with Hölder\n continuous Hessian","summary":" In this paper we consider a nonconvex unconstrained optimization problem\nminimizing a twice differentiable objective function with H\\\"older continuous\nHessian. Specifically, we first propose a Newton-conjugate gradient (Newton-CG)\nmethod for finding an approximate first-order stationary point (FOSP) of this\nproblem, assuming the associated the H\\\"older parameters are explicitly known.\nThen we develop a parameter-free Newton-CG method without requiring any prior\nknowledge of these parameters. To the best of our knowledge, this method is the\nfirst parameter-free second-order method achieving the best-known iteration and\noperation complexity for finding an approximate FOSP of this problem.\nFurthermore, we propose a Newton-CG method for finding an approximate\nsecond-order stationary point (SOSP) of the considered problem with high\nprobability and establish its iteration and operation complexity. Finally, we\npresent preliminary numerical results to demonstrate the superior practical\nperformance of our parameter-free Newton-CG method over a well-known\nregularized Newton method.\n","authors":["Chuan He","Zhaosong Lu"],"pdf_url":"https://arxiv.org/pdf/2311.13094v1.pdf","comment":"arXiv admin note: text overlap with arXiv:2301.03139"},{"id":"http://arxiv.org/abs/2311.13091v1","updated":"2023-11-22T01:43:57Z","published":"2023-11-22T01:43:57Z","title":"Stable Unlearnable Example: Enhancing the Robustness of Unlearnable\n Examples via Stable Error-Minimizing Noise","summary":" The open source of large amounts of image data promotes the development of\ndeep learning techniques. Along with this comes the privacy risk of these\nopen-source image datasets being exploited by unauthorized third parties to\ntrain deep learning models for commercial or illegal purposes. To avoid the\nabuse of public data, a poisoning-based technique, the unlearnable example, is\nproposed to significantly degrade the generalization performance of models by\nadding a kind of imperceptible noise to the data. 
To further enhance its\nrobustness against adversarial training, existing works leverage iterative\nadversarial training on both the defensive noise and the surrogate model.\nHowever, it still remains unknown whether the robustness of unlearnable\nexamples primarily comes from the effect of enhancement in the surrogate model\nor the defensive noise. Observing that simply removing the adversarial noise on\nthe training process of the defensive noise can improve the performance of\nrobust unlearnable examples, we identify that solely the surrogate model's\nrobustness contributes to the performance. Furthermore, we found a negative\ncorrelation exists between the robustness of defensive noise and the protection\nperformance, indicating defensive noise's instability issue. Motivated by this,\nto further boost the robust unlearnable example, we introduce stable\nerror-minimizing noise (SEM), which trains the defensive noise against random\nperturbation instead of the time-consuming adversarial perturbation to improve\nthe stability of defensive noise. Through extensive experiments, we demonstrate\nthat SEM achieves a new state-of-the-art performance on CIFAR-10, CIFAR-100,\nand ImageNet Subset in terms of both effectiveness and efficiency. The code is\navailable at https://github.com/liuyixin-louis/Stable-Unlearnable-Example.\n","authors":["Yixin Liu","Kaidi Xu","Xun Chen","Lichao Sun"],"pdf_url":"https://arxiv.org/pdf/2311.13091v1.pdf","comment":"14 pages, 11 figures, 13 tables"},{"id":"http://arxiv.org/abs/2311.12166v2","updated":"2023-11-22T01:42:59Z","published":"2023-11-20T20:32:14Z","title":"Creating Temporally Correlated High-Resolution Power Injection Profiles\n Using Physics-Aware GAN","summary":" Traditional smart meter measurements lack the granularity needed for\nreal-time decision-making. To address this practical problem, we create a\ngenerative adversarial networks (GAN) model that enforces temporal consistency\non its high-resolution outputs via hard inequality constraints using a convex\noptimization layer. A unique feature of our GAN model is that it is trained\nsolely on slow timescale aggregated power information obtained from historical\nsmart meter data. The results demonstrate that the model can successfully\ncreate minutely interval temporally-correlated instantaneous power injection\nprofiles from 15-minute average power consumption information. This innovative\napproach, emphasizing inter-neuron constraints, offers a promising avenue for\nimproved high-speed state estimation in distribution systems and enhances the\napplicability of data-driven solutions for monitoring such systems.\n","authors":["Hritik Gopal Shah","Behrouz Azimian","Anamitra Pal"],"pdf_url":"https://arxiv.org/pdf/2311.12166v2.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2310.12942v3","updated":"2023-11-22T01:39:59Z","published":"2023-10-19T17:39:47Z","title":"On the Representational Capacity of Recurrent Neural Language Models","summary":" This work investigates the computational expressivity of language models\n(LMs) based on recurrent neural networks (RNNs). Siegelmann and Sontag (1992)\nfamously showed that RNNs with rational weights and hidden states and unbounded\ncomputation time are Turing complete. However, LMs define weightings over\nstrings in addition to just (unweighted) language membership and the analysis\nof the computational power of RNN LMs (RLMs) should reflect this. 
We extend the\nTuring completeness result to the probabilistic case, showing how a rationally\nweighted RLM with unbounded computation time can simulate any deterministic\nprobabilistic Turing machine (PTM) with rationally weighted transitions. Since,\nin practice, RLMs work in real-time, processing a symbol at every time step, we\ntreat the above result as an upper bound on the expressivity of RLMs. We also\nprovide a lower bound by showing that under the restriction to real-time\ncomputation, such models can simulate deterministic real-time rational PTMs.\n","authors":["Franz Nowak","Anej Svete","Li Du","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2310.12942v3.pdf","comment":"To be published at EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.13087v1","updated":"2023-11-22T01:32:06Z","published":"2023-11-22T01:32:06Z","title":"Predict-Then-Optimize by Proxy: Learning Joint Models of Prediction and\n Optimization","summary":" Many real-world decision processes are modeled by optimization problems whose\ndefining parameters are unknown and must be inferred from observable data. The\nPredict-Then-Optimize framework uses machine learning models to predict unknown\nparameters of an optimization problem from features before solving. Recent\nworks show that decision quality can be improved in this setting by solving and\ndifferentiating the optimization problem in the training loop, enabling\nend-to-end training with loss functions defined directly on the resulting\ndecisions. However, this approach can be inefficient and requires handcrafted,\nproblem-specific rules for backpropagation through the optimization step. This\npaper proposes an alternative method, in which optimal solutions are learned\ndirectly from the observable features by predictive models. The approach is\ngeneric, and based on an adaptation of the Learning-to-Optimize paradigm, from\nwhich a rich variety of existing techniques can be employed. Experimental\nevaluations show the ability of several Learning-to-Optimize methods to provide\nefficient, accurate, and flexible solutions to an array of challenging\nPredict-Then-Optimize problems.\n","authors":["James Kotary","Vincenzo Di Vito","Jacob Christopher","Pascal Van Hentenryck","Ferdinando Fioretto"],"pdf_url":"https://arxiv.org/pdf/2311.13087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2108.00527v3","updated":"2023-11-22T01:11:46Z","published":"2021-08-01T19:20:34Z","title":"Gates Are Not What You Need in RNNs","summary":" Recurrent neural networks have flourished in many areas. Consequently, we can\nsee new RNN cells being developed continuously, usually by creating or using\ngates in a new, original way. But what if we told you that gates in RNNs are\nredundant? In this paper, we propose a new recurrent cell called Residual\nRecurrent Unit (RRU) which beats traditional cells and does not employ a single\ngate. It is based on the residual shortcut connection, linear transformations,\nReLU, and normalization. To evaluate our cell's effectiveness, we compare its\nperformance against the widely-used GRU and LSTM cells and the recently\nproposed Mogrifier LSTM on several tasks including, polyphonic music modeling,\nlanguage modeling, and sentiment analysis. Our experiments show that RRU\noutperforms the traditional gated units on most of these tasks. Also, it has\nbetter robustness to parameter selection, allowing immediate application in new\ntasks without much tuning. 
We have implemented the RRU in TensorFlow, and the\ncode is made available at https://github.com/LUMII-Syslab/RRU .\n","authors":["Ronalds Zakovskis","Andis Draguns","Eliza Gaile","Emils Ozolins","Karlis Freivalds"],"pdf_url":"https://arxiv.org/pdf/2108.00527v3.pdf","comment":"Published in Artificial Intelligence and Soft Computing. ICAISC 2023.\n Lecture Notes in Computer Science(), vol 14125. Springer, Cham., and is\n available online at https://doi.org/10.1007/978-3-031-42505-9_27"},{"id":"http://arxiv.org/abs/2311.13081v1","updated":"2023-11-22T01:06:45Z","published":"2023-11-22T01:06:45Z","title":"Learning to Fly in Seconds","summary":" Learning-based methods, particularly Reinforcement Learning (RL), hold great\npromise for streamlining deployment, enhancing performance, and achieving\ngeneralization in the control of autonomous multirotor aerial vehicles. Deep RL\nhas been able to control complex systems with impressive fidelity and agility\nin simulation but the simulation-to-reality transfer often brings a\nhard-to-bridge reality gap. Moreover, RL is commonly plagued by prohibitively\nlong training times. In this work, we propose a novel asymmetric\nactor-critic-based architecture coupled with a highly reliable RL-based\ntraining paradigm for end-to-end quadrotor control. We show how curriculum\nlearning and a highly optimized simulator enhance sample complexity and lead to\nfast training times. To precisely discuss the challenges related to\nlow-level/end-to-end multirotor control, we also introduce a taxonomy that\nclassifies the existing levels of control abstractions as well as\nnon-linearities and domain parameters. Our framework enables\nSimulation-to-Reality (Sim2Real) transfer for direct RPM control after only 18\nseconds of training on a consumer-grade laptop as well as its deployment on\nmicrocontrollers to control a multirotor under real-time guarantees. Finally,\nour solution exhibits competitive performance in trajectory tracking, as\ndemonstrated through various experimental comparisons with existing\nstate-of-the-art control solutions using a real Crazyflie nano quadrotor. We\nopen source the code including a very fast multirotor dynamics simulator that\ncan simulate about 5 months of flight per second on a laptop GPU. The fast\ntraining times and deployment to a cheap, off-the-shelf quadrotor lower the\nbarriers to entry and help democratize the research and development of these\nsystems.\n","authors":["Jonas Eschmann","Dario Albani","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2311.13081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06202v3","updated":"2023-11-22T00:57:54Z","published":"2023-06-09T19:10:16Z","title":"NeuroGraph: Benchmarks for Graph Machine Learning in Brain Connectomics","summary":" Machine learning provides a valuable tool for analyzing high-dimensional\nfunctional neuroimaging data, and is proving effective in predicting various\nneurological conditions, psychiatric disorders, and cognitive patterns. In\nfunctional magnetic resonance imaging (MRI) research, interactions between\nbrain regions are commonly modeled using graph-based representations. The\npotency of graph machine learning methods has been established across myriad\ndomains, marking a transformative step in data interpretation and predictive\nmodeling. 
Yet, despite their promise, the transposition of these techniques to\nthe neuroimaging domain has been challenging due to the expansive number of\npotential preprocessing pipelines and the large parameter search space for\ngraph-based dataset construction. In this paper, we introduce NeuroGraph, a\ncollection of graph-based neuroimaging datasets, and demonstrated its utility\nfor predicting multiple categories of behavioral and cognitive traits. We delve\ndeeply into the dataset generation search space by crafting 35 datasets that\nencompass static and dynamic brain connectivity, running in excess of 15\nbaseline methods for benchmarking. Additionally, we provide generic frameworks\nfor learning on both static and dynamic graphs. Our extensive experiments lead\nto several key observations. Notably, using correlation vectors as node\nfeatures, incorporating larger number of regions of interest, and employing\nsparser graphs lead to improved performance. To foster further advancements in\ngraph-based data driven neuroimaging analysis, we offer a comprehensive\nopen-source Python package that includes the benchmark datasets, baseline\nimplementations, model training, and standard evaluation.\n","authors":["Anwar Said","Roza G. Bayrak","Tyler Derr","Mudassir Shabbir","Daniel Moyer","Catie Chang","Xenofon Koutsoukos"],"pdf_url":"https://arxiv.org/pdf/2306.06202v3.pdf","comment":"NeurIPS23"},{"id":"http://arxiv.org/abs/2311.13073v1","updated":"2023-11-22T00:26:15Z","published":"2023-11-22T00:26:15Z","title":"FusionFrames: Efficient Architectural Aspects for Text-to-Video\n Generation Pipeline","summary":" Multimedia generation approaches occupy a prominent place in artificial\nintelligence research. Text-to-image models achieved high-quality results over\nthe last few years. However, video synthesis methods recently started to\ndevelop. This paper presents a new two-stage latent diffusion text-to-video\ngeneration architecture based on the text-to-image diffusion model. The first\nstage concerns keyframes synthesis to figure the storyline of a video, while\nthe second one is devoted to interpolation frames generation to make movements\nof the scene and objects smooth. We compare several temporal conditioning\napproaches for keyframes generation. The results show the advantage of using\nseparate temporal blocks over temporal layers in terms of metrics reflecting\nvideo generation quality aspects and human preference. The design of our\ninterpolation model significantly reduces computational costs compared to other\nmasked frame interpolation approaches. Furthermore, we evaluate different\nconfigurations of MoVQ-based video decoding scheme to improve consistency and\nachieve higher PSNR, SSIM, MSE, and LPIPS scores. Finally, we compare our\npipeline with existing solutions and achieve top-2 scores overall and top-1\namong open-source solutions: CLIPSIM = 0.2976 and FVD = 433.054. Project page:\nhttps://ai-forever.github.io/kandinsky-video/\n","authors":["Vladimir Arkhipkin","Zein Shaheen","Viacheslav Vasilev","Elizaveta Dakhova","Andrey Kuznetsov","Denis Dimitrov"],"pdf_url":"https://arxiv.org/pdf/2311.13073v1.pdf","comment":"Project page: https://ai-forever.github.io/kandinsky-video/"},{"id":"http://arxiv.org/abs/2311.11254v2","updated":"2023-11-22T00:10:58Z","published":"2023-11-19T06:44:13Z","title":"BOIS: Bayesian Optimization of Interconnected Systems","summary":" Bayesian optimization (BO) has proven to be an effective paradigm for the\nglobal optimization of expensive-to-sample systems. 
One of the main advantages\nof BO is its use of Gaussian processes (GPs) to characterize model uncertainty\nwhich can be leveraged to guide the learning and search process. However, BO\ntypically treats systems as black-boxes and this limits the ability to exploit\nstructural knowledge (e.g., physics and sparse interconnections). Composite\nfunctions of the form $f(x, y(x))$, wherein GP modeling is shifted from the\nperformance function $f$ to an intermediate function $y$, offer an avenue for\nexploiting structural knowledge. However, the use of composite functions in a\nBO framework is complicated by the need to generate a probability density for\n$f$ from the Gaussian density of $y$ calculated by the GP (e.g., when $f$ is\nnonlinear it is not possible to obtain a closed-form expression). Previous work\nhas handled this issue using sampling techniques; these are easy to implement\nand flexible but are computationally intensive. In this work, we introduce a\nnew paradigm which allows for the efficient use of composite functions in BO;\nthis uses adaptive linearizations of $f$ to obtain closed-form expressions for\nthe statistical moments of the composite function. We show that this simple\napproach (which we call BOIS) enables the exploitation of structural knowledge,\nsuch as that arising in interconnected systems as well as systems that embed\nmultiple GP models and combinations of physics and GP models. Using a chemical\nprocess optimization case study, we benchmark the effectiveness of BOIS against\nstandard BO and sampling approaches. Our results indicate that BOIS achieves\nperformance gains and accurately captures the statistics of composite\nfunctions.\n","authors":["Leonardo D. González","Victor M. Zavala"],"pdf_url":"https://arxiv.org/pdf/2311.11254v2.pdf","comment":"6 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.12036v2","updated":"2023-11-22T00:02:49Z","published":"2023-10-18T15:21:28Z","title":"A General Theoretical Paradigm to Understand Learning from Human\n Preferences","summary":" The prevalent deployment of learning from human preferences through\nreinforcement learning (RLHF) relies on two important approximations: the first\nassumes that pairwise preferences can be substituted with pointwise rewards.\nThe second assumes that a reward model trained on these pointwise rewards can\ngeneralize from collected data to out-of-distribution data sampled by the\npolicy. Recently, Direct Preference Optimisation (DPO) has been proposed as an\napproach that bypasses the second approximation and learn directly a policy\nfrom collected data without the reward modelling stage. However, this method\nstill heavily relies on the first approximation.\n In this paper we try to gain a deeper theoretical understanding of these\npractical algorithms. In particular we derive a new general objective called\n$\\Psi$PO for learning from human preferences that is expressed in terms of\npairwise preferences and therefore bypasses both approximations. This new\ngeneral objective allows us to perform an in-depth analysis of the behavior of\nRLHF and DPO (as special cases of $\\Psi$PO) and to identify their potential\npitfalls. 
We then consider another special case for $\\Psi$PO by setting $\\Psi$\nsimply to Identity, for which we can derive an efficient optimisation\nprocedure, prove performance guarantees and demonstrate its empirical\nsuperiority to DPO on some illustrative examples.\n","authors":["Mohammad Gheshlaghi Azar","Mark Rowland","Bilal Piot","Daniel Guo","Daniele Calandriello","Michal Valko","Rémi Munos"],"pdf_url":"https://arxiv.org/pdf/2310.12036v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13491v1","updated":"2023-11-22T16:08:38Z","published":"2023-11-22T16:08:38Z","title":"Grad-Shafranov equilibria via data-free physics informed neural networks","summary":" A large number of magnetohydrodynamic (MHD) equilibrium calculations are\noften required for uncertainty quantification, optimization, and real-time\ndiagnostic information, making MHD equilibrium codes vital to the field of\nplasma physics. In this paper, we explore a method for solving the\nGrad-Shafranov equation by using Physics-Informed Neural Networks (PINNs). For\nPINNs, we optimize neural networks by directly minimizing the residual of the\nPDE as a loss function. We show that PINNs can accurately and effectively solve\nthe Grad-Shafranov equation with several different boundary conditions. We also\nexplore the parameter space by varying the size of the model, the learning\nrate, and boundary conditions to map various trade-offs such as between\nreconstruction error and computational speed. Additionally, we introduce a\nparameterized PINN framework, expanding the input space to include variables\nsuch as pressure, aspect ratio, elongation, and triangularity in order to\nhandle a broader range of plasma scenarios within a single network.\nParametrized PINNs could be used in future work to solve inverse problems such\nas shape optimization.\n","authors":["Byoungchan Jang","Alan A. Kaptanoglu","Rahul Gaur","Shaowu Pan","Matt Landreman","William Dorland"],"pdf_url":"https://arxiv.org/pdf/2311.13491v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2207.11900v6","updated":"2023-11-22T17:05:42Z","published":"2022-07-25T04:22:41Z","title":"GA2MIF: Graph and Attention Based Two-Stage Multi-Source Information\n Fusion for Conversational Emotion Detection","summary":" Multimodal Emotion Recognition in Conversation (ERC) plays an influential\nrole in the field of human-computer interaction and conversational robotics\nsince it can motivate machines to provide empathetic services. Multimodal data\nmodeling is an up-and-coming research area in recent years, which is inspired\nby human capability to integrate multiple senses. Several graph-based\napproaches claim to capture interactive information between modalities, but the\nheterogeneity of multimodal data makes these methods prohibit optimal\nsolutions. In this work, we introduce a multimodal fusion approach named Graph\nand Attention based Two-stage Multi-source Information Fusion (GA2MIF) for\nemotion detection in conversation. Our proposed method circumvents the problem\nof taking heterogeneous graph as input to the model while eliminating complex\nredundant connections in the construction of graph. GA2MIF focuses on\ncontextual modeling and cross-modal modeling through leveraging Multi-head\nDirected Graph ATtention networks (MDGATs) and Multi-head Pairwise Cross-modal\nATtention networks (MPCATs), respectively. 
Extensive experiments on two public\ndatasets (i.e., IEMOCAP and MELD) demonstrate that the proposed GA2MIF has the\ncapacity to validly capture intra-modal long-range contextual information and\ninter-modal complementary information, as well as outperforms the prevalent\nState-Of-The-Art (SOTA) models by a remarkable margin.\n","authors":["Jiang Li","Xiaoping Wang","Guoqing Lv","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2207.11900v6.pdf","comment":"Accepted by IEEE Transactions on Affective Computing"},{"id":"http://arxiv.org/abs/2207.12261v4","updated":"2023-11-22T16:59:55Z","published":"2022-07-06T13:56:48Z","title":"GraphCFC: A Directed Graph Based Cross-Modal Feature Complementation\n Approach for Multimodal Conversational Emotion Recognition","summary":" Emotion Recognition in Conversation (ERC) plays a significant part in\nHuman-Computer Interaction (HCI) systems since it can provide empathetic\nservices. Multimodal ERC can mitigate the drawbacks of uni-modal approaches.\nRecently, Graph Neural Networks (GNNs) have been widely used in a variety of\nfields due to their superior performance in relation modeling. In multimodal\nERC, GNNs are capable of extracting both long-distance contextual information\nand inter-modal interactive information. Unfortunately, since existing methods\nsuch as MMGCN directly fuse multiple modalities, redundant information may be\ngenerated and diverse information may be lost. In this work, we present a\ndirected Graph based Cross-modal Feature Complementation (GraphCFC) module that\ncan efficiently model contextual and interactive information. GraphCFC\nalleviates the problem of heterogeneity gap in multimodal fusion by utilizing\nmultiple subspace extractors and Pair-wise Cross-modal Complementary (PairCC)\nstrategy. We extract various types of edges from the constructed graph for\nencoding, thus enabling GNNs to extract crucial contextual and interactive\ninformation more accurately when performing message passing. Furthermore, we\ndesign a GNN structure called GAT-MLP, which can provide a new unified network\nframework for multimodal learning. The experimental results on two benchmark\ndatasets show that our GraphCFC outperforms the state-of-the-art (SOTA)\napproaches.\n","authors":["Jiang Li","Xiaoping Wang","Guoqing Lv","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2207.12261v4.pdf","comment":"Accepted by IEEE Transactions on Multimedia (TMM)"},{"id":"http://arxiv.org/abs/2311.11284v2","updated":"2023-11-22T16:54:17Z","published":"2023-11-19T09:59:09Z","title":"LucidDreamer: Towards High-Fidelity Text-to-3D Generation via Interval\n Score Matching","summary":" The recent advancements in text-to-3D generation mark a significant milestone\nin generative models, unlocking new possibilities for creating imaginative 3D\nassets across various real-world scenarios. While recent advancements in\ntext-to-3D generation have shown promise, they often fall short in rendering\ndetailed and high-quality 3D models. This problem is especially prevalent as\nmany methods base themselves on Score Distillation Sampling (SDS). This paper\nidentifies a notable deficiency in SDS, that it brings inconsistent and\nlow-quality updating direction for the 3D model, causing the over-smoothing\neffect. To address this, we propose a novel approach called Interval Score\nMatching (ISM). ISM employs deterministic diffusing trajectories and utilizes\ninterval-based score matching to counteract over-smoothing. 
Furthermore, we\nincorporate 3D Gaussian Splatting into our text-to-3D generation pipeline.\nExtensive experiments show that our model largely outperforms the\nstate-of-the-art in quality and training efficiency.\n","authors":["Yixun Liang","Xin Yang","Jiantao Lin","Haodong Li","Xiaogang Xu","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2311.11284v2.pdf","comment":"The first two authors contributed equally to this work. Our code will\n be available at: https://github.com/EnVision-Research/LucidDreamer"},{"id":"http://arxiv.org/abs/2208.00339v4","updated":"2023-11-22T16:17:19Z","published":"2022-07-31T02:23:24Z","title":"GraphMFT: A Graph Network based Multimodal Fusion Technique for Emotion\n Recognition in Conversation","summary":" Multimodal machine learning is an emerging area of research, which has\nreceived a great deal of scholarly attention in recent years. Up to now, there\nare few studies on multimodal Emotion Recognition in Conversation (ERC). Since\nGraph Neural Networks (GNNs) possess the powerful capacity of relational\nmodeling, they have an inherent advantage in the field of multimodal learning.\nGNNs leverage the graph constructed from multimodal data to perform intra- and\ninter-modal information interaction, which effectively facilitates the\nintegration and complementation of multimodal data. In this work, we propose a\nnovel Graph network based Multimodal Fusion Technique (GraphMFT) for emotion\nrecognition in conversation. Multimodal data can be modeled as a graph, where\neach data object is regarded as a node, and both intra- and inter-modal\ndependencies existing between data objects can be regarded as edges. GraphMFT\nutilizes multiple improved graph attention networks to capture intra-modal\ncontextual information and inter-modal complementary information. In addition,\nthe proposed GraphMFT attempts to address the challenges of existing\ngraph-based multimodal conversational emotion recognition models such as MMGCN.\nEmpirical results on two public multimodal datasets reveal that our model\noutperforms the State-Of-The-Art (SOTA) approaches with the accuracy of 67.90%\nand 61.30%.\n","authors":["Jiang Li","Xiaoping Wang","Guoqing Lv","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2208.00339v4.pdf","comment":"Accepted by Neurocomputing"},{"id":"http://arxiv.org/abs/2311.13409v1","updated":"2023-11-22T14:13:27Z","published":"2023-11-22T14:13:27Z","title":"CompenHR: Efficient Full Compensation for High-resolution Projector","summary":" Full projector compensation is a practical task of projector-camera systems.\nIt aims to find a projector input image, named compensation image, such that\nwhen projected it cancels the geometric and photometric distortions due to the\nphysical environment and hardware. State-of-the-art methods use deep learning\nto address this problem and show promising performance for low-resolution\nsetups. However, directly applying deep learning to high-resolution setups is\nimpractical due to the long training time and high memory cost. To address this\nissue, this paper proposes a practical full compensation solution. Firstly, we\ndesign an attention-based grid refinement network to improve geometric\ncorrection quality. Secondly, we integrate a novel sampling scheme into an\nend-to-end compensation network to alleviate computation and introduce\nattention blocks to preserve key features. Finally, we construct a benchmark\ndataset for high-resolution projector full compensation. 
In experiments, our\nmethod demonstrates clear advantages in both efficiency and quality.\n","authors":["Yuxi Wang","Haibin Ling","Bingyao Huang"],"pdf_url":"https://arxiv.org/pdf/2311.13409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13307v1","updated":"2023-11-22T10:55:36Z","published":"2023-11-22T10:55:36Z","title":"Rethinking Radiology Report Generation via Causal Reasoning and\n Counterfactual Augmentation","summary":" Radiology Report Generation (RRG) draws attention as an interaction between\nvision and language fields. Previous works inherited the ideology of\nvision-to-language generation tasks,aiming to generate paragraphs with high\nconsistency as reports. However, one unique characteristic of RRG, the\nindependence between diseases, was neglected, leading to the injection of the\nspurious confounder, i.e., the disease co-occurrence. Unfortunately, this\nconfounder confuses the process of report generation worse because of the\nbiased RRG data distribution. In this paper, to rethink this issue thoroughly,\nwe reason about its causes and effects from a novel perspective of statistics\nand causality, where the Joint Vision Coupling and the Conditional Sentence\nCoherence Coupling are two aspects prone to implicitly decrease the accuracy of\nreports. Then, a counterfactual augmentation strategy that contains the\nCounterfactual Sample Synthesis and the Counterfactual Report Reconstruction\nsub-methods is proposed to break these two aspects of spurious effects.\nExperimental results and further analyses on two widely used datasets justify\nour reasoning and proposed methods.\n","authors":["Xiao Song","Jiafan Liu","Yun Li","Wenbin Lei","Ruxin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13307v1.pdf","comment":"10 pages,5 figures"},{"id":"http://arxiv.org/abs/2311.13073v1","updated":"2023-11-22T00:26:15Z","published":"2023-11-22T00:26:15Z","title":"FusionFrames: Efficient Architectural Aspects for Text-to-Video\n Generation Pipeline","summary":" Multimedia generation approaches occupy a prominent place in artificial\nintelligence research. Text-to-image models achieved high-quality results over\nthe last few years. However, video synthesis methods recently started to\ndevelop. This paper presents a new two-stage latent diffusion text-to-video\ngeneration architecture based on the text-to-image diffusion model. The first\nstage concerns keyframes synthesis to figure the storyline of a video, while\nthe second one is devoted to interpolation frames generation to make movements\nof the scene and objects smooth. We compare several temporal conditioning\napproaches for keyframes generation. The results show the advantage of using\nseparate temporal blocks over temporal layers in terms of metrics reflecting\nvideo generation quality aspects and human preference. The design of our\ninterpolation model significantly reduces computational costs compared to other\nmasked frame interpolation approaches. Furthermore, we evaluate different\nconfigurations of MoVQ-based video decoding scheme to improve consistency and\nachieve higher PSNR, SSIM, MSE, and LPIPS scores. Finally, we compare our\npipeline with existing solutions and achieve top-2 scores overall and top-1\namong open-source solutions: CLIPSIM = 0.2976 and FVD = 433.054. 
Project page:\nhttps://ai-forever.github.io/kandinsky-video/\n","authors":["Vladimir Arkhipkin","Zein Shaheen","Viacheslav Vasilev","Elizaveta Dakhova","Andrey Kuznetsov","Denis Dimitrov"],"pdf_url":"https://arxiv.org/pdf/2311.13073v1.pdf","comment":"Project page: https://ai-forever.github.io/kandinsky-video/"},{"id":"http://arxiv.org/abs/2311.13687v1","updated":"2023-11-22T20:47:52Z","published":"2023-11-22T20:47:52Z","title":"Beat-Aligned Spectrogram-to-Sequence Generation of Rhythm-Game Charts","summary":" In the heart of \"rhythm games\" - games where players must perform actions in\nsync with a piece of music - are \"charts\", the directives to be given to\nplayers. We newly formulate chart generation as a sequence generation task and\ntrain a Transformer using a large dataset. We also introduce tempo-informed\npreprocessing and training procedures, some of which are suggested to be\nintegral for a successful training. Our model is found to outperform the\nbaselines on a large dataset, and is also found to benefit from pretraining and\nfinetuning.\n","authors":["Jayeon Yi","Sungho Lee","Kyogu Lee"],"pdf_url":"https://arxiv.org/pdf/2311.13687v1.pdf","comment":"ISMIR 2023 LBD. Demo videos and code at stet-stet.github.io/goct"}]},"2023-11-24T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2310.18333v2","updated":"2023-11-24T18:58:02Z","published":"2023-10-20T14:18:40Z","title":"She had Cobalt Blue Eyes: Prompt Testing to Create Aligned and\n Sustainable Language Models","summary":" As the use of large language models (LLMs) increases within society, as does\nthe risk of their misuse. Appropriate safeguards must be in place to ensure LLM\noutputs uphold the ethical standards of society, highlighting the positive role\nthat artificial intelligence technologies can have. Recent events indicate\nethical concerns around conventionally trained LLMs, leading to overall unsafe\nuser experiences. This motivates our research question: how do we ensure LLM\nalignment? In this work, we introduce a test suite of unique prompts to foster\nthe development of aligned LLMs that are fair, safe, and robust. We show that\nprompting LLMs at every step of the development pipeline, including data\ncuration, pre-training, and fine-tuning, will result in an overall more\nresponsible model. Our test suite evaluates outputs from four state-of-the-art\nlanguage models: GPT-3.5, GPT-4, OPT, and LLaMA-2. The assessment presented in\nthis paper highlights a gap between societal alignment and the capabilities of\ncurrent LLMs. Additionally, implementing a test suite such as ours lowers the\nenvironmental overhead of making models safe and fair.\n","authors":["Veronica Chatrath","Oluwanifemi Bamgbose","Shaina Raza"],"pdf_url":"https://arxiv.org/pdf/2310.18333v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17455v3","updated":"2023-11-24T18:39:02Z","published":"2023-05-27T12:07:21Z","title":"CrossGET: Cross-Guided Ensemble of Tokens for Accelerating\n Vision-Language Transformers","summary":" Recent vision-language models have achieved tremendous progress far beyond\nwhat we ever expected. However, their computational costs are also dramatically\ngrowing with rapid development, especially for the large models. It makes model\nacceleration exceedingly critical in a scenario of limited resources. Although\nextensively studied for unimodal models, the acceleration for multimodal\nmodels, especially the vision-language Transformers, is relatively\nunder-explored. 
To pursue more efficient and accessible vision-language\nTransformers, this paper introduces \\textbf{Cross}-\\textbf{G}uided\n\\textbf{E}nsemble of \\textbf{T}okens (\\textbf{\\emph{CrossGET}}), a universal\nacceleration framework for vision-language Transformers. This framework\nadaptively combines tokens through real-time, cross-modal guidance, thereby\nachieving substantial acceleration while keeping high performance.\n\\textit{CrossGET} has two key innovations: 1) \\textit{Cross-Guided Matching and\nEnsemble}. \\textit{CrossGET} incorporates cross-modal guided token matching and\nensemble to exploit cross-modal information effectively, only introducing\ncross-modal tokens with negligible extra parameters. 2) \\textit{Complete-Graph\nSoft Matching}. In contrast to the existing bipartite soft matching approach,\n\\textit{CrossGET} introduces a complete-graph soft matching policy to achieve\nmore reliable token-matching results while maintaining parallelizability and\nhigh efficiency. Extensive experiments are conducted on various vision-language\ntasks, including image-text retrieval, visual reasoning, image captioning, and\nvisual question answering. Performance on both classic multimodal architectures\nand emerging multimodal LLMs demonstrate the effectiveness and versatility of\nthe proposed \\textit{CrossGET} framework. The code will be at\n\\url{https://github.com/sdc17/CrossGET}.\n","authors":["Dachuan Shi","Chaofan Tao","Anyi Rao","Zhendong Yang","Chun Yuan","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2305.17455v3.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2311.14652v1","updated":"2023-11-24T18:35:00Z","published":"2023-11-24T18:35:00Z","title":"One Pass Streaming Algorithm for Super Long Token Attention\n Approximation in Sublinear Space","summary":" Deploying Large Language Models (LLMs) in streaming applications that involve\nlong contexts, particularly for extended dialogues and text analysis, is of\nparamount importance but presents two significant challenges. Firstly, the\nmemory consumption is substantial during the decoding phase due to the caching\nof Key and Value states (KV) of previous tokens. Secondly, attention\ncomputation is time-consuming with a time complexity of $O(n^2)$ for the\ngeneration of each token. In recent OpenAI DevDay (Nov 6, 2023), OpenAI\nreleased a new model that is able to support a 128K-long document, in our\npaper, we focus on the memory-efficient issue when context length $n$ is much\ngreater than 128K ($n \\gg 2^d$). Considering a single-layer self-attention with\nQuery, Key, and Value matrices $Q, K, V \\in \\mathbb{R}^{n \\times d}$, the\npolynomial method approximates the attention output $T \\in \\mathbb{R}^{n \\times\nd}$. It accomplishes this by constructing $U_1, U_2 \\in \\mathbb{R}^{n \\times\nt}$ to expedite attention ${\\sf Attn}(Q, K, V)$ computation within $n^{1+o(1)}$\ntime executions. Despite this, storing the Key and Value matrices $K, V \\in\n\\mathbb{R}^{n \\times d}$ still necessitates $O( n d)$ space, leading to\nsignificant memory usage. In response to these challenges, we introduce a new\nalgorithm that only reads one pass of the data in streaming fashion. This\nmethod employs sublinear space $o(n)$ to store three sketch matrices,\nalleviating the need for exact $K, V$ storage. Notably, our algorithm exhibits\nexceptional memory-efficient performance with super-long tokens. As the token\nlength $n$ increases, our error guarantee diminishes while the memory usage\nremains nearly constant. 
This unique attribute underscores the potential of our\ntechnique in efficiently handling LLMs in streaming applications.\n","authors":["Raghav Addanki","Chenyang Li","Zhao Song","Chiwun Yang"],"pdf_url":"https://arxiv.org/pdf/2311.14652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14648v1","updated":"2023-11-24T18:29:50Z","published":"2023-11-24T18:29:50Z","title":"Calibrated Language Models Must Hallucinate","summary":" Recent language models have a mysterious tendency to generate false but\nplausible-sounding text. Such \"hallucinations\" are an obstacle to the usability\nof language-based AI systems and can harm people who rely upon their outputs.\nThis work shows shows that there is an inherent statistical reason that\npretrained language models hallucinate certain types of facts, having nothing\nto do with the transformer LM architecture or data quality. For \"arbitrary\"\nfacts whose veracity cannot be determined from the training data, we show that\nhallucination is necessary for language models that satisfy a statistical\ncalibration condition appropriate for generative language models. Specifically,\nif the maximum probability of any fact is bounded, we show that the probability\nof generating a hallucination is close to the fraction of facts that occur\nexactly once in the training data (a \"Good-Turing\" estimate), even assuming\nideal training data without errors.\n One conclusion is that models pretrained to be sufficiently good predictors\n(i.e., calibrated) may require post-training to mitigate hallucinations on the\ntype of arbitrary facts that tend to appear once in the training set. However,\nour analysis also suggests that there is no statistical reason that pretraining\nwill lead to hallucination on facts that tend to appear more than once in the\ntraining data (like references to publications such as articles and books,\nwhose hallucinations have been particularly notable and problematic) or on\nsystematic facts (like arithmetic calculations). Therefore, different\narchitectures and learning algorithms may mitigate these latter types of\nhallucinations.\n","authors":["Adam Tauman Kalai","Santosh S. Vempala"],"pdf_url":"https://arxiv.org/pdf/2311.14648v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2311.09433v2","updated":"2023-11-24T16:22:41Z","published":"2023-11-15T23:07:40Z","title":"Backdoor Activation Attack: Attack Large Language Models using\n Activation Steering for Safety-Alignment","summary":" To ensure AI safety, instruction-tuned Large Language Models (LLMs) are\nspecifically trained to ensure alignment, which refers to making models behave\nin accordance with human intentions. While these models have demonstrated\ncommendable results on various safety benchmarks, the vulnerability of their\nsafety alignment has not been extensively studied. This is particularly\ntroubling given the potential harm that LLMs can inflict. Existing attack\nmethods on LLMs often rely on poisoned training data or the injection of\nmalicious prompts. 
These approaches compromise the stealthiness and\ngeneralizability of the attacks, making them susceptible to detection.\nAdditionally, these models often demand substantial computational resources for\nimplementation, making them less practical for real-world applications.\nInspired by recent success in modifying model behavior through steering vectors\nwithout the need for optimization, and drawing on its effectiveness in\nred-teaming LLMs, we conducted experiments employing activation steering to\ntarget four key aspects of LLMs: truthfulness, toxicity, bias, and harmfulness\n- across a varied set of attack settings. To establish a universal attack\nstrategy applicable to diverse target alignments without depending on manual\nanalysis, we automatically select the intervention layer based on contrastive\nlayer search. Our experiment results show that activation attacks are highly\neffective and add little or no overhead to attack efficiency. Additionally, we\ndiscuss potential countermeasures against such activation attacks. Our code and\ndata are available at https://github.com/wang2226/Backdoor-Activation-Attack\nWarning: this paper contains content that can be offensive or upsetting.\n","authors":["Haoran Wang","Kai Shu"],"pdf_url":"https://arxiv.org/pdf/2311.09433v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06607v2","updated":"2023-11-24T16:21:39Z","published":"2023-11-11T16:37:41Z","title":"Monkey: Image Resolution and Text Label Are Important Things for Large\n Multi-modal Models","summary":" Large Multimodal Models (LMMs) have shown promise in vision-language tasks\nbut struggle with high-resolution input and detailed scene understanding.\nAddressing these challenges, we introduce Monkey to enhance LMM capabilities.\nFirstly, Monkey processes input images by dividing them into uniform patches,\neach matching the size (e.g., 448x448) used in the original training of the\nwell-trained vision encoder. Equipped with individual adapter for each patch,\nMonkey can handle higher resolutions up to 1344x896 pixels, enabling the\ndetailed capture of complex visual information. Secondly, it employs a\nmulti-level description generation method, enriching the context for\nscene-object associations. This two-part strategy ensures more effective\nlearning from generated data: the higher resolution allows for a more detailed\ncapture of visuals, which in turn enhances the effectiveness of comprehensive\ndescriptions. Extensive ablative results validate the effectiveness of our\ndesigns. Additionally, experiments on 18 datasets further demonstrate that\nMonkey surpasses existing LMMs in many tasks like Image Captioning and various\nVisual Question Answering formats. Specially, in qualitative tests focused on\ndense text question answering, Monkey has exhibited encouraging results\ncompared with GPT4V. Code is available at\nhttps://github.com/Yuliang-Liu/Monkey.\n","authors":["Zhang Li","Biao Yang","Qiang Liu","Zhiyin Ma","Shuo Zhang","Jingxu Yang","Yabo Sun","Yuliang Liu","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2311.06607v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14583v1","updated":"2023-11-24T16:19:04Z","published":"2023-11-24T16:19:04Z","title":"GPT Struct Me: Probing GPT Models on Narrative Entity Extraction","summary":" The importance of systems that can extract structured information from\ntextual data becomes increasingly pronounced given the ever-increasing volume\nof text produced on a daily basis. 
Having a system that can effectively extract\nsuch information in an interoperable manner would be an asset for several\ndomains, be it finance, health, or legal. Recent developments in natural\nlanguage processing led to the production of powerful language models that can,\nto some degree, mimic human intelligence. Such effectiveness raises a pertinent\nquestion: Can these models be leveraged for the extraction of structured\ninformation? In this work, we address this question by evaluating the\ncapabilities of two state-of-the-art language models -- GPT-3 and GPT-3.5,\ncommonly known as ChatGPT -- in the extraction of narrative entities, namely\nevents, participants, and temporal expressions. This study is conducted on the\nText2Story Lusa dataset, a collection of 119 Portuguese news articles whose\nannotation framework includes a set of entity structures along with several\ntags and attribute values. We first select the best prompt template through an\nablation study over prompt components that provide varying degrees of\ninformation on a subset of documents of the dataset. Subsequently, we use the\nbest templates to evaluate the effectiveness of the models on the remaining\ndocuments. The results obtained indicate that GPT models are competitive with\nout-of-the-box baseline systems, presenting an all-in-one alternative for\npractitioners with limited resources. By studying the strengths and limitations\nof these models in the context of information extraction, we offer insights\nthat can guide future improvements and avenues to explore in this field.\n","authors":["Hugo Sousa","Nuno Guimarães","Alípio Jorge","Ricardo Campos"],"pdf_url":"https://arxiv.org/pdf/2311.14583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.04643v3","updated":"2023-11-24T16:13:18Z","published":"2023-01-11T18:55:22Z","title":"tieval: An Evaluation Framework for Temporal Information Extraction\n Systems","summary":" Temporal information extraction (TIE) has attracted a great deal of interest\nover the last two decades, leading to the development of a significant number\nof datasets. Despite its benefits, having access to a large volume of corpora\nmakes it difficult when it comes to benchmarking TIE systems. On the one hand,\ndifferent datasets have different annotation schemes, thus hindering the\ncomparison between competitors across different corpora. On the other hand, the\nfact that each corpus is commonly disseminated in a different format requires a\nconsiderable engineering effort for a researcher/practitioner to develop\nparsers for all of them. This constraint forces researchers to select a limited\nnumber of datasets to evaluate their systems, which consequently limits the\ncomparability of the systems. Yet another obstacle that hinders the\ncomparability of the TIE systems is the evaluation metric employed. While most\nresearch works adopt traditional metrics such as precision, recall, and $F_1$,\na few others prefer temporal awareness -- a metric tailored to be more\ncomprehensive on the evaluation of temporal systems. Although the reason for\nthe absence of temporal awareness in the evaluation of most systems is not\nclear, one of the factors that certainly weighs on this decision is the necessity\nto implement the temporal closure algorithm in order to compute temporal\nawareness, which is neither straightforward to implement nor currently\neasily available. 
All in all, these problems have limited the fair comparison\nbetween approaches and consequently, the development of temporal extraction\nsystems. To mitigate these problems, we have developed tieval, a Python library\nthat provides a concise interface for importing different corpora and\nfacilitates system evaluation. In this paper, we present the first public\nrelease of tieval and highlight its most relevant features.\n","authors":["Hugo Sousa","Alípio Jorge","Ricardo Campos"],"pdf_url":"https://arxiv.org/pdf/2301.04643v3.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2311.14543v1","updated":"2023-11-24T15:20:36Z","published":"2023-11-24T15:20:36Z","title":"Data-Efficient Alignment of Large Language Models with Human Feedback\n Through Natural Language","summary":" Learning from human feedback is a prominent technique to align the output of\nlarge language models (LLMs) with human expectations. Reinforcement learning\nfrom human feedback (RLHF) leverages human preference signals that are in the\nform of ranking of response pairs to perform this alignment. However, human\npreference on LLM outputs can come in much richer forms including natural\nlanguage, which may provide detailed feedback on strengths and weaknesses of a\ngiven response. In this work we investigate data efficiency of modeling human\nfeedback that is in natural language. Specifically, we fine-tune an open-source\nLLM, e.g., Falcon-40B-Instruct, on a relatively small amount (1000 records or\neven less) of human feedback in natural language in the form of critiques and\nrevisions of responses. We show that this model is able to improve the quality\nof responses from even some of the strongest LLMs such as ChatGPT, BARD, and\nVicuna, through critique and revision of those responses. For instance, through\none iteration of revision of ChatGPT responses, the revised responses have\n56.6% win rate over the original ones, and this win rate can be further\nimproved to 65.9% after applying the revision for five iterations.\n","authors":["Di Jin","Shikib Mehri","Devamanyu Hazarika","Aishwarya Padmakumar","Sungjin Lee","Yang Liu","Mahdi Namazifar"],"pdf_url":"https://arxiv.org/pdf/2311.14543v1.pdf","comment":"Accepted by Workshop on Instruction Tuning and Instruction Following\n at NeurIPS 2023, Submitted to AAAI 2024"},{"id":"http://arxiv.org/abs/2311.14539v1","updated":"2023-11-24T15:10:56Z","published":"2023-11-24T15:10:56Z","title":"CMed-GPT: Prompt Tuning for Entity-Aware Chinese Medical Dialogue\n Generation","summary":" Medical dialogue generation relies on natural language generation techniques\nto enable online medical consultations. Recently, the widespread adoption of\nlarge-scale models in the field of natural language processing has facilitated\nrapid advancements in this technology. Existing medical dialogue models are\nmostly based on BERT and pre-trained on English corpora, but there is a lack of\nhigh-performing models on the task of Chinese medical dialogue generation. To\nsolve the above problem, this paper proposes CMed-GPT, which is the GPT\npre-training language model based on Chinese medical domain text. The model is\navailable in two versions, namely, base and large, with corresponding\nperplexity values of 8.64 and 8.01. Additionally, we incorporate lexical and\nentity embeddings into the dialogue text in a uniform manner to meet the\nrequirements of downstream dialogue generation tasks. 
By applying both\nfine-tuning and p-tuning to CMed-GPT, we lowered the PPL from 8.44 to 7.35.\nThis study not only confirms the exceptional performance of the CMed-GPT model\nin generating Chinese biomedical text but also highlights the advantages of\np-tuning over traditional fine-tuning with prefix prompts. Furthermore, we\nvalidate the significance of incorporating external information in medical\ndialogue generation, which enhances the quality of dialogue generation.\n","authors":["Zhijie Qu","Juan Li","Zerui Ma","Jianqiang Li"],"pdf_url":"https://arxiv.org/pdf/2311.14539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07848v9","updated":"2023-11-24T15:04:50Z","published":"2023-06-13T15:28:10Z","title":"GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio\n Pretraining for Accurate Speech Emotion Recognition","summary":" Contrastive cross-modality pretraining has recently exhibited impressive\nsuccess in diverse fields, whereas there is limited research on their merits in\nspeech emotion recognition (SER). In this paper, we propose GEmo-CLAP, a kind\nof gender-attribute-enhanced contrastive language-audio pretraining (CLAP)\nmethod for SER. Specifically, we first construct an effective emotion CLAP\n(Emo-CLAP) for SER, using pre-trained text and audio encoders. Second, given\nthe significance of gender information in SER, two novel multi-task learning\nbased GEmo-CLAP (ML-GEmo-CLAP) and soft label based GEmo-CLAP (SL-GEmo-CLAP)\nmodels are further proposed to incorporate gender information of speech\nsignals, forming more reasonable objectives. Experiments on IEMOCAP indicate\nthat our proposed two GEmo-CLAPs consistently outperform Emo-CLAP with\ndifferent pre-trained models. Remarkably, the proposed WavLM-based SL-GEmo-CLAP\nobtains the best UAR of 81.43\\% and WAR of 83.16\\%, which performs better than\nstate-of-the-art SER methods.\n","authors":["Yu Pan","Yanni Hu","Yuguang Yang","Wen Fei","Jixun Yao","Heng Lu","Lei Ma","Jianjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2306.07848v9.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2311.14530v1","updated":"2023-11-24T14:55:23Z","published":"2023-11-24T14:55:23Z","title":"Machine Translation for Ge'ez Language","summary":" Machine translation (MT) for low-resource languages such as Ge'ez, an ancient\nlanguage that is no longer spoken in daily life, faces challenges such as\nout-of-vocabulary words, domain mismatches, and lack of sufficient labeled\ntraining data. In this work, we explore various methods to improve Ge'ez MT,\nincluding transfer-learning from related languages, optimizing shared\nvocabulary and token segmentation approaches, finetuning large pre-trained\nmodels, and using large language models (LLMs) for few-shot translation with\nfuzzy matches. We develop a multilingual neural machine translation (MNMT)\nmodel based on languages relatedness, which brings an average performance\nimprovement of about 4 BLEU compared to standard bilingual models. We also\nattempt to finetune the NLLB-200 model, one of the most advanced translation\nmodels available today, but find that it performs poorly with only 4k training\nsamples for Ge'ez. Furthermore, we experiment with using GPT-3.5, a\nstate-of-the-art LLM, for few-shot translation with fuzzy matches, which\nleverages embedding similarity-based retrieval to find context examples from a\nparallel corpus. We observe that GPT-3.5 achieves a remarkable BLEU score of\n9.2 with no initial knowledge of Ge'ez, but still lower than the MNMT baseline\nof 15.2. 
Our work provides insights into the potential and limitations of\ndifferent approaches for low-resource and ancient language MT.\n","authors":["Aman Kassahun Wassie"],"pdf_url":"https://arxiv.org/pdf/2311.14530v1.pdf","comment":"10 pages, 1 figure"},{"id":"http://arxiv.org/abs/2311.14517v1","updated":"2023-11-24T14:45:53Z","published":"2023-11-24T14:45:53Z","title":"tinyCLAP: Distilling Constrastive Language-Audio Pretrained Models","summary":" Contrastive Language-Audio Pretraining (CLAP) became of crucial importance in\nthe field of audio and speech processing. Its employment ranges from sound\nevent detection to text-to-audio generation. However, one of the main\nlimitations is the considerable amount of data required in the training process\nand the overall computational complexity during inference. This paper\ninvestigates how we can reduce the complexity of contrastive language-audio\npre-trained models, yielding an efficient model that we call tinyCLAP. We\nderive an unimodal distillation loss from first principles and explore how the\ndimensionality of the shared, multimodal latent space can be reduced via\npruning. TinyCLAP uses only 6% of the original Microsoft CLAP parameters with a\nminimal reduction (less than 5%) in zero-shot classification performance across\nthe three sound event detection datasets on which it was tested\n","authors":["Francesco Paissan","Elisabetta Farella"],"pdf_url":"https://arxiv.org/pdf/2311.14517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05028v4","updated":"2023-11-24T14:34:57Z","published":"2023-10-08T06:17:39Z","title":"Revisiting Large Language Models as Zero-shot Relation Extractors","summary":" Relation extraction (RE) consistently involves a certain degree of labeled or\nunlabeled data even if under zero-shot setting. Recent studies have shown that\nlarge language models (LLMs) transfer well to new tasks out-of-the-box simply\ngiven a natural language prompt, which provides the possibility of extracting\nrelations from text without any data and parameter tuning. This work focuses on\nthe study of exploring LLMs, such as ChatGPT, as zero-shot relation extractors.\nOn the one hand, we analyze the drawbacks of existing RE prompts and attempt to\nincorporate recent prompt techniques such as chain-of-thought (CoT) to improve\nzero-shot RE. We propose the summarize-and-ask (\\textsc{SumAsk}) prompting, a\nsimple prompt recursively using LLMs to transform RE inputs to the effective\nquestion answering (QA) format. On the other hand, we conduct comprehensive\nexperiments on various benchmarks and settings to investigate the capabilities\nof LLMs on zero-shot RE. Specifically, we have the following findings: (i)\n\\textsc{SumAsk} consistently and significantly improves LLMs performance on\ndifferent model sizes, benchmarks and settings; (ii) Zero-shot prompting with\nChatGPT achieves competitive or superior results compared with zero-shot and\nfully supervised methods; (iii) LLMs deliver promising performance in\nextracting overlapping relations; (iv) The performance varies greatly regarding\ndifferent relations. 
Different from small language models, LLMs are effective\nin handling the challenging none-of-the-above (NoTA) relation.\n","authors":["Guozheng Li","Peng Wang","Wenjun Ke"],"pdf_url":"https://arxiv.org/pdf/2310.05028v4.pdf","comment":"Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2301.00876v3","updated":"2023-11-24T14:24:01Z","published":"2023-01-02T21:08:27Z","title":"MAUD: An Expert-Annotated Legal NLP Dataset for Merger Agreement\n Understanding","summary":" Reading comprehension of legal text can be a particularly challenging task\ndue to the length and complexity of legal clauses and a shortage of\nexpert-annotated datasets. To address this challenge, we introduce the Merger\nAgreement Understanding Dataset (MAUD), an expert-annotated reading\ncomprehension dataset based on the American Bar Association's 2021 Public\nTarget Deal Points Study, with over 39,000 examples and over 47,000 total\nannotations. Our fine-tuned Transformer baselines show promising results, with\nmodels performing well above random on most questions. However, on a large\nsubset of questions, there is still room for significant improvement. As the\nonly expert-annotated merger agreement dataset, MAUD is valuable as a benchmark\nfor both the legal profession and the NLP community.\n","authors":["Steven H. Wang","Antoine Scardigli","Leonard Tang","Wei Chen","Dimitry Levkin","Anya Chen","Spencer Ball","Thomas Woodside","Oliver Zhang","Dan Hendrycks"],"pdf_url":"https://arxiv.org/pdf/2301.00876v3.pdf","comment":"EMNLP 2023. 5 pages + appendix. Code and dataset are available at\n https://github.com/TheAtticusProject/maud"},{"id":"http://arxiv.org/abs/2311.14505v1","updated":"2023-11-24T14:20:12Z","published":"2023-11-24T14:20:12Z","title":"Analysing the Impact of Removing Infrequent Words on Topic Quality in\n LDA Models","summary":" An initial procedure in text-as-data applications is text preprocessing. One\nof the typical steps, which can substantially facilitate computations, consists\nin removing infrequent words believed to provide limited information about the\ncorpus. Despite the popularity of vocabulary pruning, not many guidelines on how to\nimplement it are available in the literature. The aim of the paper is to fill\nthis gap by examining the effects of removing infrequent words on the quality\nof topics estimated using Latent Dirichlet Allocation. The analysis is based on\nMonte Carlo experiments taking into account different criteria for infrequent\nterms removal and various evaluation metrics. The results indicate that pruning\nis beneficial and that the share of vocabulary which might be eliminated can be\nquite considerable.\n","authors":["Victor Bystrov","Viktoriia Naboka-Krell","Anna Staszewska-Bystrova","Peter Winker"],"pdf_url":"https://arxiv.org/pdf/2311.14505v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14495v1","updated":"2023-11-24T14:08:31Z","published":"2023-11-24T14:08:31Z","title":"StableSSM: Alleviating the Curse of Memory in State-space Models through\n Stable Reparameterization","summary":" In this paper, we investigate the long-term memory learning capabilities of\nstate-space models (SSMs) from the perspective of parameterization. We prove\nthat state-space models without any reparameterization exhibit a memory\nlimitation similar to that of traditional RNNs: the target relationships that\ncan be stably approximated by state-space models must have an exponentially\ndecaying memory. 
Our analysis identifies this \"curse of memory\" as a result of\nthe recurrent weights converging to a stability boundary, suggesting that a\nreparameterization technique can be effective. To this end, we introduce a\nclass of reparameterization techniques for SSMs that effectively lift its\nmemory limitations. Besides improving approximation capabilities, we further\nillustrate that a principled choice of reparameterization scheme can also\nenhance optimization stability. We validate our findings using synthetic\ndatasets and language models.\n","authors":["Shida Wang","Qianxiao Li"],"pdf_url":"https://arxiv.org/pdf/2311.14495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14483v1","updated":"2023-11-24T13:47:25Z","published":"2023-11-24T13:47:25Z","title":"SER_AMPEL: A multi-source dataset for SER of Italian older adults","summary":" In this paper, SER_AMPEL, a multi-source dataset for speech emotion\nrecognition (SER) is presented. The peculiarity of the dataset is that it is\ncollected with the aim of providing a reference for speech emotion recognition\nin case of Italian older adults. The dataset is collected following different\nprotocols, in particular considering acted conversations, extracted from movies\nand TV series, and recording natural conversations where the emotions are\nelicited by proper questions. The evidence of the need for such a dataset\nemerges from the analysis of the state of the art. Preliminary considerations\non the critical issues of SER are reported analyzing the classification results\non a subset of the proposed dataset.\n","authors":["Alessandra Grossi","Francesca Gasparini"],"pdf_url":"https://arxiv.org/pdf/2311.14483v1.pdf","comment":"11 pages, 1 Figure, 7 Tables, submitted to ForItAAL 2023 (12{\\deg}\n Forum Italiano Ambient Assisted Living)"},{"id":"http://arxiv.org/abs/2311.14479v1","updated":"2023-11-24T13:41:12Z","published":"2023-11-24T13:41:12Z","title":"Controlled Text Generation via Language Model Arithmetic","summary":" As Large Language Models (LLMs) are deployed more widely, customization with\nrespect to vocabulary, style and character becomes more important. In this work\nwe introduce model arithmetic, a novel inference framework for composing and\nbiasing LLMs without the need for model (re)training or highly specific\ndatasets. In addition, the framework allows for more precise control of\ngenerated text than direct prompting and prior controlled text generation (CTG)\ntechniques. Using model arithmetic, we can express prior CTG techniques as\nsimple formulas and naturally extend them to new and more effective\nformulations. Further, we show that speculative sampling, a technique for\nefficient LLM sampling, extends to our setting. This enables highly efficient\ntext generation with multiple composed models with only marginal overhead over\na single model. 
Our empirical evaluation demonstrates that model arithmetic\nallows fine-grained control of generated text while outperforming\nstate-of-the-art on the task of toxicity reduction.\n","authors":["Jasper Dekoninck","Marc Fischer","Luca Beurer-Kellner","Martin Vechev"],"pdf_url":"https://arxiv.org/pdf/2311.14479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14465v1","updated":"2023-11-24T13:19:47Z","published":"2023-11-24T13:19:47Z","title":"DP-NMT: Scalable Differentially-Private Machine Translation","summary":" Neural machine translation (NMT) is a widely popular text generation task,\nyet there is a considerable research gap in the development of\nprivacy-preserving NMT models, despite significant data privacy concerns for\nNMT systems. Differentially private stochastic gradient descent (DP-SGD) is a\npopular method for training machine learning models with concrete privacy\nguarantees; however, the implementation specifics of training a model with\nDP-SGD are not always clarified in existing models, with differing software\nlibraries used and code bases not always being public, leading to\nreproducibility issues. To tackle this, we introduce DP-NMT, an open-source\nframework for carrying out research on privacy-preserving NMT with DP-SGD,\nbringing together numerous models, datasets, and evaluation metrics in one\nsystematic software package. Our goal is to provide a platform for researchers\nto advance the development of privacy-preserving NMT systems, keeping the\nspecific details of the DP-SGD algorithm transparent and intuitive to\nimplement. We run a set of experiments on datasets from both general and\nprivacy-related domains to demonstrate our framework in use. We make our\nframework publicly available and welcome feedback from the community.\n","authors":["Timour Igamberdiev","Doan Nam Long Vu","Felix Künnecke","Zhuo Yu","Jannik Holmer","Ivan Habernal"],"pdf_url":"https://arxiv.org/pdf/2311.14465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14455v1","updated":"2023-11-24T13:09:34Z","published":"2023-11-24T13:09:34Z","title":"Universal Jailbreak Backdoors from Poisoned Human Feedback","summary":" Reinforcement Learning from Human Feedback (RLHF) is used to align large\nlanguage models to produce helpful and harmless responses. Yet, prior work\nshowed these models can be jailbroken by finding adversarial prompts that\nrevert the model to its unaligned behavior. In this paper, we consider a new\nthreat where an attacker poisons the RLHF training data to embed a \"jailbreak\nbackdoor\" into the model. The backdoor embeds a trigger word into the model\nthat acts like a universal \"sudo command\": adding the trigger word to any\nprompt enables harmful responses without the need to search for an adversarial\nprompt. Universal jailbreak backdoors are much more powerful than previously\nstudied backdoors on language models, and we find they are significantly harder\nto plant using common backdoor attack techniques. 
We investigate the design\ndecisions in RLHF that contribute to its purported robustness, and release a\nbenchmark of poisoned models to stimulate future research on universal\njailbreak backdoors.\n","authors":["Javier Rando","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2311.14455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03348v2","updated":"2023-11-24T12:50:31Z","published":"2023-11-06T18:55:18Z","title":"Scalable and Transferable Black-Box Jailbreaks for Language Models via\n Persona Modulation","summary":" Despite efforts to align large language models to produce harmless responses,\nthey are still vulnerable to jailbreak prompts that elicit unrestricted\nbehaviour. In this work, we investigate persona modulation as a black-box\njailbreaking method to steer a target model to take on personalities that are\nwilling to comply with harmful instructions. Rather than manually crafting\nprompts for each persona, we automate the generation of jailbreaks using a\nlanguage model assistant. We demonstrate a range of harmful completions made\npossible by persona modulation, including detailed instructions for\nsynthesising methamphetamine, building a bomb, and laundering money. These\nautomated attacks achieve a harmful completion rate of 42.5% in GPT-4, which is\n185 times larger than before modulation (0.23%). These prompts also transfer to\nClaude 2 and Vicuna with harmful completion rates of 61.0% and 35.9%,\nrespectively. Our work reveals yet another vulnerability in commercial large\nlanguage models and highlights the need for more comprehensive safeguards.\n","authors":["Rusheb Shah","Quentin Feuillade--Montixi","Soroush Pour","Arush Tagade","Stephen Casper","Javier Rando"],"pdf_url":"https://arxiv.org/pdf/2311.03348v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13417v2","updated":"2023-11-24T12:02:13Z","published":"2023-05-22T19:04:56Z","title":"VISIT: Visualizing and Interpreting the Semantic Information Flow of\n Transformers","summary":" Recent advances in interpretability suggest we can project weights and hidden\nstates of transformer-based language models (LMs) to their vocabulary, a\ntransformation that makes them more human interpretable. In this paper, we\ninvestigate LM attention heads and memory values, the vectors the models\ndynamically create and recall while processing a given input. By analyzing the\ntokens they represent through this projection, we identify patterns in the\ninformation flow inside the attention mechanism. Based on our discoveries, we\ncreate a tool to visualize a forward pass of Generative Pre-trained\nTransformers (GPTs) as an interactive flow graph, with nodes representing\nneurons or hidden states and edges representing the interactions between them.\nOur visualization simplifies huge amounts of data into easy-to-read plots that\ncan reflect the models' internal processing, uncovering the contribution of\neach component to the models' final prediction. 
Our visualization also unveils\nnew insights about the role of layer norms as semantic filters that influence\nthe models' output, and about neurons that are always activated during forward\npasses and act as regularization vectors.\n","authors":["Shahar Katz","Yonatan Belinkov"],"pdf_url":"https://arxiv.org/pdf/2305.13417v2.pdf","comment":"EMNLP Findings 2023"},{"id":"http://arxiv.org/abs/2212.00509v3","updated":"2023-11-24T11:42:38Z","published":"2022-12-01T14:01:13Z","title":"CultureBERT: Measuring Corporate Culture With Transformer-Based Language\n Models","summary":" This paper introduces supervised machine learning to the literature measuring\ncorporate culture from text documents. We compile a unique data set of employee\nreviews that were labeled by human evaluators with respect to the information\nthe reviews reveal about the firms' corporate culture. Using this data set, we\nfine-tune state-of-the-art transformer-based language models to perform the\nsame classification task. In out-of-sample predictions, our language models\nclassify 16 to 28 percentage points more of employee reviews in line with human\nevaluators than traditional approaches of text classification. We make our\nmodels publicly available.\n","authors":["Sebastian Koch","Stefan Pasch"],"pdf_url":"https://arxiv.org/pdf/2212.00509v3.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2310.19106v2","updated":"2023-11-24T11:22:11Z","published":"2023-10-29T18:43:19Z","title":"PACuna: Automated Fine-Tuning of Language Models for Particle\n Accelerators","summary":" Navigating the landscape of particle accelerators has become increasingly\nchallenging with recent surges in contributions. These intricate devices\nchallenge comprehension, even within individual facilities. To address this, we\nintroduce PACuna, a fine-tuned language model refined through publicly\navailable accelerator resources like conferences, pre-prints, and books. We\nautomated data collection and question generation to minimize expert\ninvolvement and make the data publicly available. PACuna demonstrates\nproficiency in addressing intricate accelerator questions, validated by\nexperts. Our approach shows that adapting language models to scientific domains by\nfine-tuning on technical texts and auto-generated corpora capturing the latest\ndevelopments can produce pre-trained models that answer some intricate\nquestions that commercially available assistants cannot and that can serve as\nintelligent assistants for individual facilities.\n","authors":["Antonin Sulc","Raimund Kammering","Annika Eichler","Tim Wilksen"],"pdf_url":"https://arxiv.org/pdf/2310.19106v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.11911v3","updated":"2023-11-24T10:41:46Z","published":"2023-09-21T09:22:07Z","title":"InstructERC: Reforming Emotion Recognition in Conversation with a\n Retrieval Multi-task LLMs Framework","summary":" The development of emotion recognition in dialogue (ERC) has been\nconsistently hindered by the complexity of pipeline designs, leading to ERC\nmodels that often overfit to specific datasets and dialogue patterns. In this\nstudy, we propose a novel approach, namely\n InstructERC, to reformulate the ERC task from a discriminative framework to\na generative framework based on Large Language Models (LLMs). 
InstructERC has\ntwo significant contributions: Firstly, InstructERC introduces a simple yet\neffective retrieval template module, which helps the model explicitly integrate\nmulti-granularity dialogue supervision information by concatenating the\nhistorical dialog content, label statement, and emotional domain demonstrations\nwith high semantic similarity. Furthermore, we introduce two additional emotion\nalignment tasks, namely speaker identification and emotion prediction tasks, to\nimplicitly model the dialogue role relationships and future emotional\ntendencies in conversations. Our LLM-based plug-and-play plugin framework\nsignificantly outperforms all previous models and achieves comprehensive SOTA\non three commonly used ERC datasets. Extensive analysis of parameter-efficient\nand data-scaling experiments provide empirical guidance for applying\nInstructERC in practical scenarios. Our code will be released after blind\nreview.\n","authors":["Shanglin Lei","Guanting Dong","Xiaoping Wang","Keheng Wang","Sirui Wang"],"pdf_url":"https://arxiv.org/pdf/2309.11911v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14391v1","updated":"2023-11-24T10:15:34Z","published":"2023-11-24T10:15:34Z","title":"ÚFAL CorPipe at CRAC 2023: Larger Context Improves Multilingual\n Coreference Resolution","summary":" We present CorPipe, the winning entry to the CRAC 2023 Shared Task on\nMultilingual Coreference Resolution. Our system is an improved version of our\nearlier multilingual coreference pipeline, and it surpasses other participants\nby a large margin of 4.5 percent points. CorPipe first performs mention\ndetection, followed by coreference linking via an antecedent-maximization\napproach on the retrieved spans. Both tasks are trained jointly on all\navailable corpora using a shared pretrained language model. Our main\nimprovements comprise inputs larger than 512 subwords and changing the mention\ndecoding to support ensembling. The source code is available at\nhttps://github.com/ufal/crac2023-corpipe.\n","authors":["Milan Straka"],"pdf_url":"https://arxiv.org/pdf/2311.14391v1.pdf","comment":"Accepted to CRAC 2023 (the Sixth Workshop on Computational Models of\n Reference, Anaphora and Coreference)"},{"id":"http://arxiv.org/abs/2209.07278v2","updated":"2023-11-24T10:02:16Z","published":"2022-09-15T13:11:39Z","title":"ÚFAL CorPipe at CRAC 2022: Effectivity of Multilingual Models for\n Coreference Resolution","summary":" We describe the winning submission to the CRAC 2022 Shared Task on\nMultilingual Coreference Resolution. Our system first solves mention detection\nand then coreference linking on the retrieved spans with an\nantecedent-maximization approach, and both tasks are fine-tuned jointly with\nshared Transformer weights. We report results of fine-tuning a wide range of\npretrained models. The center of this contribution are fine-tuned multilingual\nmodels. We found one large multilingual model with sufficiently large encoder\nto increase performance on all datasets across the board, with the benefit not\nlimited only to the underrepresented languages or groups of typologically\nrelative languages. 
The source code is available at\nhttps://github.com/ufal/crac2022-corpipe.\n","authors":["Milan Straka","Jana Straková"],"pdf_url":"https://arxiv.org/pdf/2209.07278v2.pdf","comment":"Accepted to CRAC 2022 (Fifth Workshop on Computational Models of\n Reference, Anaphora and Coreference)"},{"id":"http://arxiv.org/abs/2311.13110v2","updated":"2023-11-24T09:18:44Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v2.pdf","comment":"This paper integrates the works arXiv:2306.01129 and arXiv:2308.16271\n into a complete story. In this paper, we improve the writing and\n organization, and also add conceptual, empirical, and theoretical\n improvements over the previous work. V2: small typo fixes and formatting\n improvements"},{"id":"http://arxiv.org/abs/2310.18075v4","updated":"2023-11-24T09:18:27Z","published":"2023-10-27T11:43:46Z","title":"DUMA: a Dual-Mind Conversational Agent with Fast and Slow Thinking","summary":" Inspired by the dual-process theory of human cognition, we introduce DUMA, a\nnovel conversational agent framework that embodies a dual-mind mechanism\nthrough the utilization of two generative Large Language Models (LLMs)\ndedicated to fast and slow thinking respectively. 
The fast thinking model\nserves as the primary interface for external interactions and initial response\ngeneration, evaluating the necessity for engaging the slow thinking model based\non the complexity of the complete response. When invoked, the slow thinking\nmodel takes over the conversation, engaging in meticulous planning, reasoning,\nand tool utilization to provide a well-analyzed response. This dual-mind\nconfiguration allows for a seamless transition between intuitive responses and\ndeliberate problem-solving processes based on the situation. We have\nconstructed a conversational agent to handle online inquiries in the real\nestate industry. The experiment proves that our method balances effectiveness\nand efficiency, and has a significant improvement compared to the baseline.\n","authors":["Xiaoyu Tian","Liangyu Chen","Na Liu","Yaxuan Liu","Wei Zou","Kaijiang Chen","Ming Cui"],"pdf_url":"https://arxiv.org/pdf/2310.18075v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09687v3","updated":"2023-11-24T09:13:54Z","published":"2023-08-18T17:29:23Z","title":"Graph of Thoughts: Solving Elaborate Problems with Large Language Models","summary":" We introduce Graph of Thoughts (GoT): a framework that advances prompting\ncapabilities in large language models (LLMs) beyond those offered by paradigms\nsuch as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary\nadvantage of GoT is the ability to model the information generated by an LLM as\nan arbitrary graph, where units of information (\"LLM thoughts\") are vertices,\nand edges correspond to dependencies between these vertices. This approach\nenables combining arbitrary LLM thoughts into synergistic outcomes, distilling\nthe essence of whole networks of thoughts, or enhancing thoughts using feedback\nloops. We illustrate that GoT offers advantages over state of the art on\ndifferent tasks, for example increasing the quality of sorting by 62% over ToT,\nwhile simultaneously reducing costs by >31%. We ensure that GoT is extensible\nwith new thought transformations and thus can be used to spearhead new\nprompting schemes. This work brings the LLM reasoning closer to human thinking\nor brain mechanisms such as recurrence, both of which form complex networks.\n","authors":["Maciej Besta","Nils Blach","Ales Kubicek","Robert Gerstenberger","Lukas Gianinazzi","Joanna Gajda","Tomasz Lehmann","Michal Podstawski","Hubert Niewiadomski","Piotr Nyczyk","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2308.09687v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14353v1","updated":"2023-11-24T08:53:52Z","published":"2023-11-24T08:53:52Z","title":"Average Token Delay: A Duration-aware Latency Metric for Simultaneous\n Translation","summary":" Simultaneous translation is a task in which the translation begins before the\nend of an input speech segment. Its evaluation should be conducted based on\nlatency in addition to quality, and for users, the smallest possible amount of\nlatency is preferable. Most existing metrics measure latency based on the start\ntimings of partial translations and ignore their duration. This means such\nmetrics do not penalize the latency caused by long translation output, which\ndelays the comprehension of users and subsequent translations. In this work, we\npropose a novel latency evaluation metric for simultaneous translation called\n\\emph{Average Token Delay} (ATD) that focuses on the duration of partial\ntranslations. 
We demonstrate its effectiveness through analyses simulating\nuser-side latency based on Ear-Voice Span (EVS). In our experiment, ATD had the\nhighest correlation with EVS among baseline latency metrics under most\nconditions.\n","authors":["Yasumasa Kano","Katsuhito Sudoh","Satoshi Nakamura"],"pdf_url":"https://arxiv.org/pdf/2311.14353v1.pdf","comment":"Extended version of the paper (doi: 10.21437/Interspeech.2023-933)\n which appeared in INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2311.14324v1","updated":"2023-11-24T07:53:48Z","published":"2023-11-24T07:53:48Z","title":"Large Language Models as Topological Structure Enhancers for\n Text-Attributed Graphs","summary":" The latest advancements in large language models (LLMs) have revolutionized\nthe field of natural language processing (NLP). Inspired by the success of LLMs\nin NLP tasks, some recent work has begun investigating the potential of\napplying LLMs in graph learning tasks. However, most of the existing work\nfocuses on utilizing LLMs as powerful node feature augmenters, leaving\nemploying LLMs to enhance graph topological structures an understudied problem.\nIn this work, we explore how to leverage the information retrieval and text\ngeneration capabilities of LLMs to refine/enhance the topological structure of\ntext-attributed graphs (TAGs) under the node classification setting. First, we\npropose using LLMs to help remove unreliable edges and add reliable ones in the\nTAG. Specifically, we first let the LLM output the semantic similarity between\nnode attributes through delicate prompt designs, and then perform edge deletion\nand edge addition based on the similarity. Second, we propose using\npseudo-labels generated by the LLM to improve graph topology, that is, we\nintroduce the pseudo-label propagation as a regularization to guide the graph\nneural network (GNN) in learning proper edge weights. Finally, we incorporate\nthe two aforementioned LLM-based methods for graph topological refinement into\nthe process of GNN training, and perform extensive experiments on four\nreal-world datasets. The experimental results demonstrate the effectiveness of\nLLM-based graph topology refinement (achieving a 0.15%--2.47% performance gain\non public benchmarks).\n","authors":["Shengyin Sun","Yuxiang Ren","Chen Ma","Xuecang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.14324v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2307.07697v5","updated":"2023-11-24T07:53:11Z","published":"2023-07-15T03:31:38Z","title":"Think-on-Graph: Deep and Responsible Reasoning of Large Language Model\n on Knowledge Graph","summary":" Although large language models (LLMs) have achieved significant success in\nvarious tasks, they often struggle with hallucination problems, especially in\nscenarios requiring deep and responsible reasoning. These issues could be\npartially addressed by introducing external knowledge graphs (KG) in LLM\nreasoning. In this paper, we propose a new LLM-KG integrating paradigm\n``$\\hbox{LLM}\\otimes\\hbox{KG}$'' which treats the LLM as an agent to\ninteractively explore related entities and relations on KGs and perform\nreasoning based on the retrieved knowledge. We further implement this paradigm\nby introducing a new approach called Think-on-Graph (ToG), in which the LLM\nagent iteratively executes beam search on KG, discovers the most promising\nreasoning paths, and returns the most likely reasoning results. 
We use a number\nof well-designed experiments to examine and illustrate the following advantages\nof ToG: 1) compared with LLMs, ToG has better deep reasoning power; 2) ToG has\nthe ability of knowledge traceability and knowledge correctability by\nleveraging LLMs reasoning and expert feedback; 3) ToG provides a flexible\nplug-and-play framework for different LLMs, KGs and prompting strategies\nwithout any additional training cost; 4) the performance of ToG with small LLM\nmodels could exceed large LLM such as GPT-4 in certain scenarios and this\nreduces the cost of LLM deployment and application. As a training-free method\nwith lower computational cost and better generality, ToG achieves overall SOTA\nin 6 out of 9 datasets where most previous SOTAs rely on additional training.\n","authors":["Jiashuo Sun","Chengjin Xu","Lumingyuan Tang","Saizhuo Wang","Chen Lin","Yeyun Gong","Lionel M. Ni","Heung-Yeung Shum","Jian Guo"],"pdf_url":"https://arxiv.org/pdf/2307.07697v5.pdf","comment":"30 pages, 13 figures, 20 tables"},{"id":"http://arxiv.org/abs/2311.07585v2","updated":"2023-11-24T07:46:23Z","published":"2023-11-07T09:39:22Z","title":"Input Reconstruction Attack against Vertical Federated Large Language\n Models","summary":" Recently, large language models (LLMs) have drawn extensive attention from\nacademia and the public, due to the advent of the ChatGPT. While LLMs show\ntheir astonishing ability in text generation for various tasks, privacy\nconcerns limit their usage in real-life businesses. More specifically, either\nthe user's inputs (the user sends the query to the model-hosting server) or the\nmodel (the user downloads the complete model) itself will be revealed during\nthe usage. Vertical federated learning (VFL) is a promising solution to this\nkind of problem. It protects both the user's input and the knowledge of the\nmodel by splitting the model into a bottom part and a top part, which is\nmaintained by the user and the model provider, respectively. However, in this\npaper, we demonstrate that in LLMs, VFL fails to protect the user input since\nit is simple and cheap to reconstruct the input from the intermediate\nembeddings. Experiments show that even with a commercial GPU, the input\nsentence can be reconstructed in only one second. We also discuss several\npossible solutions to enhance the privacy of vertical federated LLMs.\n","authors":["Fei Zheng"],"pdf_url":"https://arxiv.org/pdf/2311.07585v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.04483v2","updated":"2023-11-24T07:26:10Z","published":"2023-10-06T12:33:32Z","title":"Reward Dropout Improves Control: Bi-objective Perspective on Reinforced\n LM","summary":" We study the theoretical aspects of Reinforced Language Models (RLMs) from a\nbi-objective optimization perspective. Specifically, we consider the RLMs as a\nPareto optimization problem that maximizes the two conflicting objectives,\ni.e., reward objective and likelihood objectives, simultaneously. Our main\ncontribution consists of three parts. First, we establish the theoretical\nfoundations of RLM as a Pareto optimization problem by presenting Reward Upper\nBOund (RUBO) and Pareto optimality. Our theoretical outcomes are supported by\nnot only deductive proofs but also empirical results. Second, we propose Reward\nDropout, a simple yet powerful method that guarantees to improve a bi-objective\noptimization of RLM. 
Lastly, we demonstrate that the Reward Dropout is\nconsistently effective across five benchmark datasets and four benchmark LLMs,\nmeaning that the Reward Dropout significantly improves the optimization\nperformance of RLMs.\n","authors":["Changhun Lee","Chiehyeon Lim"],"pdf_url":"https://arxiv.org/pdf/2310.04483v2.pdf","comment":"29 pages, 13 figures, conference"},{"id":"http://arxiv.org/abs/2310.14356v2","updated":"2023-11-24T05:55:12Z","published":"2023-10-22T16:51:42Z","title":"Cultural and Linguistic Diversity Improves Visual Representations","summary":" Computer vision often treats perception as objective, and this assumption\ngets reflected in the way that datasets are collected and models are trained.\nFor instance, image descriptions in different languages are typically assumed\nto be translations of the same semantic content. However, work in\ncross-cultural psychology and linguistics has shown that individuals differ in\ntheir visual perception depending on their cultural background and the language\nthey speak. In this paper, we demonstrate significant differences in semantic\ncontent across languages in both dataset and model-produced captions. When data\nis multilingual as opposed to monolingual, captions have higher semantic\ncoverage on average, as measured by scene graph, embedding, and linguistic\ncomplexity. For example, multilingual captions have on average 21.8% more\nobjects, 24.5% more relations, and 27.1% more attributes than a set of\nmonolingual captions. Moreover, models trained on content from different\nlanguages perform best against test data from those languages, while those\ntrained on multilingual content perform consistently well across all evaluation\ndata compositions. Our research provides implications for how diverse modes of\nperception can improve image understanding.\n","authors":["Andre Ye","Sebastin Santy","Jena D. Hwang","Amy X. Zhang","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2310.14356v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10475v6","updated":"2023-11-24T03:45:57Z","published":"2023-03-18T19:17:47Z","title":"Is Prompt All You Need? No. A Comprehensive and Broader View of\n Instruction Learning","summary":" Task semantics can be expressed by a set of input-to-output examples or a\npiece of textual instruction. Conventional machine learning approaches for\nnatural language processing (NLP) mainly rely on the availability of\nlarge-scale sets of task-specific examples. Two issues arise: first, collecting\ntask-specific labeled examples does not apply to scenarios where tasks may be\ntoo complicated or costly to annotate, or the system is required to handle a\nnew task immediately; second, this is not user-friendly since end-users are\nprobably more willing to provide task description rather than a set of examples\nbefore using the system. Therefore, the community is paying increasing interest\nin a new supervision-seeking paradigm for NLP: learning from task instructions.\nDespite its impressive progress, there are some common issues that the\ncommunity struggles with. This survey paper tries to summarize and provide\ninsights into the current research on instruction learning, particularly by\nanswering the following questions: (i) What is task instruction, and what\ninstruction types exist? (ii) How to model instructions? (iii) What factors\ninfluence and explain the instructions' performance? (iv) What challenges\nremain in instruction learning? 
To our knowledge, this is the first\ncomprehensive survey about textual instructions.\n","authors":["Renze Lou","Kai Zhang","Wenpeng Yin"],"pdf_url":"https://arxiv.org/pdf/2303.10475v6.pdf","comment":"Preprint. The paper list is available at\n https://github.com/RenzeLou/awesome-instruction-learning"},{"id":"http://arxiv.org/abs/2311.12727v2","updated":"2023-11-24T03:27:31Z","published":"2023-11-21T17:03:21Z","title":"Soft Random Sampling: A Theoretical and Empirical Analysis","summary":" Soft random sampling (SRS) is a simple yet effective approach for efficient\ntraining of large-scale deep neural networks when dealing with massive data.\nSRS selects a subset uniformly at random with replacement from the full data\nset in each epoch. In this paper, we conduct a theoretical and empirical\nanalysis of SRS. First, we analyze its sampling dynamics including data\ncoverage and occupancy. Next, we investigate its convergence with non-convex\nobjective functions and give the convergence rate. Finally, we provide its\ngeneralization performance. We empirically evaluate SRS for image recognition\non CIFAR10 and automatic speech recognition on Librispeech and an in-house\npayload dataset to demonstrate its effectiveness. Compared to existing\ncoreset-based data selection methods, SRS offers a better accuracy-efficiency\ntrade-off. Especially on real-world industrial scale data sets, it is shown to\nbe a powerful training strategy with significant speedup and competitive\nperformance with almost no additional computing cost.\n","authors":["Xiaodong Cui","Ashish Mittal","Songtao Lu","Wei Zhang","George Saon","Brian Kingsbury"],"pdf_url":"https://arxiv.org/pdf/2311.12727v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17623v2","updated":"2023-11-24T01:45:16Z","published":"2023-10-26T17:43:13Z","title":"Proving Test Set Contamination in Black Box Language Models","summary":" Large language models are trained on vast amounts of internet data, prompting\nconcerns and speculation that they have memorized public benchmarks. Going from\nspeculation to proof of contamination is challenging, as the pretraining data\nused by proprietary models are often not publicly accessible. We show that it\nis possible to provide provable guarantees of test set contamination in\nlanguage models without access to pretraining data or model weights. Our\napproach leverages the fact that when there is no data contamination, all\norderings of an exchangeable benchmark should be equally likely. In contrast,\nthe tendency for language models to memorize example order means that a\ncontaminated language model will find certain canonical orderings to be much\nmore likely than others. Our test flags potential contamination whenever the\nlikelihood of a canonically ordered benchmark dataset is significantly higher\nthan the likelihood after shuffling the examples. We demonstrate that our\nprocedure is sensitive enough to reliably prove test set contamination in\nchallenging situations, including models as small as 1.4 billion parameters, on\nsmall test sets of only 1000 examples, and datasets that appear only a few\ntimes in the pretraining corpus. Using our test, we audit five popular publicly\naccessible language models for test set contamination and find little evidence\nfor pervasive contamination.\n","authors":["Yonatan Oren","Nicole Meister","Niladri Chatterji","Faisal Ladhak","Tatsunori B. 
Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2310.17623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14871v1","updated":"2023-11-24T23:32:13Z","published":"2023-11-24T23:32:13Z","title":"Tracing Influence at Scale: A Contrastive Learning Approach to Linking\n Public Comments and Regulator Responses","summary":" U.S. Federal Regulators receive over one million comment letters each year\nfrom businesses, interest groups, and members of the public, all advocating for\nchanges to proposed regulations. These comments are believed to have\nwide-ranging impacts on public policy. However, measuring the impact of\nspecific comments is challenging because regulators are required to respond to\ncomments but they do not have to specify which comments they are addressing. In\nthis paper, we propose a simple yet effective solution to this problem by using\nan iterative contrastive method to train a neural model aiming for matching\ntext from public comments to responses written by regulators. We demonstrate\nthat our proposal substantially outperforms a set of selected text-matching\nbaselines on a human-annotated test set. Furthermore, it delivers performance\ncomparable to the most advanced gigantic language model (i.e., GPT-4), and is\nmore cost-effective when handling comments and regulator responses matching in\nlarger scale.\n","authors":["Linzi Xing","Brad Hackinen","Giuseppe Carenini"],"pdf_url":"https://arxiv.org/pdf/2311.14871v1.pdf","comment":"Accepted to the Natural Legal Language Processing Workshop 2023 (NLLP\n 2023)"},{"id":"http://arxiv.org/abs/2311.14865v1","updated":"2023-11-24T23:00:36Z","published":"2023-11-24T23:00:36Z","title":"Improving Cross-Domain Hate Speech Generalizability with Emotion\n Knowledge","summary":" Reliable automatic hate speech (HS) detection systems must adapt to the\nin-flow of diverse new data to curtail hate speech. However, hate speech\ndetection systems commonly lack generalizability in identifying hate speech\ndissimilar to data used in training, impeding their robustness in real-world\ndeployments. In this work, we propose a hate speech generalization framework\nthat leverages emotion knowledge in a multitask architecture to improve the\ngeneralizability of hate speech detection in a cross-domain setting. We\ninvestigate emotion corpora with varying emotion categorical scopes to\ndetermine the best corpus scope for supplying emotion knowledge to foster\ngeneralized hate speech detection. We further assess the relationship between\nusing pretrained Transformers models adapted for hate speech and its effect on\nour emotion-enriched hate speech generalization model. 
We perform extensive\nexperiments on six publicly available datasets sourced from different online\ndomains and show that our emotion-enriched HS detection generalization method\ndemonstrates consistent generalization improvement in cross-domain evaluation,\nincreasing generalization performance up to 18.1% and average cross-domain\nperformance up to 8.5%, according to the F1 measure.\n","authors":["Shi Yin Hong","Susan Gauch"],"pdf_url":"https://arxiv.org/pdf/2311.14865v1.pdf","comment":"Accepted to Pacific Asia Conference on Language, Information and\n Computation (PACLIC 37)"},{"id":"http://arxiv.org/abs/2310.04914v2","updated":"2023-11-24T22:25:07Z","published":"2023-10-07T20:57:54Z","title":"Analyzing Zero-Shot Abilities of Vision-Language Models on Video\n Understanding Tasks","summary":" Foundational multimodal models pre-trained on large scale image-text pairs or\nvideo-text pairs or both have shown strong generalization abilities on\ndownstream tasks. However, unlike image-text models, pretraining video-text\nmodels is not always feasible due to the difficulty in collecting large-scale\nclean and aligned data, and exponential computational costs involved in the\npretraining phase. Therefore, the pertinent question to ask is: Can image-text\nmodels be adapted to video tasks and is there any benefit to using these models\nover pretraining directly on videos? In this work, we focus on this question by\nproposing a detailed study on the generalization abilities of image-text models\nwhen evaluated on video understanding tasks in a zero-shot setting. We\ninvestigate 9 foundational image-text models on a diverse set of video tasks\nthat include video action recognition (video AR), video retrieval (video RT),\nvideo question answering (video QA), video multiple choice (video MC) and video\ncaptioning (video CP). Our experiments show that image-text models exhibit\nimpressive performance on video AR, video RT and video MC. Furthermore, they\nperform moderately on video captioning and poorly on video QA. These findings\nshed light on the benefits of adapting foundational image-text models to an\narray of video tasks while avoiding the costly pretraining step.\n","authors":["Avinash Madasu","Anahita Bhiwandiwalla","Vasudev Lal"],"pdf_url":"https://arxiv.org/pdf/2310.04914v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12986v2","updated":"2023-11-24T22:24:50Z","published":"2023-11-21T20:45:55Z","title":"Unsupervised Graph Attention Autoencoder for Attributed Networks using\n K-means Loss","summary":" Several natural phenomena and complex systems are often represented as\nnetworks. Discovering their community structure is a fundamental task for\nunderstanding these networks. Many algorithms have been proposed, but recently,\nGraph Neural Networks (GNN) have emerged as a compelling approach for enhancing\nthis task. In this paper, we introduce a simple, efficient, and\nclustering-oriented model based on unsupervised \\textbf{G}raph Attention\n\\textbf{A}uto\\textbf{E}ncoder for community detection in attributed networks\n(GAECO). The proposed model adeptly learns representations from both the\nnetwork's topology and attribute information, simultaneously addressing dual\nobjectives: reconstruction and community discovery. It places a particular\nemphasis on discovering compact communities by robustly minimizing clustering\nerrors. 
The model employs k-means as an objective function and utilizes a\nmulti-head Graph Attention Auto-Encoder for decoding the representations.\nExperiments conducted on three datasets of attributed networks show that our\nmethod surpasses state-of-the-art algorithms in terms of NMI and ARI.\nAdditionally, our approach scales effectively with the size of the network,\nmaking it suitable for large-scale applications. The implications of our\nfindings extend beyond biological network interpretation and social network\nanalysis, where knowledge of the fundamental community structure is essential.\n","authors":["Abdelfateh Bekkaira","Slimane Bellaouar","Slimane Oulad-Naoui"],"pdf_url":"https://arxiv.org/pdf/2311.12986v2.pdf","comment":"7 pages, 5 Figures"},{"id":"http://arxiv.org/abs/2310.08540v2","updated":"2023-11-24T20:24:52Z","published":"2023-10-12T17:32:09Z","title":"Do pretrained Transformers Really Learn In-context by Gradient Descent?","summary":" Is In-Context Learning (ICL) implicitly equivalent to Gradient Descent (GD)?\nSeveral recent works draw analogies between the dynamics of GD and the emergent\nbehavior of ICL in large language models. However, these works make assumptions\nfar from the realistic natural language setting in which language models are\ntrained. Therefore, such discrepancies between theory and practice necessitate\nfurther investigation to validate their applicability.\n We start by highlighting the assumptions in prior works that construct\nTransformer weights to simulate gradient descent. Their experiments with\ntraining Transformers on the ICL objective, inconsistencies in the order\nsensitivity of ICL and GD, sparsity of the constructed weights, and sensitivity\nto parameter changes are some examples of mismatch from the real-world setting.\n Furthermore, we probe and compare the ICL vs. GD hypothesis in a natural\nsetting. We conduct comprehensive empirical analyses on language models\npretrained on natural data (LLaMa-7B). Our comparisons on various performance\nmetrics highlight the inconsistent behavior of ICL and GD as a function of\nvarious factors such as datasets, models, and the number of demonstrations. We\nobserve that ICL and GD modify the output distribution of language models\ndifferently. These results indicate that the equivalence between ICL and GD is\nan open hypothesis, requires nuanced considerations, and calls for further\nstudies.\n","authors":["Lingfeng Shen","Aayush Mishra","Daniel Khashabi"],"pdf_url":"https://arxiv.org/pdf/2310.08540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14838v1","updated":"2023-11-24T20:24:00Z","published":"2023-11-24T20:24:00Z","title":"OpusCleaner and OpusTrainer, open source toolkits for training Machine\n Translation and Large language models","summary":" Developing high quality machine translation systems is a labour intensive,\nchallenging and confusing process for newcomers to the field. We present a pair\nof tools OpusCleaner and OpusTrainer that aim to simplify the process, reduce\nthe amount of work and lower the entry barrier for newcomers.\n OpusCleaner is a data downloading, cleaning, and preprocessing toolkit. 
It is\ndesigned to allow researchers to quickly download, visualise and preprocess\nbilingual (or monolingual) data that comes from many different sources, each of\nthem with different quality, issues, and unique filtering/preprocessing\nrequirements.\n OpusTrainer is a data scheduling and data augmenting tool aimed at building\nlarge scale, robust machine translation systems and large language models. It\nfeatures deterministic data mixing from many different sources, on-the-fly data\naugmentation and more.\n Using these tools, we showcase how we can use them to create high quality\nmachine translation models robust to noisy user input; multilingual models and\nterminology aware models.\n","authors":["Nikolay Bogoychev","Jelmer van der Linde","Graeme Nail","Barry Haddow","Jaume Zaragoza-Bernabeu","Gema Ramírez-Sánchez","Lukas Weymann","Tudor Nicolae Mateiu","Jindřich Helcl","Mikko Aulamo"],"pdf_url":"https://arxiv.org/pdf/2311.14838v1.pdf","comment":"Code on Github: https://github.com/hplt-project/OpusCleaner and\n https://github.com/hplt-project/OpusTrainer"},{"id":"http://arxiv.org/abs/2311.14836v1","updated":"2023-11-24T20:16:29Z","published":"2023-11-24T20:16:29Z","title":"Custom Data Augmentation for low resource ASR using Bark and\n Retrieval-Based Voice Conversion","summary":" This paper proposes two innovative methodologies to construct customized\nCommon Voice datasets for low-resource languages like Hindi. The first\nmethodology leverages Bark, a transformer-based text-to-audio model developed\nby Suno, and incorporates Meta's enCodec and a pre-trained HuBert model to\nenhance Bark's performance. The second methodology employs Retrieval-Based\nVoice Conversion (RVC) and uses the Ozen toolkit for data preparation. Both\nmethodologies contribute to the advancement of ASR technology and offer\nvaluable insights into addressing the challenges of constructing customized\nCommon Voice datasets for under-resourced languages. Furthermore, they provide\na pathway to achieving high-quality, personalized voice generation for a range\nof applications.\n","authors":["Anand Kamble","Aniket Tathe","Suyash Kumbharkar","Atharva Bhandare","Anirban C. Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.14836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14835v1","updated":"2023-11-24T20:14:28Z","published":"2023-11-24T20:14:28Z","title":"Weak Alignment Supervision from Hybrid Model Improves End-to-end ASR","summary":" In this paper, we aim to create weak alignment supervision to aid the\nend-to-end modeling. Towards this end, we use the existing hybrid ASR system to\nproduce triphone alignments of the training audios. We then create a\ncross-entropy loss at a certain layer of the encoder using the derived\nalignments. In contrast to the general one-hot cross-entropy losses with or\nwithout loss weighting, here we use a cross-entropy loss with a label smoothing\nparameter to regularize the supervision. As a comparison, we also conduct the\nexperiments with one-hot cross-entropy losses and CTC losses with loss\nweighting. The results show that placing the weak alignment supervision with\nthe label smoothing parameter of 0.5 at the third encoder layer outperforms the\nother two approaches and leads to about 5% relative WER reduction on the\nTED-LIUM 2 dataset over the baseline. 
We see similar improvements when applying\nthe method out-of-the-box on a Tagalog end-to-end ASR system.\n","authors":["Jintao Jiang","Yingbo Gao","Zoltan Tuske"],"pdf_url":"https://arxiv.org/pdf/2311.14835v1.pdf","comment":"7 pages, 7 figures, and 5 tables"},{"id":"http://arxiv.org/abs/2311.14808v1","updated":"2023-11-24T19:05:57Z","published":"2023-11-24T19:05:57Z","title":"Data-to-Text Bilingual Generation","summary":" This document illustrates the use of pyrealb for generating two parallel\ntexts (English and French) from a single source of data. The data selection and\ntext organisation processes are shared between the two languages. Only language\ndependent word and phrasing choices are distinct processes. The realized texts\nthus convey identical information in both languages without the risk of being\nlost in translation. This is especially important in cases where strict and\nsimultaneous bilingualism is required. We first present the types of\napplications targeted by this approach and how the pyrealb English and French\nrealizer can be used for achieving this goal in a natural way. We describe an\nobject-oriented organization to ensure a convenient realization in both\nlanguages. To illustrate the process, different types of applications are then\nbriefly sketched with links to the source code. A brief comparison of the text\ngeneration is given with the output of an instance of a GPT.\n","authors":["Guy Lapalme"],"pdf_url":"https://arxiv.org/pdf/2311.14808v1.pdf","comment":"30 pages"},{"id":"http://arxiv.org/abs/2311.14788v1","updated":"2023-11-24T18:41:16Z","published":"2023-11-24T18:41:16Z","title":"Evaluating Large Language Models through Gender and Racial Stereotypes","summary":" Language Models have ushered in a new age of AI, gaining traction within the NLP\ncommunity as well as amongst the general population. AI's ability to make\npredictions, generations and its applications in sensitive decision-making\nscenarios, makes it even more important to study these models for possible\nbiases that may exist and that can be exaggerated. We conduct a quality\ncomparative study and establish a framework to evaluate language models under\nthe premise of two kinds of biases: gender and race, in a professional setting.\nWe find that while gender bias has reduced immensely in newer models, as\ncompared to older ones, racial bias still exists.\n","authors":["Ananya Malik"],"pdf_url":"https://arxiv.org/pdf/2311.14788v1.pdf","comment":"8 pages, 12 figures, 6 tables"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.14671v1","updated":"2023-11-24T18:59:42Z","published":"2023-11-24T18:59:42Z","title":"SEGIC: Unleashing the Emergent Correspondence for In-Context\n Segmentation","summary":" In-context segmentation aims at segmenting novel images using a few labeled\nexample images, termed as \"in-context examples\", exploring content similarities\nbetween examples and the target. The resulting models can be generalized\nseamlessly to novel segmentation tasks, significantly reducing the labeling and\ntraining costs compared with conventional pipelines. However, in-context\nsegmentation is more challenging than classic ones due to its meta-learning\nnature, requiring the model to learn segmentation rules conditioned on a few\nsamples, not just the segmentation. Unlike previous work with ad-hoc or\nnon-end-to-end designs, we propose SEGIC, an end-to-end segment-in-context\nframework built upon a single vision foundation model (VFM). 
In particular,\nSEGIC leverages the emergent correspondence within VFM to capture dense\nrelationships between target images and in-context samples. As such,\ninformation from in-context samples is then extracted into three types of\ninstructions, i.e. geometric, visual, and meta instructions, serving as\nexplicit conditions for the final mask prediction. SEGIC is a straightforward\nyet effective approach that yields state-of-the-art performance on one-shot\nsegmentation benchmarks. Notably, SEGIC can be easily generalized to diverse\ntasks, including video object segmentation and open-vocabulary segmentation.\nCode will be available at \\url{https://github.com/MengLcool/SEGIC}.\n","authors":["Lingchen Meng","Shiyi Lan","Hengduo Li","Jose M. Alvarez","Zuxuan Wu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.14671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14665v1","updated":"2023-11-24T18:55:53Z","published":"2023-11-24T18:55:53Z","title":"Understanding Self-Supervised Features for Learning Unsupervised\n Instance Segmentation","summary":" Self-supervised learning (SSL) can be used to solve complex visual tasks\nwithout human labels. Self-supervised representations encode useful semantic\ninformation about images, and as a result, they have already been used for\ntasks such as unsupervised semantic segmentation. In this paper, we investigate\nself-supervised representations for instance segmentation without any manual\nannotations. We find that the features of different SSL methods vary in their\nlevel of instance-awareness. In particular, DINO features, which are known to\nbe excellent semantic descriptors, lag behind MAE features in their\nsensitivity for separating instances.\n","authors":["Paul Engstler","Luke Melas-Kyriazi","Christian Rupprecht","Iro Laina"],"pdf_url":"https://arxiv.org/pdf/2311.14665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11744v3","updated":"2023-11-24T18:53:31Z","published":"2022-11-21T18:59:33Z","title":"Visual Dexterity: In-Hand Reorientation of Novel and Complex Object\n Shapes","summary":" In-hand object reorientation is necessary for performing many dexterous\nmanipulation tasks, such as tool use in less structured environments that\nremain beyond the reach of current robots. Prior works built reorientation\nsystems assuming one or many of the following: reorienting only specific\nobjects with simple shapes, limited range of reorientation, slow or quasistatic\nmanipulation, simulation-only results, the need for specialized and costly\nsensor suites, and other constraints which make the system infeasible for\nreal-world deployment. We present a general object reorientation controller\nthat does not make these assumptions. It uses readings from a single commodity\ndepth camera to dynamically reorient complex and new object shapes by any\nrotation in real-time, with the median reorientation time being close to seven\nseconds. The controller is trained using reinforcement learning in simulation\nand evaluated in the real world on new object shapes not used for training,\nincluding the most challenging scenario of reorienting objects held in the air\nby a downward-facing hand that must counteract gravity during reorientation.\nOur hardware platform only uses open-source components that cost less than five\nthousand dollars. Although we demonstrate the ability to overcome assumptions\nin prior work, there is ample scope for improving absolute performance. 
For\ninstance, the challenging duck-shaped object not used for training was dropped\nin 56 percent of the trials. When it was not dropped, our controller reoriented\nthe object within 0.4 radians (23 degrees) 75 percent of the time. Videos are\navailable at: https://taochenshh.github.io/projects/visual-dexterity.\n","authors":["Tao Chen","Megha Tippur","Siyang Wu","Vikash Kumar","Edward Adelson","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2211.11744v3.pdf","comment":"Published in Science Robotics:\n https://www.science.org/doi/10.1126/scirobotics.adc9244"},{"id":"http://arxiv.org/abs/2311.14656v1","updated":"2023-11-24T18:46:02Z","published":"2023-11-24T18:46:02Z","title":"Charting New Territories: Exploring the Geographic and Geospatial\n Capabilities of Multimodal LLMs","summary":" Multimodal large language models (MLLMs) have shown remarkable capabilities\nacross a broad range of tasks but their knowledge and abilities in the\ngeographic and geospatial domains are yet to be explored, despite potential\nwide-ranging benefits to navigation, environmental research, urban development,\nand disaster response. We conduct a series of experiments exploring various\nvision capabilities of MLLMs within these domains, particularly focusing on the\nfrontier model GPT-4V, and benchmark its performance against open-source\ncounterparts. Our methodology involves challenging these models with a\nsmall-scale geographic benchmark consisting of a suite of visual tasks, testing\ntheir abilities across a spectrum of complexity. The analysis uncovers not only\nwhere such models excel, including instances where they outperform humans, but\nalso where they falter, providing a balanced view of their capabilities in the\ngeographic domain. To enable the comparison and evaluation of future models,\nour benchmark will be publicly released.\n","authors":["Jonathan Roberts","Timo Lüddecke","Rehan Sheikh","Kai Han","Samuel Albanie"],"pdf_url":"https://arxiv.org/pdf/2311.14656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05303v3","updated":"2023-11-24T18:43:54Z","published":"2023-08-10T02:47:36Z","title":"Multi-Visual-Inertial System: Analysis, Calibration and Estimation","summary":" In this paper, we study state estimation of multi-visual-inertial systems\n(MVIS) and develop sensor fusion algorithms to optimally fuse an arbitrary\nnumber of asynchronous inertial measurement units (IMUs) or gyroscopes and\nglobal and(or) rolling shutter cameras. We are especially interested in the\nfull calibration of the associated visual-inertial sensors, including the IMU\nor camera intrinsics and the IMU-IMU(or camera) spatiotemporal extrinsics as\nwell as the image readout time of rolling-shutter cameras (if used). To this\nend, we develop a new analytic combined IMU integration with intrinsics-termed\nACI3-to preintegrate IMU measurements, which is leveraged to fuse auxiliary\nIMUs and(or) gyroscopes alongside a base IMU. We model the multi-inertial\nmeasurements to include all the necessary inertial intrinsic and IMU-IMU\nspatiotemporal extrinsic parameters, while leveraging IMU-IMU rigid-body\nconstraints to eliminate the necessity of auxiliary inertial poses and thus\nreducing computational complexity. By performing observability analysis of\nMVIS, we prove that the standard four unobservable directions remain - no\nmatter how many inertial sensors are used, and also identify, for the first\ntime, degenerate motions for IMU-IMU spatiotemporal extrinsics and auxiliary\ninertial intrinsics. 
In addition to the extensive simulations that validate our\nanalysis and algorithms, we have built our own MVIS sensor rig and collected\nover 25 real-world datasets to experimentally verify the proposed calibration\nagainst the state-of-the-art calibration method such as Kalibr. We show that\nthe proposed MVIS calibration is able to achieve competing accuracy with\nimproved convergence and repeatability, which is open sourced to better benefit\nthe community.\n","authors":["Yulin Yang","Patrick Geneva","Guoquan Huang"],"pdf_url":"https://arxiv.org/pdf/2308.05303v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17455v3","updated":"2023-11-24T18:39:02Z","published":"2023-05-27T12:07:21Z","title":"CrossGET: Cross-Guided Ensemble of Tokens for Accelerating\n Vision-Language Transformers","summary":" Recent vision-language models have achieved tremendous progress far beyond\nwhat we ever expected. However, their computational costs are also dramatically\ngrowing with rapid development, especially for the large models. It makes model\nacceleration exceedingly critical in a scenario of limited resources. Although\nextensively studied for unimodal models, the acceleration for multimodal\nmodels, especially the vision-language Transformers, is relatively\nunder-explored. To pursue more efficient and accessible vision-language\nTransformers, this paper introduces \\textbf{Cross}-\\textbf{G}uided\n\\textbf{E}nsemble of \\textbf{T}okens (\\textbf{\\emph{CrossGET}}), a universal\nacceleration framework for vision-language Transformers. This framework\nadaptively combines tokens through real-time, cross-modal guidance, thereby\nachieving substantial acceleration while keeping high performance.\n\\textit{CrossGET} has two key innovations: 1) \\textit{Cross-Guided Matching and\nEnsemble}. \\textit{CrossGET} incorporates cross-modal guided token matching and\nensemble to exploit cross-modal information effectively, only introducing\ncross-modal tokens with negligible extra parameters. 2) \\textit{Complete-Graph\nSoft Matching}. In contrast to the existing bipartite soft matching approach,\n\\textit{CrossGET} introduces a complete-graph soft matching policy to achieve\nmore reliable token-matching results while maintaining parallelizability and\nhigh efficiency. Extensive experiments are conducted on various vision-language\ntasks, including image-text retrieval, visual reasoning, image captioning, and\nvisual question answering. Performance on both classic multimodal architectures\nand emerging multimodal LLMs demonstrate the effectiveness and versatility of\nthe proposed \\textit{CrossGET} framework. The code will be at\n\\url{https://github.com/sdc17/CrossGET}.\n","authors":["Dachuan Shi","Chaofan Tao","Anyi Rao","Zhendong Yang","Chun Yuan","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2305.17455v3.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2311.14642v1","updated":"2023-11-24T18:16:28Z","published":"2023-11-24T18:16:28Z","title":"Continuous football player tracking from discrete broadcast data","summary":" Player tracking data remains out of reach for many professional football\nteams as their video feeds are not sufficiently high quality for computer\nvision technologies to be used. To help bridge this gap, we present a method\nthat can estimate continuous full-pitch tracking data from discrete data made\nfrom broadcast footage. Such data could be collected by clubs or players at a\nsimilar cost to event data, which is widely available down to semi-professional\nlevel. 
We test our method using open-source tracking data, and include a\nversion that can be applied to a large set of over 200 games with such discrete\ndata.\n","authors":["Matthew J. Penn","Christl A. Donnelly","Samir Bhatt"],"pdf_url":"https://arxiv.org/pdf/2311.14642v1.pdf","comment":"12 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.14639v1","updated":"2023-11-24T18:12:06Z","published":"2023-11-24T18:12:06Z","title":"Unsupervised high-throughput segmentation of cells and cell nuclei in\n quantitative phase images","summary":" In the effort to aid cytologic diagnostics by establishing automatic single\ncell screening using high throughput digital holographic microscopy for\nclinical studies thousands of images and millions of cells are captured. The\nbottleneck lies in an automatic, fast, and unsupervised segmentation technique\nthat does not limit the types of cells which might occur. We propose an\nunsupervised multistage method that segments correctly without confusing noise\nor reflections with cells and without missing cells that also includes the\ndetection of relevant inner structures, especially the cell nucleus in the\nunstained cell. In an effort to make the information reasonable and\ninterpretable for cytopathologists, we also introduce new cytoplasmic and\nnuclear features of potential help for cytologic diagnoses which exploit the\nquantitative phase information inherent to the measurement scheme. We show that\nthe segmentation provides consistently good results over many experiments on\npatient samples in a reasonable per cell analysis time.\n","authors":["Julia Sistermanns","Ellen Emken","Gregor Weirich","Oliver Hayden","Wolfgang Utschick"],"pdf_url":"https://arxiv.org/pdf/2311.14639v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2311.14635v1","updated":"2023-11-24T18:08:42Z","published":"2023-11-24T18:08:42Z","title":"Automated Detection and Counting of Windows using UAV Imagery based\n Remote Sensing","summary":" Despite the technological advancements in the construction and surveying\nsector, the inspection of salient features like windows in an\nunder-construction or existing building is predominantly a manual process.\nMoreover, the number of windows present in a building is directly related to\nthe magnitude of deformation it suffers under earthquakes. In this research, a\nmethod to accurately detect and count the number of windows of a building by\ndeploying an Unmanned Aerial Vehicle (UAV) based remote sensing system is\nproposed. The proposed two-stage method automates the identification and\ncounting of windows by developing computer vision pipelines that utilize data\nfrom UAV's onboard camera and other sensors. Quantitative and Qualitative\nresults show the effectiveness of our proposed approach in accurately detecting\nand counting the windows compared to the existing method.\n","authors":["Dhruv Patel","Shivani Chepuri","Sarvesh Thakur","K. Harikumar","Ravi Kiran S.","K. Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2311.14635v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14633v1","updated":"2023-11-24T18:02:14Z","published":"2023-11-24T18:02:14Z","title":"One Strike, You're Out: Detecting Markush Structures in Low\n Signal-to-Noise Ratio Images","summary":" Modern research increasingly relies on automated methods to assist\nresearchers. 
An example of this is Optical Chemical Structure Recognition\n(OCSR), which aids chemists in retrieving information about chemicals from\nlarge amounts of documents. Markush structures are chemical structures that\ncannot be parsed correctly by OCSR and cause errors. The focus of this research\nwas to propose and test a novel method for classifying Markush structures.\nWithin this method, a comparison was made between fixed-feature extraction and\nend-to-end learning (CNN). The end-to-end method performed significantly better\nthan the fixed-feature method, achieving 0.928 (0.035 SD) Macro F1 compared to\nthe fixed-feature method's 0.701 (0.052 SD). Because of the nature of the\nexperiment, these figures are a lower bound and can be improved further. These\nresults suggest that Markush structures can be filtered out effectively and\naccurately using the proposed method. When implemented into OCSR pipelines,\nthis method can improve their performance and be of use to other researchers.\n","authors":["Thomas Jurriaans","Kinga Szarkowska","Eric Nalisnick","Markus Schwoerer","Camilo Thorne","Saber Akhondi"],"pdf_url":"https://arxiv.org/pdf/2311.14633v1.pdf","comment":"15 pages, 9 tables, 16 figures"},{"id":"http://arxiv.org/abs/2311.14631v1","updated":"2023-11-24T17:55:10Z","published":"2023-11-24T17:55:10Z","title":"CatVersion: Concatenating Embeddings for Diffusion-Based Text-to-Image\n Personalization","summary":" We propose CatVersion, an inversion-based method that learns the personalized\nconcept through a handful of examples. Subsequently, users can utilize text\nprompts to generate images that embody the personalized concept, thereby\nachieving text-to-image personalization. In contrast to existing approaches\nthat emphasize word embedding learning or parameter fine-tuning for the\ndiffusion model, which potentially causes concept dilution or overfitting, our\nmethod concatenates embeddings on the feature-dense space of the text encoder\nin the diffusion model to learn the gap between the personalized concept and\nits base class, aiming to maximize the preservation of prior knowledge in\ndiffusion models while restoring the personalized concepts. To this end, we\nfirst dissect the text encoder's integration in the image generation process to\nidentify the feature-dense space of the encoder. Afterward, we concatenate\nembeddings on the Keys and Values in this space to learn the gap between the\npersonalized concept and its base class. 
In this way, the concatenated\nembeddings ultimately manifest as a residual on the original attention output.\nTo more accurately and unbiasedly quantify the results of personalized image\ngeneration, we improve the CLIP image alignment score based on masks.\nQualitatively and quantitatively, CatVersion helps to restore personalization\nconcepts more faithfully and enables more robust editing.\n","authors":["Ruoyu Zhao","Mingrui Zhu","Shiyin Dong","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2311.14631v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2311.14625v1","updated":"2023-11-24T17:40:31Z","published":"2023-11-24T17:40:31Z","title":"ARIA: On the interaction between Architectures, Aggregation methods and\n Initializations in federated visual classification","summary":" Federated Learning (FL) is a collaborative training paradigm that allows for\nprivacy-preserving learning of cross-institutional models by eliminating the\nexchange of sensitive data and instead relying on the exchange of model\nparameters between the clients and a server. Despite individual studies on how\nclient models are aggregated, and, more recently, on the benefits of ImageNet\npre-training, there is a lack of understanding of the effect the architecture\nchosen for the federation has, and of how the aforementioned elements\ninterconnect. To this end, we conduct the first joint\nARchitecture-Initialization-Aggregation study and benchmark ARIAs across a\nrange of medical image classification tasks. We find that, contrary to current\npractices, ARIA elements have to be chosen together to achieve the best\npossible performance. Our results also shed light on good choices for each\nelement depending on the task, the effect of normalisation layers, and the\nutility of SSL pre-training, pointing to potential directions for designing\nFL-specific architectures and training pipelines.\n","authors":["Vasilis Siomos","Sergio Naval-Marimont","Jonathan Passerat-Palmbach","Giacomo Tarroni"],"pdf_url":"https://arxiv.org/pdf/2311.14625v1.pdf","comment":"Under review at the 21st IEEE International Symposium on Biomedical\n Imaging"},{"id":"http://arxiv.org/abs/2311.14617v1","updated":"2023-11-24T17:25:12Z","published":"2023-11-24T17:25:12Z","title":"Neural Style Transfer for Computer Games","summary":" Neural Style Transfer (NST) research has been applied to images, videos, 3D\nmeshes and radiance fields, but its application to 3D computer games remains\nrelatively unexplored. Whilst image and video NST systems can be used as a\npost-processing effect for a computer game, this results in undesired artefacts\nand diminished post-processing effects. Here, we present an approach for\ninjecting depth-aware NST as part of the 3D rendering pipeline. Qualitative and\nquantitative experiments are used to validate our in-game stylisation\nframework. We demonstrate temporally consistent results of artistically\nstylised game scenes, outperforming state-of-the-art image and video NST\nmethods.\n","authors":["Eleftherios Ioannou","Steve Maddock"],"pdf_url":"https://arxiv.org/pdf/2311.14617v1.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.14603v1","updated":"2023-11-24T16:47:05Z","published":"2023-11-24T16:47:05Z","title":"Animate124: Animating One Image to 4D Dynamic Scene","summary":" We introduce Animate124 (Animate-one-image-to-4D), the first work to animate\na single in-the-wild image into 3D video through textual motion descriptions,\nan underexplored problem with significant applications. 
Our 4D generation\nleverages an advanced 4D grid dynamic Neural Radiance Field (NeRF) model,\noptimized in three distinct stages using multiple diffusion priors. Initially,\na static model is optimized using the reference image, guided by 2D and 3D\ndiffusion priors, which serves as the initialization for the dynamic NeRF.\nSubsequently, a video diffusion model is employed to learn the motion specific\nto the subject. However, the object in the 3D videos tends to drift away from\nthe reference image over time. This drift is mainly due to the misalignment\nbetween the text prompt and the reference image in the video diffusion model.\nIn the final stage, a personalized diffusion prior is therefore utilized to\naddress the semantic drift. As the pioneering image-text-to-4D generation\nframework, our method demonstrates significant advancements over existing\nbaselines, evidenced by comprehensive quantitative and qualitative assessments.\n","authors":["Yuyang Zhao","Zhiwen Yan","Enze Xie","Lanqing Hong","Zhenguo Li","Gim Hee Lee"],"pdf_url":"https://arxiv.org/pdf/2311.14603v1.pdf","comment":"Project Page: https://animate124.github.io"},{"id":"http://arxiv.org/abs/2203.04838v5","updated":"2023-11-24T16:29:19Z","published":"2022-03-09T16:12:08Z","title":"CMX: Cross-Modal Fusion for RGB-X Semantic Segmentation with\n Transformers","summary":" Scene understanding based on image segmentation is a crucial component of\nautonomous vehicles. Pixel-wise semantic segmentation of RGB images can be\nadvanced by exploiting complementary features from the supplementary modality\n(X-modality). However, covering a wide variety of sensors with a\nmodality-agnostic model remains an unresolved problem due to variations in\nsensor characteristics among different modalities. Unlike previous\nmodality-specific methods, in this work, we propose a unified fusion framework,\nCMX, for RGB-X semantic segmentation. To generalize well across different\nmodalities, that often include supplements as well as uncertainties, a unified\ncross-modal interaction is crucial for modality fusion. Specifically, we design\na Cross-Modal Feature Rectification Module (CM-FRM) to calibrate bi-modal\nfeatures by leveraging the features from one modality to rectify the features\nof the other modality. With rectified feature pairs, we deploy a Feature Fusion\nModule (FFM) to perform sufficient exchange of long-range contexts before\nmixing. To verify CMX, for the first time, we unify five modalities\ncomplementary to RGB, i.e., depth, thermal, polarization, event, and LiDAR.\nExtensive experiments show that CMX generalizes well to diverse multi-modal\nfusion, achieving state-of-the-art performances on five RGB-Depth benchmarks,\nas well as RGB-Thermal, RGB-Polarization, and RGB-LiDAR datasets. Besides, to\ninvestigate the generalizability to dense-sparse data fusion, we establish an\nRGB-Event semantic segmentation benchmark based on the EventScape dataset, on\nwhich CMX sets the new state-of-the-art. The source code of CMX is publicly\navailable at https://github.com/huaaaliu/RGBX_Semantic_Segmentation.\n","authors":["Jiaming Zhang","Huayao Liu","Kailun Yang","Xinxin Hu","Ruiping Liu","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2203.04838v5.pdf","comment":"Accepted to IEEE Transactions on Intelligent Transportation Systems\n (T-ITS). 
The source code of CMX is publicly available at\n https://github.com/huaaaliu/RGBX_Semantic_Segmentation"},{"id":"http://arxiv.org/abs/2304.00933v2","updated":"2023-11-24T16:24:33Z","published":"2023-04-03T12:45:52Z","title":"Knowledge Accumulation in Continually Learned Representations and the\n Issue of Feature Forgetting","summary":" While it is established that neural networks suffer from catastrophic\nforgetting ``at the output level'', it is debated whether this is also the case\nat the level of representations. Some studies ascribe a certain level of innate\nrobustness to representations, that they only forget minimally and no critical\ninformation, while others claim that representations are also severely affected\nby forgetting. To settle this debate, we first discuss how this apparent\ndisagreement might stem from the coexistence of two phenomena that affect the\nquality of continually learned representations: knowledge accumulation and\nfeature forgetting. We then show that, even though it is true that feature\nforgetting can be small in absolute terms, newly learned information is\nforgotten just as catastrophically at the level of representations as it is at\nthe output level. Next we show that this feature forgetting is problematic as\nit substantially slows down knowledge accumulation. We further show that\nrepresentations that are continually learned through both supervised and\nself-supervised learning suffer from feature forgetting. Finally, we study how\nfeature forgetting and knowledge accumulation are affected by different types\nof continual learning methods.\n","authors":["Timm Hess","Eli Verwimp","Gido M. van de Ven","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2304.00933v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06607v2","updated":"2023-11-24T16:21:39Z","published":"2023-11-11T16:37:41Z","title":"Monkey: Image Resolution and Text Label Are Important Things for Large\n Multi-modal Models","summary":" Large Multimodal Models (LMMs) have shown promise in vision-language tasks\nbut struggle with high-resolution input and detailed scene understanding.\nAddressing these challenges, we introduce Monkey to enhance LMM capabilities.\nFirstly, Monkey processes input images by dividing them into uniform patches,\neach matching the size (e.g., 448x448) used in the original training of the\nwell-trained vision encoder. Equipped with individual adapter for each patch,\nMonkey can handle higher resolutions up to 1344x896 pixels, enabling the\ndetailed capture of complex visual information. Secondly, it employs a\nmulti-level description generation method, enriching the context for\nscene-object associations. This two-part strategy ensures more effective\nlearning from generated data: the higher resolution allows for a more detailed\ncapture of visuals, which in turn enhances the effectiveness of comprehensive\ndescriptions. Extensive ablative results validate the effectiveness of our\ndesigns. Additionally, experiments on 18 datasets further demonstrate that\nMonkey surpasses existing LMMs in many tasks like Image Captioning and various\nVisual Question Answering formats. Specially, in qualitative tests focused on\ndense text question answering, Monkey has exhibited encouraging results\ncompared with GPT4V. 
Code is available at\nhttps://github.com/Yuliang-Liu/Monkey.\n","authors":["Zhang Li","Biao Yang","Qiang Liu","Zhiyin Ma","Shuo Zhang","Jingxu Yang","Yabo Sun","Yuliang Liu","Xiang Bai"],"pdf_url":"https://arxiv.org/pdf/2311.06607v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14580v1","updated":"2023-11-24T16:12:05Z","published":"2023-11-24T16:12:05Z","title":"Large Language Models as Automated Aligners for benchmarking\n Vision-Language Models","summary":" With the advancements in Large Language Models (LLMs), Vision-Language Models\n(VLMs) have reached a new level of sophistication, showing notable competence\nin executing intricate cognition and reasoning tasks. However, existing\nevaluation benchmarks, primarily relying on rigid, hand-crafted datasets to\nmeasure task-specific performance, face significant limitations in assessing\nthe alignment of these increasingly anthropomorphic models with human\nintelligence. In this work, we address the limitations via Auto-Bench, which\ndelves into exploring LLMs as proficient aligners, measuring the alignment\nbetween VLMs and human intelligence and value through automatic data curation\nand assessment. Specifically, for data curation, Auto-Bench utilizes LLMs\n(e.g., GPT-4) to automatically generate a vast set of question-answer-reasoning\ntriplets via prompting on visual symbolic representations (e.g., captions,\nobject locations, instance relationships, and etc.). The curated data closely\nmatches human intent, owing to the extensive world knowledge embedded in LLMs.\nThrough this pipeline, a total of 28.5K human-verified and 3,504K unfiltered\nquestion-answer-reasoning triplets have been curated, covering 4 primary\nabilities and 16 sub-abilities. We subsequently engage LLMs like GPT-3.5 to\nserve as judges, implementing the quantitative and qualitative automated\nassessments to facilitate a comprehensive evaluation of VLMs. Our validation\nresults reveal that LLMs are proficient in both evaluation data curation and\nmodel assessment, achieving an average agreement rate of 85%. We envision\nAuto-Bench as a flexible, scalable, and comprehensive benchmark for evaluating\nthe evolving sophisticated VLMs.\n","authors":["Yuanfeng Ji","Chongjian Ge","Weikai Kong","Enze Xie","Zhengying Liu","Zhengguo Li","Ping Luo"],"pdf_url":"https://arxiv.org/pdf/2311.14580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06031v3","updated":"2023-11-24T15:37:06Z","published":"2023-11-10T12:38:16Z","title":"Diagonal Hierarchical Consistency Learning for Semi-supervised Medical\n Image Segmentation","summary":" Medical image segmentation, which is essential for many clinical\napplications, has achieved almost human-level performance via data-driven deep\nlearning technologies. Nevertheless, its performance is predicated upon the\ncostly process of manually annotating a vast amount of medical images. To this\nend, we propose a novel framework for robust semi-supervised medical image\nsegmentation using diagonal hierarchical consistency learning (DiHC-Net).\nFirst, it is composed of multiple sub-models with identical multi-scale\narchitecture but with distinct sub-layers, such as up-sampling and\nnormalisation layers. Second, with mutual consistency, a novel consistency\nregularisation is enforced between one model's intermediate and final\nprediction and soft pseudo labels from other models in a diagonal hierarchical\nfashion. 
A series of experiments verifies the efficacy of our simple framework,\noutperforming all previous approaches on public Left Atrium (LA) dataset.\n","authors":["Heejoon Koo"],"pdf_url":"https://arxiv.org/pdf/2311.06031v3.pdf","comment":"5 pages, 2 figures, and 2 tables"},{"id":"http://arxiv.org/abs/2311.14552v1","updated":"2023-11-24T15:35:07Z","published":"2023-11-24T15:35:07Z","title":"Griffon: Spelling out All Object Locations at Any Granularity with Large\n Language Models","summary":" Replicating the innate human ability to detect all objects based on free-form\ntexts at any granularity remains a formidable challenge for Vision-Language\nmodels. Current Large Vision Language Models (LVLMs) are predominantly\nconstrained to grounding a single, pre-existing object, relying solely on data\nfrom Referring Expression Comprehension tasks. The limitation leads to a\ncompromise in model design, necessitating the introduction of visual expert\nmodels or the integration of customized head structures. Beyond these\nconstraints, our research delves into the untapped potential of LVLMs and\nuncover their inherent capability for basic object perception, allowing them to\naccurately identify and locate objects of interest. Building on this insight,\nwe introduce a novel language-prompted localization dataset designed to fully\nunleash the capabilities of LVLMs in integrating fine-grained object perception\nwith precise location awareness. More importantly, we present\n$\\textbf{Griffon}$, a purely LVLM-based baseline, which does not require the\nintroduction of any special tokens, expert models, or additional detection\nmodules. It simply maintains a consistent structure with popular LVLMs by\nunifying data formats across various localization-related scenarios and is\ntrained end-to-end through a well-designed pipeline. Comprehensive experiments\ndemonstrate that $\\textbf{Griffon}$ not only achieves state-of-the-art\nperformance on the fine-grained RefCOCO series but also approaches the\ncapabilities of the expert model Faster RCNN on the detection benchmark MSCOCO.\n","authors":["Yufei Zhan","Yousong Zhu","Zhiyang Chen","Fan Yang","Ming Tang","Jinqiao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.14552v1.pdf","comment":"Technical report. The codes and dataset will be released soon"},{"id":"http://arxiv.org/abs/2310.09600v2","updated":"2023-11-24T15:29:08Z","published":"2023-10-14T15:20:33Z","title":"Hawkeye: A PyTorch-based Library for Fine-Grained Image Recognition with\n Deep Learning","summary":" Fine-Grained Image Recognition (FGIR) is a fundamental and challenging task\nin computer vision and multimedia that plays a crucial role in Intellectual\nEconomy and Industrial Internet applications. However, the absence of a unified\nopen-source software library covering various paradigms in FGIR poses a\nsignificant challenge for researchers and practitioners in the field. To\naddress this gap, we present Hawkeye, a PyTorch-based library for FGIR with\ndeep learning. Hawkeye is designed with a modular architecture, emphasizing\nhigh-quality code and human-readable configuration, providing a comprehensive\nsolution for FGIR tasks. In Hawkeye, we have implemented 16 state-of-the-art\nfine-grained methods, covering 6 different paradigms, enabling users to explore\nvarious approaches for FGIR. To the best of our knowledge, Hawkeye represents\nthe first open-source PyTorch-based library dedicated to FGIR. 
It is publicly\navailable at https://github.com/Hawkeye-FineGrained/Hawkeye/, providing\nresearchers and practitioners with a powerful tool to advance their research\nand development in the field of FGIR.\n","authors":["Jiabei He","Yang Shen","Xiu-Shen Wei","Ye Wu"],"pdf_url":"https://arxiv.org/pdf/2310.09600v2.pdf","comment":"ACM Multimedia 2023 Open Source Software Competition Winner Entry.\n X.-S. Wei is the corresponding author"},{"id":"http://arxiv.org/abs/2302.14460v3","updated":"2023-11-24T15:25:26Z","published":"2023-02-28T10:08:11Z","title":"Interpretable and intervenable ultrasonography-based machine learning\n models for pediatric appendicitis","summary":" Appendicitis is among the most frequent reasons for pediatric abdominal\nsurgeries. Previous decision support systems for appendicitis have focused on\nclinical, laboratory, scoring, and computed tomography data and have ignored\nabdominal ultrasound, despite its noninvasive nature and widespread\navailability. In this work, we present interpretable machine learning models\nfor predicting the diagnosis, management and severity of suspected appendicitis\nusing ultrasound images. Our approach utilizes concept bottleneck models (CBM)\nthat facilitate interpretation and interaction with high-level concepts\nunderstandable to clinicians. Furthermore, we extend CBMs to prediction\nproblems with multiple views and incomplete concept sets. Our models were\ntrained on a dataset comprising 579 pediatric patients with 1709 ultrasound\nimages accompanied by clinical and laboratory data. Results show that our\nproposed method enables clinicians to utilize a human-understandable and\nintervenable predictive model without compromising performance or requiring\ntime-consuming image annotation when deployed. For predicting the diagnosis,\nthe extended multiview CBM attained an AUROC of 0.80 and an AUPR of 0.92,\nperforming comparably to similar black-box neural networks trained and tested\non the same dataset.\n","authors":["Ričards Marcinkevičs","Patricia Reis Wolfertstetter","Ugne Klimiene","Kieran Chin-Cheong","Alyssia Paschke","Julia Zerres","Markus Denzinger","David Niederberger","Sven Wellmann","Ece Ozkan","Christian Knorr","Julia E. Vogt"],"pdf_url":"https://arxiv.org/pdf/2302.14460v3.pdf","comment":"Published in Medical Image Analysis (Elsevier)"},{"id":"http://arxiv.org/abs/2311.14544v1","updated":"2023-11-24T15:23:47Z","published":"2023-11-24T15:23:47Z","title":"Inferring Latent Class Statistics from Text for Robust Visual Few-Shot\n Learning","summary":" In the realm of few-shot learning, foundation models like CLIP have proven\neffective but exhibit limitations in cross-domain robustness especially in\nfew-shot settings. Recent works add text as an extra modality to enhance the\nperformance of these models. Most of these approaches treat text as an\nauxiliary modality without fully exploring its potential to elucidate the\nunderlying class visual features distribution. In this paper, we present a\nnovel approach that leverages text-derived statistics to predict the mean and\ncovariance of the visual feature distribution for each class. This predictive\nframework enriches the latent space, yielding more robust and generalizable\nfew-shot learning models. We demonstrate the efficacy of incorporating both\nmean and covariance statistics in improving few-shot classification performance\nacross various datasets. 
Our method shows that we can use text to predict the\nmean and covariance of the distribution offering promising improvements in\nfew-shot learning scenarios.\n","authors":["Yassir Bendou","Vincent Gripon","Bastien Pasdeloup","Giulia Lioi","Lukas Mauch","Fabien Cardinaux","Ghouthi Boukli Hacene"],"pdf_url":"https://arxiv.org/pdf/2311.14544v1.pdf","comment":"R0-FoMo: Workshop on Robustness of Few-shot and Zero-shot Learning in\n Foundation Models at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.14542v1","updated":"2023-11-24T15:20:01Z","published":"2023-11-24T15:20:01Z","title":"ToddlerDiffusion: Flash Interpretable Controllable Diffusion Model","summary":" Diffusion-based generative models excel in perceptually impressive synthesis\nbut face challenges in interpretability. This paper introduces\nToddlerDiffusion, an interpretable 2D diffusion image-synthesis framework\ninspired by the human generation system. Unlike traditional diffusion models\nwith opaque denoising steps, our approach decomposes the generation process\ninto simpler, interpretable stages; generating contours, a palette, and a\ndetailed colored image. This not only enhances overall performance but also\nenables robust editing and interaction capabilities. Each stage is meticulously\nformulated for efficiency and accuracy, surpassing Stable-Diffusion (LDM).\nExtensive experiments on datasets like LSUN-Churches and COCO validate our\napproach, consistently outperforming existing methods. ToddlerDiffusion\nachieves notable efficiency, matching LDM performance on LSUN-Churches while\noperating three times faster with a 3.76 times smaller architecture. Our source\ncode is provided in the supplementary material and will be publicly accessible.\n","authors":["Eslam Mohamed Bakr","Liangbing Zhao","Vincent Tao Hu","Matthieu Cord","Patrick Perez","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2311.14542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14521v1","updated":"2023-11-24T14:46:59Z","published":"2023-11-24T14:46:59Z","title":"GaussianEditor: Swift and Controllable 3D Editing with Gaussian\n Splatting","summary":" 3D editing plays a crucial role in many areas such as gaming and virtual\nreality. Traditional 3D editing methods, which rely on representations like\nmeshes and point clouds, often fall short in realistically depicting complex\nscenes. On the other hand, methods based on implicit 3D representations, like\nNeural Radiance Field (NeRF), render complex scenes effectively but suffer from\nslow processing speeds and limited control over specific scene areas. In\nresponse to these challenges, our paper presents GaussianEditor, an innovative\nand efficient 3D editing algorithm based on Gaussian Splatting (GS), a novel 3D\nrepresentation. GaussianEditor enhances precision and control in editing\nthrough our proposed Gaussian semantic tracing, which traces the editing target\nthroughout the training process. Additionally, we propose Hierarchical Gaussian\nsplatting (HGS) to achieve stabilized and fine results under stochastic\ngenerative guidance from 2D diffusion models. We also develop editing\nstrategies for efficient object removal and integration, a challenging task for\nexisting methods. Our comprehensive experiments demonstrate GaussianEditor's\nsuperior control, efficacy, and rapid performance, marking a significant\nadvancement in 3D editing. 
Project Page:\nhttps://buaacyw.github.io/gaussian-editor/\n","authors":["Yiwen Chen","Zilong Chen","Chi Zhang","Feng Wang","Xiaofeng Yang","Yikai Wang","Zhongang Cai","Lei Yang","Huaping Liu","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2311.14521v1.pdf","comment":"Project Page: https://buaacyw.github.io/gaussian-editor/"},{"id":"http://arxiv.org/abs/2311.14506v1","updated":"2023-11-24T14:26:07Z","published":"2023-11-24T14:26:07Z","title":"Multi-Class Anomaly Detection based on Regularized Discriminative\n Coupled hypersphere-based Feature Adaptation","summary":" In anomaly detection, identification of anomalies across diverse product\ncategories is a complex task. This paper introduces a new model by including\nclass discriminative properties obtained by a modified Regularized\nDiscriminative Variational Auto-Encoder (RD-VAE) in the feature extraction\nprocess of Coupled-hypersphere-based Feature Adaptation (CFA). By doing so, the\nproposed Regularized Discriminative Coupled-hypersphere-based Feature\nAdaptation (RD-CFA), forms a solution for multi-class anomaly detection. By\nusing the discriminative power of RD-VAE to capture intricate class\ndistributions, combined with CFA's robust anomaly detection capability, the\nproposed method excels in discerning anomalies across various classes.\nExtensive evaluations on multi-class anomaly detection and localization using\nthe MVTec AD and BeanTech AD datasets showcase the effectiveness of RD-CFA\ncompared to eight leading contemporary methods.\n","authors":["Mehdi Rafiei","Alexandros Iosifidis"],"pdf_url":"https://arxiv.org/pdf/2311.14506v1.pdf","comment":"14 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2311.14494v1","updated":"2023-11-24T14:07:53Z","published":"2023-11-24T14:07:53Z","title":"MVControl: Adding Conditional Control to Multi-view Diffusion for\n Controllable Text-to-3D Generation","summary":" We introduce MVControl, a novel neural network architecture that enhances\nexisting pre-trained multi-view 2D diffusion models by incorporating additional\ninput conditions, e.g. edge maps. Our approach enables the generation of\ncontrollable multi-view images and view-consistent 3D content. To achieve\ncontrollable multi-view image generation, we leverage MVDream as our base\nmodel, and train a new neural network module as additional plugin for\nend-to-end task-specific condition learning. To precisely control the shapes\nand views of generated images, we innovatively propose a new conditioning\nmechanism that predicts an embedding encapsulating the input spatial and view\nconditions, which is then injected to the network globally. Once MVControl is\ntrained, score-distillation (SDS) loss based optimization can be performed to\ngenerate 3D content, in which process we propose to use a hybrid diffusion\nprior. The hybrid prior relies on a pre-trained Stable-Diffusion network and\nour trained MVControl for additional guidance. Extensive experiments\ndemonstrate that our method achieves robust generalization and enables the\ncontrollable generation of high-quality 3D content.\n","authors":["Zhiqi Li","Yiming Chen","Lingzhe Zhao","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2311.14494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14485v1","updated":"2023-11-24T13:48:37Z","published":"2023-11-24T13:48:37Z","title":"Towards Interpretable Classification of Leukocytes based on Deep\n Learning","summary":" Label-free approaches are attractive in cytological imaging due to their\nflexibility and cost efficiency. 
They are supported by machine learning\nmethods, which, despite the lack of labeling and the associated lower contrast,\ncan classify cells with high accuracy where the human observer has little\nchance to discriminate cells. In order to better integrate these workflows into\nthe clinical decision making process, this work investigates the calibration of\nconfidence estimation for the automated classification of leukocytes. In\naddition, different visual explanation approaches are compared, which should\nbring machine decision making closer to professional healthcare applications.\nFurthermore, we were able to identify general detection patterns in neural\nnetworks and demonstrate the utility of the presented approaches in different\nscenarios of blood cell analysis.\n","authors":["Stefan Röhrl","Johannes Groll","Manuel Lengl","Simon Schumann","Christian Klenk","Dominik Heim","Martin Knopp","Oliver Hayden","Klaus Diepold"],"pdf_url":"https://arxiv.org/pdf/2311.14485v1.pdf","comment":"Presented at the 3rd Workshop on Interpretable Machine Learning in\n Healthcare (IMLH) @ ICML 2023"},{"id":"http://arxiv.org/abs/2311.14482v1","updated":"2023-11-24T13:45:58Z","published":"2023-11-24T13:45:58Z","title":"Sliding Window FastEdit: A Framework for Lesion Annotation in Whole-body\n PET Images","summary":" Deep learning has revolutionized the accurate segmentation of diseases in\nmedical imaging. However, achieving such results requires training with\nnumerous manual voxel annotations. This requirement presents a challenge for\nwhole-body Positron Emission Tomography (PET) imaging, where lesions are\nscattered throughout the body. To tackle this problem, we introduce SW-FastEdit\n- an interactive segmentation framework that accelerates the labeling by\nutilizing only a few user clicks instead of voxelwise annotations. While prior\ninteractive models crop or resize PET volumes due to memory constraints, we use\nthe complete volume with our sliding window-based interactive scheme. Our model\noutperforms existing non-sliding window interactive models on the AutoPET\ndataset and generalizes to the previously unseen HECKTOR dataset. A user study\nrevealed that annotators achieve high-quality predictions with only 10 click\niterations and a low perceived NASA-TLX workload. Our framework is implemented\nusing MONAI Label and is available:\nhttps://github.com/matt3o/AutoPET2-Submission/\n","authors":["Matthias Hadlich","Zdravko Marinov","Moon Kim","Enrico Nasca","Jens Kleesiek","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2311.14482v1.pdf","comment":"5 pages, 2 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.14473v1","updated":"2023-11-24T13:26:53Z","published":"2023-11-24T13:26:53Z","title":"Joint Diffusion: Mutual Consistency-Driven Diffusion Model for PET-MRI\n Co-Reconstruction","summary":" Positron Emission Tomography and Magnetic Resonance Imaging (PET-MRI) systems\ncan obtain functional and anatomical scans. PET suffers from a low\nsignal-to-noise ratio. Meanwhile, the k-space data acquisition process in MRI\nis time-consuming. The study aims to accelerate MRI and enhance PET image\nquality. Conventional approaches involve the separate reconstruction of each\nmodality within PET-MRI systems. However, there exists complementary\ninformation among multi-modal images. The complementary information can\ncontribute to image reconstruction. In this study, we propose a novel PET-MRI\njoint reconstruction model employing a mutual consistency-driven diffusion\nmode, namely MC-Diffusion. 
MC-Diffusion learns the joint probability\ndistribution of PET and MRI for utilizing complementary information. We\nconducted a series of comparison experiments with LPLS, Joint ISAT-net, and\nMC-Diffusion on the ADNI dataset. The results underscore the qualitative and\nquantitative improvements achieved by MC-Diffusion, surpassing the\nstate-of-the-art method.\n","authors":["Taofeng Xie","Zhuo-Xu Cui","Chen Luo","Huayu Wang","Congcong Liu","Yuanzhi Zhang","Xuemei Wang","Yanjie Zhu","Qiyu Jin","Guoqing Chen","Yihang Zhou","Dong Liang","Haifeng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.14473v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14471v1","updated":"2023-11-24T13:25:29Z","published":"2023-11-24T13:25:29Z","title":"MRxaI: Black-Box Explainability for Image Classifiers in a Medical\n Setting","summary":" Existing tools for explaining the output of image classifiers can be divided\ninto white-box, which rely on access to the model internals, and black-box,\nagnostic to the model. As the usage of AI in the medical domain grows, so too\ndoes the usage of explainability tools. Existing work on medical image\nexplanations focuses on white-box tools, such as gradcam. However, there are\nclear advantages to switching to a black-box tool, including the ability to use\nit with any classifier and the wide selection of black-box tools available. On\nstandard images, black-box tools are as precise as white-box ones. In this\npaper, we compare the performance of several black-box methods against gradcam\non a brain cancer MRI dataset. We demonstrate that most black-box tools are not\nsuitable for explaining medical image classifications and present a detailed\nanalysis of the reasons for their shortcomings. We also show that one black-box\ntool, ReX, which is based on causal explainability, performs as well as\ngradcam.\n","authors":["Nathan Blake","Hana Chockler","David A. Kelly","Santiago Calderon Pena","Akchunya Chanchal"],"pdf_url":"https://arxiv.org/pdf/2311.14471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14462v1","updated":"2023-11-24T13:14:10Z","published":"2023-11-24T13:14:10Z","title":"CT-xCOV: a CT-scan based Explainable Framework for COVid-19 diagnosis","summary":" In this work, CT-xCOV, an explainable framework for COVID-19 diagnosis using\nDeep Learning (DL) on CT-scans, is developed. CT-xCOV adopts an end-to-end\napproach from lung segmentation to COVID-19 detection and explanations of the\ndetection model's prediction. For lung segmentation, we used the well-known\nU-Net model. For COVID-19 detection, we compared three different CNN\narchitectures: a standard CNN, ResNet50, and DenseNet121. After the detection,\nvisual and textual explanations are provided. For visual explanations, we\napplied three different XAI techniques, namely, Grad-Cam, Integrated Gradient\n(IG), and LIME. Textual explanations are added by computing the percentage of\nlung infection. To assess the performance of the used XAI techniques, we\npropose a ground-truth-based evaluation method, measuring the similarity\nbetween the visualization outputs and the ground-truth infections. The\nperformed experiments show that the applied DL models achieved good results.\nThe U-Net segmentation model achieved a high Dice coefficient (98%). The\nperformance of our proposed classification model (standard CNN) was validated\nusing 5-fold cross-validation (acc of 98.40% and f1-score 98.23%). 
Lastly, the\nresults of the comparison of XAI techniques show that Grad-Cam gives the best\nexplanations compared to LIME and IG, by achieving a Dice coefficient of 55%,\non COVID-19 positive scans, compared to 29% and 24% obtained by IG and LIME\nrespectively. The code and the dataset used in this paper are available in the\nGitHub repository [1].\n","authors":["Ismail Elbouknify","Afaf Bouhoute","Khalid Fardousse","Ismail Berrada","Abdelmajid Badri"],"pdf_url":"https://arxiv.org/pdf/2311.14462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14459v1","updated":"2023-11-24T13:11:36Z","published":"2023-11-24T13:11:36Z","title":"IDD-AW: A Benchmark for Safe and Robust Segmentation of Drive Scenes in\n Unstructured Traffic and Adverse Weather","summary":" Large-scale deployment of fully autonomous vehicles requires a very high\ndegree of robustness to unstructured traffic, and weather conditions, and\nshould prevent unsafe mispredictions. While there are several datasets and\nbenchmarks focusing on segmentation for drive scenes, they are not specifically\nfocused on safety and robustness issues. We introduce the IDD-AW dataset, which\nprovides 5000 pairs of high-quality images with pixel-level annotations,\ncaptured under rain, fog, low light, and snow in unstructured driving\nconditions. As compared to other adverse weather datasets, we provide i.) more\nannotated images, ii.) paired Near-Infrared (NIR) image for each frame, iii.)\nlarger label set with a 4-level label hierarchy to capture unstructured traffic\nconditions. We benchmark state-of-the-art models for semantic segmentation in\nIDD-AW. We also propose a new metric called ''Safe mean Intersection over Union\n(Safe mIoU)'' for hierarchical datasets which penalizes dangerous\nmispredictions that are not captured in the traditional definition of mean\nIntersection over Union (mIoU). The results show that IDD-AW is one of the most\nchallenging datasets to date for these tasks. The dataset and code will be\navailable here: http://iddaw.github.io.\n","authors":["Furqan Ahmed Shaik","Abhishek Malreddy","Nikhil Reddy Billa","Kunal Chaudhary","Sunny Manchanda","Girish Varma"],"pdf_url":"https://arxiv.org/pdf/2311.14459v1.pdf","comment":"8 pages excluding references. Accepted in WACV 2024"},{"id":"http://arxiv.org/abs/2310.19653v2","updated":"2023-11-24T13:02:55Z","published":"2023-10-30T15:38:39Z","title":"Upgrading VAE Training With Unlimited Data Plans Provided by Diffusion\n Models","summary":" Variational autoencoders (VAEs) are popular models for representation\nlearning but their encoders are susceptible to overfitting (Cremer et al.,\n2018) because they are trained on a finite training set instead of the true\n(continuous) data distribution $p_{\\mathrm{data}}(\\mathbf{x})$. Diffusion\nmodels, on the other hand, avoid this issue by keeping the encoder fixed. This\nmakes their representations less interpretable, but it simplifies training,\nenabling accurate and continuous approximations of\n$p_{\\mathrm{data}}(\\mathbf{x})$. In this paper, we show that overfitting\nencoders in VAEs can be effectively mitigated by training on samples from a\npre-trained diffusion model. These results are somewhat unexpected as recent\nfindings (Alemohammad et al., 2023; Shumailov et al., 2023) observe a decay in\ngenerative performance when models are trained on data generated by another\ngenerative model. 
We analyze generalization performance, amortization gap, and\nrobustness of VAEs trained with our proposed method on three different data\nsets. We find improvements in all metrics compared to both normal training and\nconventional data augmentation methods, and we show that a modest amount of\nsamples from the diffusion model suffices to obtain these gains.\n","authors":["Tim Z. Xiao","Johannes Zenn","Robert Bamler"],"pdf_url":"https://arxiv.org/pdf/2310.19653v2.pdf","comment":"9 pages + appendix"},{"id":"http://arxiv.org/abs/2211.04927v2","updated":"2023-11-24T12:59:12Z","published":"2022-11-09T14:57:27Z","title":"DeepDC: Deep Distance Correlation as a Perceptual Image Quality\n Evaluator","summary":" ImageNet pre-trained deep neural networks (DNNs) show notable transferability\nfor building effective image quality assessment (IQA) models. Such a remarkable\nbyproduct has often been identified as an emergent property in previous\nstudies. In this work, we attribute such capability to the intrinsic\ntexture-sensitive characteristic that classifies images using texture features.\nWe fully exploit this characteristic to develop a novel full-reference IQA\n(FR-IQA) model based exclusively on pre-trained DNN features. Specifically, we\ncompute the distance correlation, a highly promising yet relatively\nunder-investigated statistic, between reference and distorted images in the\ndeep feature domain. In addition, the distance correlation quantifies both\nlinear and nonlinear feature relationships, which is far beyond the widely used\nfirst-order and second-order statistics in the feature space. We conduct\ncomprehensive experiments to demonstrate the superiority of the proposed\nquality model on five standard IQA datasets, one perceptual similarity dataset,\ntwo texture similarity datasets, and one geometric transformation dataset.\nMoreover, we optimize the proposed model to generate a broad spectrum of\ntexture patterns, by treating the model as the style loss function for neural\nstyle transfer (NST). Extensive experiments demonstrate that the proposed\ntexture synthesis and NST methods achieve the best quantitative and qualitative\nresults. We release our code at https://github.com/h4nwei/DeepDC.\n","authors":["Hanwei Zhu","Baoliang Chen","Lingyu Zhu","Shiqi Wang","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2211.04927v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14450v1","updated":"2023-11-24T12:57:34Z","published":"2023-11-24T12:57:34Z","title":"Segment (Almost) Nothing: Prompt-Agnostic Adversarial Attacks on\n Segmentation Models","summary":" General purpose segmentation models are able to generate (semantic)\nsegmentation masks from a variety of prompts, including visual (points, boxed,\netc.) and textual (object names) ones. In particular, input images are\npre-processed by an image encoder to obtain embedding vectors which are later\nused for mask predictions. Existing adversarial attacks target the end-to-end\ntasks, i.e. aim at altering the segmentation mask predicted for a specific\nimage-prompt pair. However, this requires running an individual attack for each\nnew prompt for the same image. We propose instead to generate prompt-agnostic\nadversarial attacks by maximizing the $\\ell_2$-distance, in the latent space,\nbetween the embedding of the original and perturbed images. Since the encoding\nprocess only depends on the image, distorted image representations will cause\nperturbations in the segmentation masks for a variety of prompts. 
We show that\neven imperceptible $\\ell_\\infty$-bounded perturbations of radius\n$\\epsilon=1/255$ are often sufficient to drastically modify the masks predicted\nwith point, box and text prompts by recently proposed foundation models for\nsegmentation. Moreover, we explore the possibility of creating universal, i.e.\nnon image-specific, attacks which can be readily applied to any input without\nfurther computational cost.\n","authors":["Francesco Croce","Matthias Hein"],"pdf_url":"https://arxiv.org/pdf/2311.14450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14435v1","updated":"2023-11-24T12:22:00Z","published":"2023-11-24T12:22:00Z","title":"GCPV: Guided Concept Projection Vectors for the Explainable Inspection\n of CNN Feature Spaces","summary":" For debugging and verification of computer vision convolutional deep neural\nnetworks (CNNs) human inspection of the learned latent representations is\nimperative. Therefore, state-of-the-art eXplainable Artificial Intelligence\n(XAI) methods globally associate given natural language semantic concepts with\nrepresenting vectors or regions in the CNN latent space supporting manual\ninspection. Yet, this approach comes with two major disadvantages: They are\nlocally inaccurate when reconstructing a concept label and discard information\nabout the distribution of concept instance representations. The latter, though,\nis of particular interest for debugging, like finding and understanding\noutliers, learned notions of sub-concepts, and concept confusion. Furthermore,\ncurrent single-layer approaches neglect that information about a concept may be\nspread over the CNN depth. To overcome these shortcomings, we introduce the\nlocal-to-global Guided Concept Projection Vectors (GCPV) approach: It (1)\ngenerates local concept vectors that each precisely reconstruct a concept\nsegmentation label, and then (2) generalizes these to global concept and even\nsub-concept vectors by means of hiearchical clustering. Our experiments on\nobject detectors demonstrate improved performance compared to the\nstate-of-the-art, the benefit of multi-layer concept vectors, and robustness\nagainst low-quality concept segmentation labels. Finally, we demonstrate that\nGCPVs can be applied to find root causes for confusion of concepts like bus and\ntruck, and reveal interesting concept-level outliers. Thus, GCPVs pose a\npromising step towards interpretable model debugging and informed data\nimprovement.\n","authors":["Georgii Mikriukov","Gesina Schwalbe","Christian Hellert","Korinna Bade"],"pdf_url":"https://arxiv.org/pdf/2311.14435v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09178v3","updated":"2023-11-24T11:47:25Z","published":"2023-11-15T18:15:30Z","title":"RBPGAN: Recurrent Back-Projection GAN for Video Super Resolution","summary":" Recently, video super resolution (VSR) has become a very impactful task in\nthe area of Computer Vision due to its various applications. In this paper, we\npropose Recurrent Back-Projection Generative Adversarial Network (RBPGAN) for\nVSR in an attempt to generate temporally coherent solutions while preserving\nspatial details. RBPGAN integrates two state-of-the-art models to get the best\nin both worlds without compromising the accuracy of produced video. The\ngenerator of the model is inspired by RBPN system, while the discriminator is\ninspired by TecoGAN. We also utilize Ping-Pong loss to increase temporal\nconsistency over time. 
Our contribution together results in a model that\noutperforms earlier work in terms of temporally consistent details, as we will\ndemonstrate qualitatively and quantitatively using different datasets.\n","authors":["Marwah Sulaiman","Zahraa Shehabeldin","Israa Fahmy","Mohammed Barakat","Mohammed El-Naggar","Dareen Hussein","Moustafa Youssef","Hesham Eraqi"],"pdf_url":"https://arxiv.org/pdf/2311.09178v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08745v2","updated":"2023-11-24T11:18:10Z","published":"2023-09-15T20:16:17Z","title":"Improved Breast Cancer Diagnosis through Transfer Learning on\n Hematoxylin and Eosin Stained Histology Images","summary":" Breast cancer is one of the leading causes of death for women worldwide.\nEarly screening is essential for early identification, but the chance of\nsurvival declines as the cancer progresses into advanced stages. For this\nstudy, the most recent BRACS dataset of histological (H\\&E) stained images was\nused to classify breast cancer tumours, which contains both the whole-slide\nimages (WSI) and region-of-interest (ROI) images, however, for our study we\nhave considered ROI images. We have experimented using different pre-trained\ndeep learning models, such as Xception, EfficientNet, ResNet50, and\nInceptionResNet, pre-trained on the ImageNet weights. We pre-processed the\nBRACS ROI along with image augmentation, upsampling, and dataset split\nstrategies. For the default dataset split, the best results were obtained by\nResNet50 achieving 66% f1-score. For the custom dataset split, the best results\nwere obtained by performing upsampling and image augmentation which results in\n96.2% f1-score. Our second approach also reduced the number of false positive\nand false negative classifications to less than 3% for each class. We believe\nthat our study significantly impacts the early diagnosis and identification of\nbreast cancer tumors and their subtypes, especially atypical and malignant\ntumors, thus improving patient outcomes and reducing patient mortality rates.\nOverall, this study has primarily focused on identifying seven (7) breast\ncancer tumor subtypes, and we believe that the experimental models can be\nfine-tuned further to generalize over previous breast cancer histology datasets\nas well.\n","authors":["Fahad Ahmed","Reem Abdel-Salam","Leon Hamnett","Mary Adewunmi","Temitope Ayano"],"pdf_url":"https://arxiv.org/pdf/2309.08745v2.pdf","comment":"12 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2311.14414v1","updated":"2023-11-24T11:14:39Z","published":"2023-11-24T11:14:39Z","title":"Deformable multi-modal image registration for the correlation between\n optical measurements and histology images","summary":" The correlation of optical measurements with a correct pathology label is\noften hampered by imprecise registration caused by deformations in histology\nimages. This study explores an automated multi-modal image registration\ntechnique utilizing deep learning principles to align snapshot breast specimen\nimages with corresponding histology images. The input images, acquired through\ndifferent modalities, present challenges due to variations in intensities and\nstructural visibility, making linear assumptions inappropriate. 
Unsupervised\nand supervised learning approaches, based on the VoxelMorph model, were\nexplored, making use of a dataset with manually registered images as ground\ntruth. Evaluation metrics, including Dice scores and mutual information, reveal\nthat the unsupervised model significantly outperforms the supervised model (and\nthe manual approach), achieving superior image alignment. This automated\nregistration approach holds promise for improving the validation of optical\ntechnologies by minimizing human errors and inconsistencies associated with\nmanual registration.\n","authors":["Lianne Feenstra","Maud Lambregts","Theo J. M Ruers","Behdad Dashtbozorg"],"pdf_url":"https://arxiv.org/pdf/2311.14414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14405v1","updated":"2023-11-24T10:56:27Z","published":"2023-11-24T10:56:27Z","title":"OneFormer3D: One Transformer for Unified Point Cloud Segmentation","summary":" Semantic, instance, and panoptic segmentation of 3D point clouds have been\naddressed using task-specific models of distinct design. Thereby, the\nsimilarity of all segmentation tasks and the implicit relationship between them\nhave not been utilized effectively. This paper presents a unified, simple, and\neffective model addressing all these tasks jointly. The model, named\nOneFormer3D, performs instance and semantic segmentation consistently, using a\ngroup of learnable kernels, where each kernel is responsible for generating a\nmask for either an instance or a semantic category. These kernels are trained\nwith a transformer-based decoder with unified instance and semantic queries\npassed as an input. Such a design enables training a model end-to-end in a\nsingle run, so that it achieves top performance on all three segmentation tasks\nsimultaneously. Specifically, our OneFormer3D ranks 1st and sets a new\nstate-of-the-art (+2.1 mAP50) in the ScanNet test leaderboard. We also\ndemonstrate the state-of-the-art results in semantic, instance, and panoptic\nsegmentation of ScanNet (+21 PQ), ScanNet200 (+3.8 mAP50), and S3DIS (+0.8\nmIoU) datasets.\n","authors":["Maxim Kolodiazhnyi","Anna Vorontsova","Anton Konushin","Danila Rukhovich"],"pdf_url":"https://arxiv.org/pdf/2311.14405v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14395v1","updated":"2023-11-24T10:23:57Z","published":"2023-11-24T10:23:57Z","title":"Multi-scale Semantic Correlation Mining for Visible-Infrared Person\n Re-Identification","summary":" The main challenge in the Visible-Infrared Person Re-Identification (VI-ReID)\ntask lies in how to extract discriminative features from different modalities\nfor matching purposes. While existing works primarily focus on minimizing the\nmodal discrepancies, the modality information cannot be thoroughly leveraged.\nTo solve this problem, a Multi-scale Semantic Correlation Mining network\n(MSCMNet) is proposed to comprehensively exploit semantic features at multiple\nscales and simultaneously keep modality information loss as small as possible\nduring feature extraction. The proposed network contains three novel\ncomponents. 
Firstly, after taking into account the effective utilization of\nmodality information, the Multi-scale Information Correlation Mining Block\n(MIMB) is designed to explore semantic correlations across multiple scales.\nSecondly, in order to enrich the semantic information that MIMB can utilize, a\nquadruple-stream feature extractor (QFE) with non-shared parameters is\nspecifically designed to extract information from different dimensions of the\ndataset. Finally, the Quadruple Center Triplet Loss (QCT) is further proposed\nto address the information discrepancy in the comprehensive features. Extensive\nexperiments on the SYSU-MM01, RegDB, and LLCM datasets demonstrate that the\nproposed MSCMNet achieves the highest accuracy.\n","authors":["Ke Cheng","Xuecheng Hua","Hu Lu","Juanjuan Tu","Yuanquan Wang","Shitong Wang"],"pdf_url":"https://arxiv.org/pdf/2311.14395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14388v1","updated":"2023-11-24T10:07:14Z","published":"2023-11-24T10:07:14Z","title":"A Parameterized Generative Adversarial Network Using Cyclic Projection\n for Explainable Medical Image Classification","summary":" Although current data augmentation methods succeed in alleviating data\ninsufficiency, conventional augmentation is primarily intra-domain, while the\nimages generated by advanced generative adversarial networks (GANs) remain\nuncertain, particularly on small-scale datasets. In this paper, we propose a\nparameterized GAN (ParaGAN) that effectively controls the changes of synthetic\nsamples among domains and highlights the attention regions for downstream\nclassification. Specifically, ParaGAN incorporates projection distance\nparameters in cyclic projection and projects the source images to the decision\nboundary to obtain the class-difference maps. Our experiments show that ParaGAN\ncan consistently outperform the existing augmentation methods with explainable\nclassification on two small-scale medical datasets.\n","authors":["Xiangyu Xiong","Yue Sun","Xiaohong Liu","ChanTong Lam","Tong Tong","Hao Chen","Qinquan Gao","Wei Ke","Tao Tan"],"pdf_url":"https://arxiv.org/pdf/2311.14388v1.pdf","comment":"5 pages, 4 figures. This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2310.17294v2","updated":"2023-11-24T09:47:06Z","published":"2023-10-26T10:18:51Z","title":"Scale-Adaptive Feature Aggregation for Efficient Space-Time Video\n Super-Resolution","summary":" The Space-Time Video Super-Resolution (STVSR) task aims to enhance the visual\nquality of videos by simultaneously performing video frame interpolation (VFI)\nand video super-resolution (VSR). However, facing the challenge of the\nadditional temporal dimension and scale inconsistency, most existing STVSR\nmethods are complex and inflexible in dynamically modeling different motion\namplitudes. In this work, we find that choosing an appropriate processing scale\nachieves remarkable benefits in flow-based feature propagation. We propose a\nnovel Scale-Adaptive Feature Aggregation (SAFA) network that adaptively selects\nsub-networks with different processing scales for individual samples.\nExperiments on four public STVSR benchmarks demonstrate that SAFA achieves\nstate-of-the-art performance. 
Our SAFA network outperforms recent\nstate-of-the-art methods such as TMNet and VideoINR by an average improvement\nof over 0.5dB on PSNR, while requiring less than half the number of parameters\nand only 1/3 computational costs.\n","authors":["Zhewei Huang","Ailin Huang","Xiaotao Hu","Chen Hu","Jun Xu","Shuchang Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.17294v2.pdf","comment":"WACV2024, 16 pages"},{"id":"http://arxiv.org/abs/2207.01072v2","updated":"2023-11-24T09:42:45Z","published":"2022-07-03T16:06:04Z","title":"Dynamic Sub-Cluster-Aware Network for Few-Shot Skin Disease\n Classification","summary":" This paper addresses the problem of few-shot skin disease classification by\nintroducing a novel approach called the Sub-Cluster-Aware Network (SCAN) that\nenhances accuracy in diagnosing rare skin diseases. The key insight motivating\nthe design of SCAN is the observation that skin disease images within a class\noften exhibit multiple sub-clusters, characterized by distinct variations in\nappearance. To improve the performance of few-shot learning, we focus on\nlearning a high-quality feature encoder that captures the unique sub-clustered\nrepresentations within each disease class, enabling better characterization of\nfeature distributions. Specifically, SCAN follows a dual-branch framework,\nwhere the first branch learns class-wise features to distinguish different skin\ndiseases, and the second branch aims to learn features which can effectively\npartition each class into several groups so as to preserve the sub-clustered\nstructure within each class. To achieve the objective of the second branch, we\npresent a cluster loss to learn image similarities via unsupervised clustering.\nTo ensure that the samples in each sub-cluster are from the same class, we\nfurther design a purity loss to refine the unsupervised clustering results. We\nevaluate the proposed approach on two public datasets for few-shot skin disease\nclassification. The experimental results validate that our framework\noutperforms the state-of-the-art methods by around 2% to 5% in terms of\nsensitivity, specificity, accuracy, and F1-score on the SD-198 and Derm7pt\ndatasets.\n","authors":["Shuhan LI","Xiaomeng Li","Xiaowei Xu","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2207.01072v2.pdf","comment":"Accepted by TNNLS 2023"},{"id":"http://arxiv.org/abs/2311.13110v2","updated":"2023-11-24T09:18:44Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. 
This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v2.pdf","comment":"This paper integrates the works arXiv:2306.01129 and arXiv:2308.16271\n into a complete story. In this paper, we improve the writing and\n organization, and also add conceptual, empirical, and theoretical\n improvements over the previous work. V2: small typo fixes and formatting\n improvements"},{"id":"http://arxiv.org/abs/2311.12401v2","updated":"2023-11-24T08:51:13Z","published":"2023-11-21T07:28:51Z","title":"CASR: Refining Action Segmentation via Marginalizing Frame-level Causal\n Relationships","summary":" Integrating deep learning and causal discovery has increased the\ninterpretability of Temporal Action Segmentation (TAS) tasks. However,\nframe-level causal relationships contain many complicated noises beyond the\nsegment level, making it infeasible to directly express macro action semantics.\nThus, we propose Causal Abstraction Segmentation Refiner (CASR), which can\nrefine TAS results from various models by enhancing video causality through\nmarginalizing frame-level causal relationships. Specifically, we define the\nequivalent frame-level causal model and segment-level causal model, so that the\ncausal adjacency matrix constructed from marginalized frame-level causal\nrelationships has the ability to represent the segment-level causal\nrelationships. CASR works by reducing the difference between the causal\nadjacency matrix we constructed and that derived from the pre-segmentation\nresults of backbone models. In addition, we propose a novel evaluation metric,\nCausal Edit Distance (CED), to evaluate the causal interpretability. Extensive\nexperimental results on mainstream datasets indicate that CASR significantly\nsurpasses various existing methods in action segmentation performance, as well\nas in causal explainability and generalization.\n","authors":["Keqing Du","Xinyu Yang","Hang Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12401v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14343v1","updated":"2023-11-24T08:38:19Z","published":"2023-11-24T08:38:19Z","title":"Highly Detailed and Temporal Consistent Video Stylization via\n Synchronized Multi-Frame Diffusion","summary":" Text-guided video-to-video stylization transforms the visual appearance of a\nsource video to a different appearance guided by textual prompts. Existing\ntext-guided image diffusion models can be extended for stylized video\nsynthesis. 
However, they struggle to generate videos with both highly detailed\nappearance and temporal consistency. In this paper, we propose a synchronized\nmulti-frame diffusion framework to maintain both the visual details and the\ntemporal consistency. Frames are denoised in a synchronous fashion, and more\nimportantly, information of different frames is shared since the beginning of\nthe denoising process. Such information sharing ensures that a consensus, in\nterms of the overall structure and color distribution, among frames can be\nreached in the early stage of the denoising process before it is too late. The\noptical flow from the original video serves as the connection, and hence the\nvenue for information sharing, among frames. We demonstrate the effectiveness\nof our method in generating high-quality and diverse results in extensive\nexperiments. Our method shows superior qualitative and quantitative results\ncompared to state-of-the-art video editing methods.\n","authors":["Minshan Xie","Hanyuan Liu","Chengze Li","Tien-Tsin Wong"],"pdf_url":"https://arxiv.org/pdf/2311.14343v1.pdf","comment":"11 pages, 11 figures"},{"id":"http://arxiv.org/abs/2311.14339v1","updated":"2023-11-24T08:31:34Z","published":"2023-11-24T08:31:34Z","title":"Towards Concept-based Interpretability of Skin Lesion Diagnosis using\n Vision-Language Models","summary":" Concept-based models naturally lend themselves to the development of\ninherently interpretable skin lesion diagnosis, as medical experts make\ndecisions based on a set of visual patterns of the lesion. Nevertheless, the\ndevelopment of these models depends on the existence of concept-annotated\ndatasets, whose availability is scarce due to the specialized knowledge and\nexpertise required in the annotation process. In this work, we show that\nvision-language models can be used to alleviate the dependence on a large\nnumber of concept-annotated samples. In particular, we propose an embedding\nlearning strategy to adapt CLIP to the downstream task of skin lesion\nclassification using concept-based descriptions as textual embeddings. Our\nexperiments reveal that vision-language models not only attain better accuracy\nwhen using concepts as textual embeddings, but also require a smaller number of\nconcept-annotated samples to attain comparable performance to approaches\nspecifically devised for automatic concept generation.\n","authors":["Cristiano Patrício","Luís F. Teixeira","João C. Neves"],"pdf_url":"https://arxiv.org/pdf/2311.14339v1.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2311.14337v1","updated":"2023-11-24T08:24:31Z","published":"2023-11-24T08:24:31Z","title":"TVT: Training-Free Vision Transformer Search on Tiny Datasets","summary":" Training-free Vision Transformer (ViT) architecture search is presented to\nsearch for a better ViT with zero-cost proxies. While ViTs achieve significant\ndistillation gains from CNN teacher models on small datasets, the current\nzero-cost proxies in ViTs do not generalize well to the distillation training\nparadigm according to our experimental observations. In this paper, for the\nfirst time, we investigate how to search in a training-free manner with the\nhelp of teacher models and devise an effective Training-free ViT (TVT) search\nframework. Firstly, we observe that the similarity of attention maps between\nViT and ConvNet teachers affects distill accuracy notably. Thus, we present a\nteacher-aware metric conditioned on the feature attention relations between\nteacher and student. 
Additionally, TVT employs the L2-Norm of the student's\nweights as the student-capability metric to improve ranking consistency.\nFinally, TVT searches for the best ViT for distilling with ConvNet teachers via\nour teacher-aware metric and student-capability metric, resulting in impressive\ngains in efficiency and effectiveness. Extensive experiments on various tiny\ndatasets and search spaces show that our TVT outperforms state-of-the-art\ntraining-free search methods. The code will be released.\n","authors":["Zimian Wei","Hengyue Pan","Lujun Li","Peijie Dong","Zhiliang Tian","Xin Niu","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2311.14337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14334v1","updated":"2023-11-24T08:16:10Z","published":"2023-11-24T08:16:10Z","title":"Maximizing Discrimination Capability of Knowledge Distillation with\n Energy-based Score","summary":" To apply the latest computer vision techniques that require a large\ncomputational cost in real industrial applications, knowledge distillation\nmethods (KDs) are essential. Existing logit-based KDs apply the constant\ntemperature scaling to all samples in dataset, limiting the utilization of\nknowledge inherent in each sample individually. In our approach, we classify\nthe dataset into two categories (i.e., low energy and high energy samples)\nbased on their energy score. Through experiments, we have confirmed that low\nenergy samples exhibit high confidence scores, indicating certain predictions,\nwhile high energy samples yield low confidence scores, meaning uncertain\npredictions. To distill optimal knowledge by adjusting non-target class\npredictions, we apply a higher temperature to low energy samples to create\nsmoother distributions and a lower temperature to high energy samples to\nachieve sharper distributions. When compared to previous logit-based and\nfeature-based methods, our energy-based KD (Energy KD) achieves better\nperformance on various datasets. Especially, Energy KD shows significant\nimprovements on CIFAR-100-LT and ImageNet datasets, which contain many\nchallenging samples. Furthermore, we propose high energy-based data\naugmentation (HE-DA) for further improving the performance. We demonstrate that\nmeaningful performance improvement could be achieved by augmenting only 20-50%\nof dataset, suggesting that it can be employed on resource-limited devices. To\nthe best of our knowledge, this paper represents the first attempt to make use\nof energy scores in KD and DA, and we believe it will greatly contribute to\nfuture research.\n","authors":["Seonghak Kim","Gyeongdo Ham","Suin Lee","Donggon Jang","Daeshik Kim"],"pdf_url":"https://arxiv.org/pdf/2311.14334v1.pdf","comment":"22 pages, 4 figures. This work has been submitted to the Elsevier for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2212.13766v2","updated":"2023-11-24T08:11:59Z","published":"2022-12-28T10:08:55Z","title":"OVO: One-shot Vision Transformer Search with Online distillation","summary":" Pure transformers have shown great potential for vision tasks recently.\nHowever, their accuracy in small or medium datasets is not satisfactory.\nAlthough some existing methods introduce a CNN as a teacher to guide the\ntraining process by distillation, the gap between teacher and student networks\nwould lead to sub-optimal performance. 
In this work, we propose a new One-shot\nVision transformer search framework with Online distillation, namely OVO. OVO\nsamples sub-nets for both teacher and student networks for better distillation\nresults. Benefiting from the online distillation, thousands of subnets in the\nsupernet are well-trained without extra finetuning or retraining. In\nexperiments, OVO-Ti achieves 73.32% top-1 accuracy on ImageNet and 75.2% on\nCIFAR-100, respectively.\n","authors":["Zimian Wei","Hengyue Pan","Xin Niu","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2212.13766v2.pdf","comment":"The work is not implemented"},{"id":"http://arxiv.org/abs/2311.14323v1","updated":"2023-11-24T07:51:50Z","published":"2023-11-24T07:51:50Z","title":"Binarized 3D Whole-body Human Mesh Recovery","summary":" 3D whole-body human mesh recovery aims to reconstruct the 3D human body,\nface, and hands from a single image. Although powerful deep learning models\nhave achieved accurate estimation in this task, they require enormous memory\nand computational resources. Consequently, these methods can hardly be deployed\non resource-limited edge devices. In this work, we propose a Binarized Dual\nResidual Network (BiDRN), a novel quantization method to estimate the 3D human\nbody, face, and hands parameters efficiently. Specifically, we design a basic\nunit Binarized Dual Residual Block (BiDRB) composed of Local Convolution\nResidual (LCR) and Block Residual (BR), which can preserve full-precision\ninformation as much as possible. For LCR, we generalize it to four kinds of\nconvolutional modules so that full-precision information can be propagated even\nbetween mismatched dimensions. We also binarize the face and hands\nbox-prediction network as Binaried BoxNet, which can further reduce the model\nredundancy. Comprehensive quantitative and qualitative experiments demonstrate\nthe effectiveness of BiDRN, which has a significant improvement over\nstate-of-the-art binarization algorithms. Moreover, our proposed BiDRN achieves\ncomparable performance with full-precision method Hand4Whole while using just\n22.1% parameters and 14.8% operations. We will release all the code and\npretrained models.\n","authors":["Zhiteng Li","Yulun Zhang","Jing Lin","Haotong Qin","Jinjin Gu","Xin Yuan","Linghe Kong","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2311.14323v1.pdf","comment":"The code will be available at https://github.com/ZHITENGLI/BiDRN"},{"id":"http://arxiv.org/abs/2309.02301v2","updated":"2023-11-24T07:07:03Z","published":"2023-09-05T15:06:37Z","title":"CIEM: Contrastive Instruction Evaluation Method for Better Instruction\n Tuning","summary":" Nowadays, the research on Large Vision-Language Models (LVLMs) has been\nsignificantly promoted thanks to the success of Large Language Models (LLM).\nNevertheless, these Vision-Language Models (VLMs) are suffering from the\ndrawback of hallucination -- due to insufficient understanding of vision and\nlanguage modalities, VLMs may generate incorrect perception information when\ndoing downstream applications, for example, captioning a non-existent entity.\nTo address the hallucination phenomenon, on the one hand, we introduce a\nContrastive Instruction Evaluation Method (CIEM), which is an automatic\npipeline that leverages an annotated image-text dataset coupled with an LLM to\ngenerate factual/contrastive question-answer pairs for the evaluation of the\nhallucination of VLMs. 
On the other hand, based on CIEM, we further propose a\nnew instruction tuning method called CIT (the abbreviation of Contrastive\nInstruction Tuning) to alleviate the hallucination of VLMs by automatically\nproducing high-quality factual/contrastive question-answer pairs and\ncorresponding justifications for model tuning. Through extensive experiments on\nCIEM and CIT, we pinpoint the hallucination issues commonly present in existing\nVLMs, the disability of the current instruction-tuning dataset to handle the\nhallucination phenomenon and the superiority of CIT-tuned VLMs over both CIEM\nand public datasets.\n","authors":["Hongyu Hu","Jiyuan Zhang","Minyi Zhao","Zhenbang Sun"],"pdf_url":"https://arxiv.org/pdf/2309.02301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12070v3","updated":"2023-11-24T07:00:54Z","published":"2023-06-21T07:43:23Z","title":"Task-Robust Pre-Training for Worst-Case Downstream Adaptation","summary":" Pre-training has achieved remarkable success when transferred to downstream\ntasks. In machine learning, we care about not only the good performance of a\nmodel but also its behavior under reasonable shifts of condition. The same\nphilosophy holds when pre-training a foundation model. However, the foundation\nmodel may not uniformly behave well for a series of related downstream tasks.\nThis happens, for example, when conducting mask recovery regression where the\nrecovery ability or the training instances diverge like pattern features are\nextracted dominantly on pre-training, but semantic features are also required\non a downstream task. This paper considers pre-training a model that guarantees\na uniformly good performance over the downstream tasks. We call this goal as\n$\\textit{downstream-task robustness}$. Our method first separates the upstream\ntask into several representative ones and applies a simple minimax loss for\npre-training. We then design an efficient algorithm to solve the minimax loss\nand prove its convergence in the convex setting. In the experiments, we show\nboth on large-scale natural language processing and computer vision datasets\nour method increases the metrics on worse-case downstream tasks. Additionally,\nsome theoretical explanations for why our loss is beneficial are provided.\nSpecifically, we show fewer samples are inherently required for the most\nchallenging downstream task in some cases.\n","authors":["Jianghui Wang","Yang Chen","Xingyu Xie","Cong Fang","Zhouchen Lin"],"pdf_url":"https://arxiv.org/pdf/2306.12070v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14310v1","updated":"2023-11-24T06:43:26Z","published":"2023-11-24T06:43:26Z","title":"Stable Cluster Discrimination for Deep Clustering","summary":" Deep clustering can optimize representations of instances (i.e.,\nrepresentation learning) and explore the inherent data distribution (i.e.,\nclustering) simultaneously, which demonstrates a superior performance over\nconventional clustering methods with given features. However, the coupled\nobjective implies a trivial solution that all instances collapse to the uniform\nfeatures. To tackle the challenge, a two-stage training strategy is developed\nfor decoupling, where it introduces an additional pre-training stage for\nrepresentation learning and then fine-tunes the obtained model for clustering.\nMeanwhile, one-stage methods are developed mainly for representation learning\nrather than clustering, where various constraints for cluster assignments are\ndesigned to avoid collapsing explicitly. 
Despite the success of these methods,\nan appropriate learning objective tailored for deep clustering has not been\ninvestigated sufficiently. In this work, we first show that the prevalent\ndiscrimination task in supervised learning is unstable for one-stage clustering\ndue to the lack of ground-truth labels and positive instances for certain\nclusters in each mini-batch. To mitigate the issue, a novel stable cluster\ndiscrimination (SeCu) task is proposed and a new hardness-aware clustering\ncriterion can be obtained accordingly. Moreover, a global entropy constraint\nfor cluster assignments is studied with efficient optimization. Extensive\nexperiments are conducted on benchmark data sets and ImageNet. SeCu achieves\nstate-of-the-art performance on all of them, which demonstrates the\neffectiveness of one-stage deep clustering. Code is available at\n\\url{https://github.com/idstcv/SeCu}.\n","authors":["Qi Qian"],"pdf_url":"https://arxiv.org/pdf/2311.14310v1.pdf","comment":"accepted by ICCV'23"},{"id":"http://arxiv.org/abs/2311.14307v1","updated":"2023-11-24T06:34:47Z","published":"2023-11-24T06:34:47Z","title":"Cosine Similarity Knowledge Distillation for Individual Class\n Information Transfer","summary":" Previous logits-based Knowledge Distillation (KD) have utilized predictions\nabout multiple categories within each sample (i.e., class predictions) and have\nemployed Kullback-Leibler (KL) divergence to reduce the discrepancy between the\nstudent and teacher predictions. Despite the proliferation of KD techniques,\nthe student model continues to fall short of achieving a similar level as\nteachers. In response, we introduce a novel and effective KD method capable of\nachieving results on par with or superior to the teacher models performance. We\nutilize teacher and student predictions about multiple samples for each\ncategory (i.e., batch predictions) and apply cosine similarity, a commonly used\ntechnique in Natural Language Processing (NLP) for measuring the resemblance\nbetween text embeddings. This metric's inherent scale-invariance property,\nwhich relies solely on vector direction and not magnitude, allows the student\nto dynamically learn from the teacher's knowledge, rather than being bound by a\nfixed distribution of the teacher's knowledge. Furthermore, we propose a method\ncalled cosine similarity weighted temperature (CSWT) to improve the\nperformance. CSWT reduces the temperature scaling in KD when the cosine\nsimilarity between the student and teacher models is high, and conversely, it\nincreases the temperature scaling when the cosine similarity is low. This\nadjustment optimizes the transfer of information from the teacher to the\nstudent model. Extensive experimental results show that our proposed method\nserves as a viable alternative to existing methods. We anticipate that this\napproach will offer valuable insights for future research on model compression.\n","authors":["Gyeongdo Ham","Seonghak Kim","Suin Lee","Jae-Hyeok Lee","Daeshik Kim"],"pdf_url":"https://arxiv.org/pdf/2311.14307v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.14301v1","updated":"2023-11-24T06:22:38Z","published":"2023-11-24T06:22:38Z","title":"GeoViT: A Versatile Vision Transformer Architecture for Geospatial Image\n Analysis","summary":" Greenhouse gases are pivotal drivers of climate change, necessitating precise\nquantification and source identification to foster mitigation strategies. 
We\nintroduce GeoViT, a compact vision transformer model adept at processing\nsatellite imagery for multimodal segmentation, classification, and regression\ntasks targeting CO2 and NO2 emissions. Leveraging GeoViT, we attain superior\naccuracy in estimating power generation rates, fuel type, plume coverage for\nCO2, and high-resolution NO2 concentration mapping, surpassing previous\nstate-of-the-art models while significantly reducing model size. GeoViT\ndemonstrates the efficacy of vision transformer architectures in harnessing\nsatellite-derived data for enhanced GHG emission insights, proving instrumental\nin advancing climate change monitoring and emission regulation efforts\nglobally.\n","authors":["Madhav Khirwar","Ankur Narang"],"pdf_url":"https://arxiv.org/pdf/2311.14301v1.pdf","comment":"Extended Abstract, Preprint"},{"id":"http://arxiv.org/abs/2311.14294v1","updated":"2023-11-24T06:08:27Z","published":"2023-11-24T06:08:27Z","title":"Decouple Content and Motion for Conditional Image-to-Video Generation","summary":" The goal of conditional image-to-video (cI2V) generation is to create a\nbelievable new video by beginning with the condition, i.e., one image and text.\nThe previous cI2V generation methods conventionally perform in RGB pixel space,\nwith limitations in modeling motion consistency and visual continuity.\nAdditionally, the efficiency of generating videos in pixel space is quite low.\nIn this paper, we propose a novel approach to address these challenges by\ndisentangling the target RGB pixels into two distinct components: spatial\ncontent and temporal motions. Specifically, we predict temporal motions, which\ninclude a motion vector and a residual, based on a 3D-UNet diffusion model. By\nexplicitly modeling temporal motions and warping them to the starting image, we\nimprove the temporal consistency of generated videos. This results in a\nreduction of spatial redundancy, emphasizing temporal details. Our proposed\nmethod achieves performance improvements by disentangling content and motion,\nall without introducing new structural complexities to the model. Extensive\nexperiments on various datasets confirm our approach's superior performance\nover the majority of state-of-the-art methods in both effectiveness and\nefficiency.\n","authors":["Cuifeng Shen","Yulu Gan","Chen Chen","Xiongwei Zhu","Lele Cheng","Jinzhi Wang"],"pdf_url":"https://arxiv.org/pdf/2311.14294v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14356v2","updated":"2023-11-24T05:55:12Z","published":"2023-10-22T16:51:42Z","title":"Cultural and Linguistic Diversity Improves Visual Representations","summary":" Computer vision often treats perception as objective, and this assumption\ngets reflected in the way that datasets are collected and models are trained.\nFor instance, image descriptions in different languages are typically assumed\nto be translations of the same semantic content. However, work in\ncross-cultural psychology and linguistics has shown that individuals differ in\ntheir visual perception depending on their cultural background and the language\nthey speak. In this paper, we demonstrate significant differences in semantic\ncontent across languages in both dataset and model-produced captions. When data\nis multilingual as opposed to monolingual, captions have higher semantic\ncoverage on average, as measured by scene graph, embedding, and linguistic\ncomplexity. 
For example, multilingual captions have on average 21.8% more\nobjects, 24.5% more relations, and 27.1% more attributes than a set of\nmonolingual captions. Moreover, models trained on content from different\nlanguages perform best against test data from those languages, while those\ntrained on multilingual content perform consistently well across all evaluation\ndata compositions. Our research provides implications for how diverse modes of\nperception can improve image understanding.\n","authors":["Andre Ye","Sebastin Santy","Jena D. Hwang","Amy X. Zhang","Ranjay Krishna"],"pdf_url":"https://arxiv.org/pdf/2310.14356v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14284v1","updated":"2023-11-24T05:17:01Z","published":"2023-11-24T05:17:01Z","title":"Paragraph-to-Image Generation with Information-Enriched Diffusion Model","summary":" Text-to-image (T2I) models have recently experienced rapid development,\nachieving astonishing performance in terms of fidelity and textual alignment\ncapabilities. However, given a long paragraph (up to 512 words), these\ngeneration models still struggle to achieve strong alignment and are unable to\ngenerate images depicting complex scenes. In this paper, we introduce an\ninformation-enriched diffusion model for paragraph-to-image generation task,\ntermed ParaDiffusion, which delves into the transference of the extensive\nsemantic comprehension capabilities of large language models to the task of\nimage generation. At its core is using a large language model (e.g., Llama V2)\nto encode long-form text, followed by fine-tuning with LORA to alignthe\ntext-image feature spaces in the generation task. To facilitate the training of\nlong-text semantic alignment, we also curated a high-quality paragraph-image\npair dataset, namely ParaImage. This dataset contains a small amount of\nhigh-quality, meticulously annotated data, and a large-scale synthetic dataset\nwith long text descriptions being generated using a vision-language model.\nExperiments demonstrate that ParaDiffusion outperforms state-of-the-art models\n(SD XL, DeepFloyd IF) on ViLG-300 and ParaPrompts, achieving up to 15% and 45%\nhuman voting rate improvements for visual appeal and text faithfulness,\nrespectively. The code and dataset will be released to foster community\nresearch on long-text alignment.\n","authors":["Weijia Wu","Zhuang Li","Yefei He","Mike Zheng Shou","Chunhua Shen","Lele Cheng","Yan Li","Tingting Gao","Di Zhang","Zhongyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.14284v1.pdf","comment":"The project website is at:\n https://weijiawu.github.io/ParaDiffusionPage/. Code:\n https://github.com/weijiawu/ParaDiffusion"},{"id":"http://arxiv.org/abs/2311.14282v1","updated":"2023-11-24T05:11:35Z","published":"2023-11-24T05:11:35Z","title":"Image Super-Resolution with Text Prompt Diffusion","summary":" Image super-resolution (SR) methods typically model degradation to improve\nreconstruction accuracy in complex and unknown degradation scenarios. However,\nextracting degradation information from low-resolution images is challenging,\nwhich limits the model performance. To boost image SR performance, one feasible\napproach is to introduce additional priors. Inspired by advancements in\nmulti-modal methods and text prompt image processing, we introduce text prompts\nto image SR to provide degradation priors. Specifically, we first design a\ntext-image generation pipeline to integrate text into SR dataset through the\ntext degradation representation and degradation model. 
The text representation\napplies a discretization manner based on the binning method to describe the\ndegradation abstractly. This representation method can also maintain the\nflexibility of language. Meanwhile, we propose the PromptSR to realize the text\nprompt SR. The PromptSR employs the diffusion model and the pre-trained\nlanguage model (e.g., T5 and CLIP). We train the model on the generated\ntext-image dataset. Extensive experiments indicate that introducing text\nprompts into image SR, yields excellent results on both synthetic and\nreal-world images. Code: https://github.com/zhengchen1999/PromptSR.\n","authors":["Zheng Chen","Yulun Zhang","Jinjin Gu","Xin Yuan","Linghe Kong","Guihai Chen","Xiaokang Yang"],"pdf_url":"https://arxiv.org/pdf/2311.14282v1.pdf","comment":"Code is available at https://github.com/zhengchen1999/PromptSR"},{"id":"http://arxiv.org/abs/2311.14281v1","updated":"2023-11-24T05:06:28Z","published":"2023-11-24T05:06:28Z","title":"Multi-modal Instance Refinement for Cross-domain Action Recognition","summary":" Unsupervised cross-domain action recognition aims at adapting the model\ntrained on an existing labeled source domain to a new unlabeled target domain.\nMost existing methods solve the task by directly aligning the feature\ndistributions of source and target domains. However, this would cause negative\ntransfer during domain adaptation due to some negative training samples in both\ndomains. In the source domain, some training samples are of low-relevance to\ntarget domain due to the difference in viewpoints, action styles, etc. In the\ntarget domain, there are some ambiguous training samples that can be easily\nclassified as another type of action under the case of source domain. The\nproblem of negative transfer has been explored in cross-domain object\ndetection, while it remains under-explored in cross-domain action recognition.\nTherefore, we propose a Multi-modal Instance Refinement (MMIR) method to\nalleviate the negative transfer based on reinforcement learning. Specifically,\na reinforcement learning agent is trained in both domains for every modality to\nrefine the training data by selecting out negative samples from each domain.\nOur method finally outperforms several other state-of-the-art baselines in\ncross-domain action recognition on the benchmark EPIC-Kitchens dataset, which\ndemonstrates the advantage of MMIR in reducing negative transfer.\n","authors":["Yuan Qing","Naixing Wu","Shaohua Wan","Lixin Duan"],"pdf_url":"https://arxiv.org/pdf/2311.14281v1.pdf","comment":"Accepted by PRCV 2023"},{"id":"http://arxiv.org/abs/2311.14280v1","updated":"2023-11-24T04:55:20Z","published":"2023-11-24T04:55:20Z","title":"Latent Diffusion Prior Enhanced Deep Unfolding for Spectral Image\n Reconstruction","summary":" Snapshot compressive spectral imaging reconstruction aims to reconstruct\nthree-dimensional spatial-spectral images from a single-shot two-dimensional\ncompressed measurement. Existing state-of-the-art methods are mostly based on\ndeep unfolding structures but have intrinsic performance bottlenecks: $i$) the\nill-posed problem of dealing with heavily degraded measurement, and $ii$) the\nregression loss-based reconstruction models being prone to recover images with\nfew details. In this paper, we introduce a generative model, namely the latent\ndiffusion model (LDM), to generate degradation-free prior to enhance the\nregression-based deep unfolding method. 
Furthermore, to overcome the large\ncomputational cost challenge in LDM, we propose a lightweight model to generate\nknowledge priors in deep unfolding denoiser, and integrate these priors to\nguide the reconstruction process for compensating high-quality spectral signal\ndetails. Numeric and visual comparisons on synthetic and real-world datasets\nillustrate the superiority of our proposed method in both reconstruction\nquality and computational efficiency. Code will be released.\n","authors":["Zongliang Wu","Ruiying Lu","Ying Fu","Xin Yuan"],"pdf_url":"https://arxiv.org/pdf/2311.14280v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14276v1","updated":"2023-11-24T04:40:26Z","published":"2023-11-24T04:40:26Z","title":"Racing With ROS 2 A Navigation System for an Autonomous Formula Student\n Race Car","summary":" The advent of autonomous vehicle technologies has significantly impacted\nvarious sectors, including motorsport, where Formula Student and Formula:\nSociety of Automotive Engineers introduced autonomous racing classes. These\noffer new challenges to aspiring engineers, including the team at QUT\nMotorsport, but also raise the entry barrier due to the complexity of\nhigh-speed navigation and control. This paper presents an open-source solution\nusing the Robot Operating System 2, specifically its open-source navigation\nstack, to address these challenges in autonomous Formula Student race cars. We\ncompare off-the-shelf navigation libraries that this stack comprises of against\ntraditional custom-made programs developed by QUT Motorsport to evaluate their\napplicability in autonomous racing scenarios and integrate them onto an\nautonomous race car. Our contributions include quantitative and qualitative\ncomparisons of these packages against traditional navigation solutions, aiming\nto lower the entry barrier for autonomous racing. This paper also serves as a\ncomprehensive tutorial for teams participating in similar racing disciplines\nand other autonomous mobile robot applications.\n","authors":["Alastair Bradford","Grant van Breda","Tobias Fischer"],"pdf_url":"https://arxiv.org/pdf/2311.14276v1.pdf","comment":"10 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.14275v1","updated":"2023-11-24T04:30:31Z","published":"2023-11-24T04:30:31Z","title":"Cooperative Dual Attention for Audio-Visual Speech Enhancement with\n Facial Cues","summary":" In this work, we focus on leveraging facial cues beyond the lip region for\nrobust Audio-Visual Speech Enhancement (AVSE). The facial region, encompassing\nthe lip region, reflects additional speech-related attributes such as gender,\nskin color, nationality, etc., which contribute to the effectiveness of AVSE.\nHowever, static and dynamic speech-unrelated attributes also exist, causing\nappearance changes during speech. To address these challenges, we propose a\nDual Attention Cooperative Framework, DualAVSE, to ignore speech-unrelated\ninformation, capture speech-related information with facial cues, and\ndynamically integrate it with the audio signal for AVSE. Specifically, we\nintroduce a spatial attention-based visual encoder to capture and enhance\nvisual speech information beyond the lip region, incorporating global facial\ncontext and automatically ignoring speech-unrelated information for robust\nvisual feature extraction. Additionally, a dynamic visual feature fusion\nstrategy is introduced by integrating a temporal-dimensional self-attention\nmodule, enabling the model to robustly handle facial variations. 
The acoustic\nnoise in the speaking process is variable, impacting audio quality. Therefore,\na dynamic fusion strategy for both audio and visual features is introduced to\naddress this issue. By integrating cooperative dual attention in the visual\nencoder and audio-visual fusion strategy, our model effectively extracts\nbeneficial speech information from both audio and visual cues for AVSE.\nThorough analysis and comparison on different datasets, including normal and\nchallenging cases with unreliable or absent visual information, consistently\nshow our model outperforming existing methods across multiple metrics.\n","authors":["Feixiang Wang","Shuang Yang","Shiguang Shan","Xilin Chen"],"pdf_url":"https://arxiv.org/pdf/2311.14275v1.pdf","comment":"Accepted to BMVC 2023 15 pages, 2 figures"},{"id":"http://arxiv.org/abs/2311.14272v1","updated":"2023-11-24T04:16:32Z","published":"2023-11-24T04:16:32Z","title":"CRISP: Hybrid Structured Sparsity for Class-aware Model Pruning","summary":" Machine learning pipelines for classification tasks often train a universal\nmodel to achieve accuracy across a broad range of classes. However, a typical\nuser encounters only a limited selection of classes regularly. This disparity\nprovides an opportunity to enhance computational efficiency by tailoring models\nto focus on user-specific classes. Existing works rely on unstructured pruning,\nwhich introduces randomly distributed non-zero values in the model, making it\nunsuitable for hardware acceleration. Alternatively, some approaches employ\nstructured pruning, such as channel pruning, but these tend to provide only\nminimal compression and may lead to reduced model accuracy. In this work, we\npropose CRISP, a novel pruning framework leveraging a hybrid structured\nsparsity pattern that combines both fine-grained N:M structured sparsity and\ncoarse-grained block sparsity. Our pruning strategy is guided by a\ngradient-based class-aware saliency score, allowing us to retain weights\ncrucial for user-specific classes. CRISP achieves high accuracy with minimal\nmemory consumption for popular models like ResNet-50, VGG-16, and MobileNetV2\non ImageNet and CIFAR-100 datasets. Moreover, CRISP delivers up to 14$\\times$\nreduction in latency and energy consumption compared to existing pruning\nmethods while maintaining comparable accuracy. Our code is available at\nhttps://github.com/shivmgg/CRISP/.\n","authors":["Shivam Aggarwal","Kuluhan Binici","Tulika Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.14272v1.pdf","comment":"6 pages, accepted in Design, Automation & Test in Europe Conference &\n Exhibition (DATE) 2024"},{"id":"http://arxiv.org/abs/2311.14271v1","updated":"2023-11-24T04:15:10Z","published":"2023-11-24T04:15:10Z","title":"Segmentation-Based Parametric Painting","summary":" We introduce a novel image-to-painting method that facilitates the creation\nof large-scale, high-fidelity paintings with human-like quality and stylistic\nvariation. To process large images and gain control over the painting process,\nwe introduce a segmentation-based painting process and a dynamic attention map\napproach inspired by human painting strategies, allowing optimization of brush\nstrokes to proceed in batches over different image regions, thereby capturing\nboth large-scale structure and fine details, while also allowing stylistic\ncontrol over detail. 
Our optimized batch processing and patch-based loss\nframework enable efficient handling of large canvases, ensuring our painted\noutputs are both aesthetically compelling and functionally superior as compared\nto previous methods, as confirmed by rigorous evaluations. Code available at:\nhttps://github.com/manuelladron/semantic\\_based\\_painting.git\n","authors":["Manuel Ladron de Guevara","Matthew Fisher","Aaron Hertzmann"],"pdf_url":"https://arxiv.org/pdf/2311.14271v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2311.14265v1","updated":"2023-11-24T03:43:59Z","published":"2023-11-24T03:43:59Z","title":"Bursting Spikes: Efficient and High-performance SNNs for Event-based\n Vision","summary":" Advancing event-driven vision through spiking neural networks (SNNs) is\ncrucial to empowering high-speed and efficient perception. While directly\nconverting the pre-trained artificial neural networks (ANNs) - by replacing the\nnon-linear activation with spiking neurons - can provide SNNs with good\nperformance, the resultant SNNs typically demand long timesteps and high energy\nconsumption to achieve their optimal performance. To address this challenge, we\nintroduce the burst-spike mechanism inspired by the biological nervous system,\nallowing multiple spikes per timestep to reduce conversion errors and produce\nlow-latency SNNs. To further bolster this enhancement, we leverage the Pareto\nFrontier-driven algorithm to reallocate burst-firing patterns. Moreover, to\nreduce energy consumption during the conversion process, we propose a\nsensitivity-driven spike compression technique, which automatically locates the\noptimal threshold ratio according to layer-specific sensitivity. Extensive\nexperiments demonstrate our approach outperforms state-of-the-art SNN methods,\nshowcasing superior performance and reduced energy usage across classification\nand object detection. Our code will be available at\nhttps://github.com/bic-L/burst-ann2snn.\n","authors":["Ziqing Wang","Yuetong Fang","Jiahang Cao","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2311.14265v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2309.08273v3","updated":"2023-11-24T03:23:32Z","published":"2023-09-15T09:34:05Z","title":"Unsupervised Disentangling of Facial Representations with 3D-aware\n Latent Diffusion Models","summary":" Unsupervised learning of facial representations has gained increasing\nattention for face understanding ability without heavily relying on large-scale\nannotated datasets. However, it remains unsolved due to the coupling of facial\nidentities, expressions, and external factors like pose and light. Prior\nmethods primarily focus on 2D factors and pixel-level consistency, leading to\nincomplete disentangling and suboptimal performance in downstream tasks. In\nthis paper, we propose LatentFace, a novel unsupervised disentangling framework\nfor facial expression and identity representation. We suggest the disentangling\nproblem should be performed in latent space and propose the solution using a\n3D-aware latent diffusion model. First, we introduce a 3D-aware autoencoder to\nencode face images into 3D latent embeddings. Second, we propose a novel\nrepresentation diffusion model (RDM) to disentangle 3D latent into facial\nidentity and expression. Consequently, our method achieves state-of-the-art\nperformance in facial expression recognition and face verification among\nunsupervised facial representation learning models. 
Codes are available at\n\\url{https://github.com/ryanhe312/LatentFace}.\n","authors":["Ruian He","Zhen Xing","Weimin Tan","Bo Yan"],"pdf_url":"https://arxiv.org/pdf/2309.08273v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14262v1","updated":"2023-11-24T03:19:17Z","published":"2023-11-24T03:19:17Z","title":"ZeroPS: High-quality Cross-modal Knowledge Transfer for Zero-Shot 3D\n Part Segmentation","summary":" Recently, many 2D pretrained foundational models have demonstrated impressive\nzero-shot prediction capabilities. In this work, we design a novel pipeline for\nzero-shot 3D part segmentation, called ZeroPS. It performs high-quality transfer of\nknowledge from 2D pretrained foundational models to 3D point clouds. The main\nidea of our approach is to explore the natural relationship between multi-view\ncorrespondences and the prompt mechanism of foundational models and build\nbridges on it. Our pipeline consists of two components: 1) a self-extension\ncomponent that extends 2D groups from a single viewpoint to spatial\nglobal-level 3D groups; 2) a multi-modal labeling component that introduces a\ntwo-dimensional checking mechanism to vote each 2D predicted bounding box to\nthe best matching 3D part, and a Class Non-highest Vote Penalty function to\nrefine the Vote Matrix. Additionally, a merging algorithm is included to merge\npart-level 3D groups. Extensive evaluation on three zero-shot segmentation\ntasks on the PartnetE dataset achieves state-of-the-art results with significant\nimprovements (+19.6%, +5.2% and +4.9%, respectively) over existing methods. Our\nproposed approach does not need any training, fine-tuning or learnable\nparameters. It is hardly affected by domain shift. The code will be released.\n","authors":["Yuheng Xue","Nenglun Chen","Jun Liu","Wenyun Sun"],"pdf_url":"https://arxiv.org/pdf/2311.14262v1.pdf","comment":"11 pages, 6 figures; references added"},{"id":"http://arxiv.org/abs/2307.15064v2","updated":"2023-11-24T02:58:58Z","published":"2023-07-27T17:59:59Z","title":"Self-Supervised Visual Acoustic Matching","summary":" Acoustic matching aims to re-synthesize an audio clip to sound as if it were\nrecorded in a target acoustic environment. Existing methods assume access to\npaired training data, where the audio is observed in both source and target\nenvironments, but this limits the diversity of training data or requires the\nuse of simulated data or heuristics to create paired samples. We propose a\nself-supervised approach to visual acoustic matching where training samples\ninclude only the target scene image and audio -- without acoustically\nmismatched source audio for reference. Our approach jointly learns to\ndisentangle room acoustics and re-synthesize audio into the target environment,\nvia a conditional GAN framework and a novel metric that quantifies the level of\nresidual acoustic information in the de-biased audio. 
Training with either\nin-the-wild web data or simulated data, we demonstrate it outperforms the\nstate-of-the-art on multiple challenging datasets and a wide variety of\nreal-world audio and environments.\n","authors":["Arjun Somayazulu","Changan Chen","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2307.15064v2.pdf","comment":"Project page: https://vision.cs.utexas.edu/projects/ss_vam/ .\n Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2307.16449v2","updated":"2023-11-24T02:43:18Z","published":"2023-07-31T07:15:45Z","title":"MovieChat: From Dense Token to Sparse Memory for Long Video\n Understanding","summary":" Recently, integrating video foundation models and large language models to\nbuild a video understanding system can overcome the limitations of specific\npre-defined vision tasks. Yet, existing systems can only handle videos with\nvery few frames. For long videos, the computation complexity, memory cost, and\nlong-term temporal connection impose additional challenges. Taking advantage of\nthe Atkinson-Shiffrin memory model, with tokens in Transformers being employed\nas the carriers of memory in combination with our specially designed memory\nmechanism, we propose the MovieChat to overcome these challenges. MovieChat\nachieves state-of-the-art performance in long video understanding, along with\nthe released MovieChat-1K benchmark with 1K long video and 14K manual\nannotations for validation of the effectiveness of our method.\n","authors":["Enxin Song","Wenhao Chai","Guanhong Wang","Yucheng Zhang","Haoyang Zhou","Feiyang Wu","Haozhe Chi","Xun Guo","Tian Ye","Yanting Zhang","Yan Lu","Jenq-Neng Hwang","Gaoang Wang"],"pdf_url":"https://arxiv.org/pdf/2307.16449v2.pdf","comment":"Preprint. Project Website https://rese1f.github.io/MovieChat/"},{"id":"http://arxiv.org/abs/2311.14242v1","updated":"2023-11-24T01:15:57Z","published":"2023-11-24T01:15:57Z","title":"RSB-Pose: Robust Short-Baseline Binocular 3D Human Pose Estimation with\n Occlusion Handling","summary":" In the domain of 3D Human Pose Estimation, which finds widespread daily\napplications, the requirement for convenient acquisition equipment continues to\ngrow. To satisfy this demand, we set our sights on a short-baseline binocular\nsetting that offers both portability and a geometric measurement property that\nradically mitigates depth ambiguity. However, as the binocular baseline\nshortens, two serious challenges emerge: first, the robustness of 3D\nreconstruction against 2D errors deteriorates; and second, occlusion reoccurs\ndue to the limited visual differences between two views. To address the first\nchallenge, we propose the Stereo Co-Keypoints Estimation module to improve the\nview consistency of 2D keypoints and enhance the 3D robustness. In this module,\nthe disparity is utilized to represent the correspondence of binocular 2D\npoints and the Stereo Volume Feature is introduced to contain binocular\nfeatures across different disparities. Through the regression of SVF, two-view\n2D keypoints are simultaneously estimated in a collaborative way which\nrestricts their view consistency. Furthermore, to deal with occlusions, a\nPre-trained Pose Transformer module is introduced. Through this module, 3D\nposes are refined by perceiving pose coherence, a representation of joint\ncorrelations. 
This perception is injected by the Pose Transformer network and\nlearned through a pre-training task that recovers iterative masked joints.\nComprehensive experiments carried out on H36M and MHAD datasets, complemented\nby visualizations, validate the effectiveness of our approach in the\nshort-baseline binocular 3D Human Pose Estimation and occlusion handling.\n","authors":["Xiaoyue Wan","Zhuo Chen","Yiming Bao","Xu Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.14242v1.pdf","comment":"13 pages, 8 figures, currently under review at IEEE Transactions on\n Image Processing journal"},{"id":"http://arxiv.org/abs/2311.14237v1","updated":"2023-11-24T00:36:17Z","published":"2023-11-24T00:36:17Z","title":"Pseudo-label Correction for Instance-dependent Noise Using\n Teacher-student Framework","summary":" The high capacity of deep learning models to learn complex patterns poses a\nsignificant challenge when confronted with label noise. The inability to\ndifferentiate clean and noisy labels ultimately results in poor generalization.\nWe approach this problem by reassigning the label for each image using a new\nteacher-student based framework termed P-LC (pseudo-label correction).\nTraditional teacher-student networks are composed of teacher and student\nclassifiers for knowledge distillation. In our novel approach, we reconfigure\nthe teacher network into a triple encoder, leveraging the triplet loss to\nestablish a pseudo-label correction system. As the student generates pseudo\nlabels for a set of given images, the teacher learns to choose between the\ninitially assigned labels and the pseudo labels. Experiments on MNIST,\nFashion-MNIST, and SVHN demonstrate P-LC's superior performance over existing\nstate-of-the-art methods across all noise levels, most notably in high noise.\nIn addition, we introduce a noise level estimation to help assess model\nperformance and inform the need for additional data cleaning procedures.\n","authors":["Eugene Kim"],"pdf_url":"https://arxiv.org/pdf/2311.14237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01017v2","updated":"2023-11-24T00:24:06Z","published":"2023-11-02T06:21:56Z","title":"Learning Unsupervised World Models for Autonomous Driving via Discrete\n Diffusion","summary":" Learning world models can teach an agent how the world works in an\nunsupervised manner. Even though it can be viewed as a special case of sequence\nmodeling, progress for scaling world models on robotic applications such as\nautonomous driving has been somewhat less rapid than scaling language models\nwith Generative Pre-trained Transformers (GPT). We identify two reasons as\nmajor bottlenecks: dealing with complex and unstructured observation space, and\nhaving a scalable generative model. Consequently, we propose a novel world\nmodeling approach that first tokenizes sensor observations with VQVAE, then\npredicts the future via discrete diffusion. To efficiently decode and denoise\ntokens in parallel, we recast Masked Generative Image Transformer into the\ndiscrete diffusion framework with a few simple changes, resulting in notable\nimprovement. When applied to learning world models on point cloud observations,\nour model reduces prior SOTA Chamfer distance by more than 65% for 1s\nprediction, and more than 50% for 3s prediction, across NuScenes, KITTI\nOdometry, and Argoverse2 datasets. 
Our results demonstrate that discrete\ndiffusion on tokenized agent experience can unlock the power of GPT-like\nunsupervised learning for robotic agents.\n","authors":["Lunjun Zhang","Yuwen Xiong","Ze Yang","Sergio Casas","Rui Hu","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2311.01017v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2311.14583v1","updated":"2023-11-24T16:19:04Z","published":"2023-11-24T16:19:04Z","title":"GPT Struct Me: Probing GPT Models on Narrative Entity Extraction","summary":" The importance of systems that can extract structured information from\ntextual data becomes increasingly pronounced given the ever-increasing volume\nof text produced on a daily basis. Having a system that can effectively extract\nsuch information in an interoperable manner would be an asset for several\ndomains, be it finance, health, or legal. Recent developments in natural\nlanguage processing led to the production of powerful language models that can,\nto some degree, mimic human intelligence. Such effectiveness raises a pertinent\nquestion: Can these models be leveraged for the extraction of structured\ninformation? In this work, we address this question by evaluating the\ncapabilities of two state-of-the-art language models -- GPT-3 and GPT-3.5,\ncommonly known as ChatGPT -- in the extraction of narrative entities, namely\nevents, participants, and temporal expressions. This study is conducted on the\nText2Story Lusa dataset, a collection of 119 Portuguese news articles whose\nannotation framework includes a set of entity structures along with several\ntags and attribute values. We first select the best prompt template through an\nablation study over prompt components that provide varying degrees of\ninformation on a subset of documents of the dataset. Subsequently, we use the\nbest templates to evaluate the effectiveness of the models on the remaining\ndocuments. The results obtained indicate that GPT models are competitive with\nout-of-the-box baseline systems, presenting an all-in-one alternative for\npractitioners with limited resources. By studying the strengths and limitations\nof these models in the context of information extraction, we offer insights\nthat can guide future improvements and avenues to explore in this field.\n","authors":["Hugo Sousa","Nuno Guimarães","Alípio Jorge","Ricardo Campos"],"pdf_url":"https://arxiv.org/pdf/2311.14583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.02892v2","updated":"2023-11-24T02:40:44Z","published":"2022-09-07T02:34:39Z","title":"A Systematical Evaluation for Next-Basket Recommendation Algorithms","summary":" Next basket recommender systems (NBRs) aim to recommend a user's next\n(shopping) basket of items via modeling the user's preferences towards items\nbased on the user's purchase history, usually a sequence of historical baskets.\nDue to its wide applicability in the real-world E-commerce industry, the\nstudies NBR have attracted increasing attention in recent years. NBRs have been\nwidely studied and much progress has been achieved in this area with a variety\nof NBR approaches having been proposed. However, an important issue is that\nthere is a lack of a systematic and unified evaluation over the various NBR\napproaches. Different studies often evaluate NBR approaches on different\ndatasets, under different experimental settings, making it hard to fairly and\neffectively compare the performance of different NBR approaches. 
To bridge this\ngap, in this work, we conduct a systematical empirical study in NBR area.\nSpecifically, we review the representative work in NBR and analyze their cons\nand pros. Then, we run the selected NBR algorithms on the same datasets, under\nthe same experimental setting and evaluate their performances using the same\nmeasurements. This provides a unified framework to fairly compare different NBR\napproaches. We hope this study can provide a valuable reference for the future\nresearch in this vibrant area.\n","authors":["Zhufeng Shao","Shoujin Wang","Qian Zhang","Wenpeng Lu","Zhao Li","Xueping Peng"],"pdf_url":"https://arxiv.org/pdf/2209.02892v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14837v1","updated":"2023-11-24T20:16:38Z","published":"2023-11-24T20:16:38Z","title":"Benchmarking Robustness of Text-Image Composed Retrieval","summary":" Text-image composed retrieval aims to retrieve the target image through the\ncomposed query, which is specified in the form of an image plus some text that\ndescribes desired modifications to the input image. It has recently attracted\nattention due to its ability to leverage both information-rich images and\nconcise language to precisely express the requirements for target images.\nHowever, the robustness of these approaches against real-world corruptions or\nfurther text understanding has never been studied. In this paper, we perform\nthe first robustness study and establish three new diversified benchmarks for\nsystematic analysis of text-image composed retrieval against natural\ncorruptions in both vision and text and further probe textural understanding.\nFor natural corruption analysis, we introduce two new large-scale benchmark\ndatasets, CIRR-C and FashionIQ-C for testing in open domain and fashion domain\nrespectively, both of which apply 15 visual corruptions and 7 textural\ncorruptions. For textural understanding analysis, we introduce a new diagnostic\ndataset CIRR-D by expanding the original raw data with synthetic data, which\ncontains modified text to better probe textual understanding ability including\nnumerical variation, attribute variation, object removal, background variation,\nand fine-grained evaluation. The code and benchmark datasets are available at\nhttps://github.com/SunTongtongtong/Benchmark-Robustness-Text-Image-Compose-Retrieval.\n","authors":["Shitong Sun","Jindong Gu","Shaogang Gong"],"pdf_url":"https://arxiv.org/pdf/2311.14837v1.pdf","comment":"Accepted by R0-FoMo: Workshop on Robustness of Few-shot and Zero-shot\n Learning in Foundation Models at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.14778v1","updated":"2023-11-24T15:46:32Z","published":"2023-11-24T15:46:32Z","title":"Anomaly detection in cross-country money transfer temporal networks","summary":" During the last decades, Anti-Financial Crime (AFC) entities and Financial\nInstitutions have put a constantly increasing effort to reduce financial crime\nand detect fraudulent activities, that are changing and developing in extremely\ncomplex ways. We propose an anomaly detection approach based on network\nanalysis to help AFC officers navigating through the high load of information\nthat is typical of AFC data-driven scenarios. 
By experimenting on a large\nfinancial dataset of more than 80M cross-country wire transfers, we leverage on\nthe properties of complex networks to develop a tool for explainable anomaly\ndetection, that can help in identifying outliers that could be engaged in\npotentially malicious activities according to financial regulations. We\nidentify a set of network centrality measures that provide useful insights on\nindividual nodes; by keeping track of the evolution over time of the\ncentrality-based node rankings, we are able to highlight sudden and unexpected\nchanges in the roles of individual nodes that deserve further attention by AFC\nofficers. Such changes can hardly be noticed by means of current AFC practices,\nthat sometimes can lack a higher-level, global vision of the system. This\napproach represents a preliminary step in the automation of AFC and AML\nprocesses, serving the purpose of facilitating the work of AFC officers by\nproviding them with a top-down view of the picture emerging from financial\ndata.\n","authors":["Salvatore Vilella","Arthur Thomas Edward Capozzi Lupi","Marco Fornasiero","Dario Moncalvo","Valeria Ricci","Silvia Ronchiadin","Giancarlo Ruffo"],"pdf_url":"https://arxiv.org/pdf/2311.14778v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2311.14670v1","updated":"2023-11-24T18:59:04Z","published":"2023-11-24T18:59:04Z","title":"Differentiable and accelerated spherical harmonic and Wigner transforms","summary":" Many areas of science and engineering encounter data defined on spherical\nmanifolds. Modelling and analysis of spherical data often necessitates\nspherical harmonic transforms, at high degrees, and increasingly requires\nefficient computation of gradients for machine learning or other differentiable\nprogramming tasks. We develop novel algorithmic structures for accelerated and\ndifferentiable computation of generalised Fourier transforms on the sphere\n$\\mathbb{S}^2$ and rotation group $\\text{SO}(3)$, i.e. spherical harmonic and\nWigner transforms, respectively. We present a recursive algorithm for the\ncalculation of Wigner $d$-functions that is both stable to high harmonic\ndegrees and extremely parallelisable. By tightly coupling this with separable\nspherical transforms, we obtain algorithms that exhibit an extremely\nparallelisable structure that is well-suited for the high throughput computing\nof modern hardware accelerators (e.g. GPUs). We also develop a hybrid automatic\nand manual differentiation approach so that gradients can be computed\nefficiently. Our algorithms are implemented within the JAX differentiable\nprogramming framework in the S2FFT software code. Numerous samplings of the\nsphere are supported, including equiangular and HEALPix sampling. Computational\nerrors are at the order of machine precision for spherical samplings that admit\na sampling theorem. When benchmarked against alternative C codes we observe up\nto a 400-fold acceleration. Furthermore, when distributing over multiple GPUs\nwe achieve very close to optimal linear scaling with increasing number of GPUs\ndue to the highly parallelised and balanced nature of our algorithms. Provided\naccess to sufficiently many GPUs our transforms thus exhibit an unprecedented\neffective linear time complexity.\n","authors":["Matthew A. Price","Jason D. 
McEwen"],"pdf_url":"https://arxiv.org/pdf/2311.14670v1.pdf","comment":"30 pages, 7 figures, code available at\n https://github.com/astro-informatics/s2fft"},{"id":"http://arxiv.org/abs/2211.11744v3","updated":"2023-11-24T18:53:31Z","published":"2022-11-21T18:59:33Z","title":"Visual Dexterity: In-Hand Reorientation of Novel and Complex Object\n Shapes","summary":" In-hand object reorientation is necessary for performing many dexterous\nmanipulation tasks, such as tool use in less structured environments that\nremain beyond the reach of current robots. Prior works built reorientation\nsystems assuming one or many of the following: reorienting only specific\nobjects with simple shapes, limited range of reorientation, slow or quasistatic\nmanipulation, simulation-only results, the need for specialized and costly\nsensor suites, and other constraints which make the system infeasible for\nreal-world deployment. We present a general object reorientation controller\nthat does not make these assumptions. It uses readings from a single commodity\ndepth camera to dynamically reorient complex and new object shapes by any\nrotation in real-time, with the median reorientation time being close to seven\nseconds. The controller is trained using reinforcement learning in simulation\nand evaluated in the real world on new object shapes not used for training,\nincluding the most challenging scenario of reorienting objects held in the air\nby a downward-facing hand that must counteract gravity during reorientation.\nOur hardware platform only uses open-source components that cost less than five\nthousand dollars. Although we demonstrate the ability to overcome assumptions\nin prior work, there is ample scope for improving absolute performance. For\ninstance, the challenging duck-shaped object not used for training was dropped\nin 56 percent of the trials. When it was not dropped, our controller reoriented\nthe object within 0.4 radians (23 degrees) 75 percent of the time. Videos are\navailable at: https://taochenshh.github.io/projects/visual-dexterity.\n","authors":["Tao Chen","Megha Tippur","Siyang Wu","Vikash Kumar","Edward Adelson","Pulkit Agrawal"],"pdf_url":"https://arxiv.org/pdf/2211.11744v3.pdf","comment":"Published in Science Robotics:\n https://www.science.org/doi/10.1126/scirobotics.adc9244"},{"id":"http://arxiv.org/abs/2311.14658v1","updated":"2023-11-24T18:46:54Z","published":"2023-11-24T18:46:54Z","title":"Convergence Analysis for Learning Orthonormal Deep Linear Neural\n Networks","summary":" Enforcing orthonormal or isometric property for the weight matrices has been\nshown to enhance the training of deep neural networks by mitigating gradient\nexploding/vanishing and increasing the robustness of the learned networks.\nHowever, despite its practical performance, the theoretical analysis of\northonormality in neural networks is still lacking; for example, how\northonormality affects the convergence of the training process. In this letter,\nwe aim to bridge this gap by providing convergence analysis for training\northonormal deep linear neural networks. Specifically, we show that Riemannian\ngradient descent with an appropriate initialization converges at a linear rate\nfor training orthonormal deep linear neural networks with a class of loss\nfunctions. Unlike existing works that enforce orthonormal weight matrices for\nall the layers, our approach excludes this requirement for one layer, which is\ncrucial to establish the convergence guarantee. 
Our results shed light on how\nincreasing the number of hidden layers can impact the convergence speed.\nExperimental results validate our theoretical analysis.\n","authors":["Zhen Qin","Xuwei Tan","Zhihui Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.14658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14654v1","updated":"2023-11-24T18:38:13Z","published":"2023-11-24T18:38:13Z","title":"JetLOV: Enhancing Jet Tree Tagging through Neural Network Learning of\n Optimal LundNet Variables","summary":" Machine learning has played a pivotal role in advancing physics, with deep\nlearning notably contributing to solving complex classification problems such\nas jet tagging in the field of jet physics. In this experiment, we aim to\nharness the full potential of neural networks while acknowledging that, at\ntimes, we may lose sight of the underlying physics governing these models.\nNevertheless, we demonstrate that we can achieve remarkable results obscuring\nphysics knowledge and relying completely on the model's outcome. We introduce\nJetLOV, a composite comprising two models: a straightforward multilayer\nperceptron (MLP) and the well-established LundNet. Our study reveals that we\ncan attain comparable jet tagging performance without relying on the\npre-computed LundNet variables. Instead, we allow the network to autonomously\nlearn an entirely new set of variables, devoid of a priori knowledge of the\nunderlying physics. These findings hold promise, particularly in addressing the\nissue of model dependence, which can be mitigated through generalization and\ntraining on diverse data sets.\n","authors":["Mauricio A. Diaz","Giorgio Cerro","Jacan Chaplais","Srinandan Dasmahapatra","Stefano Moretti"],"pdf_url":"https://arxiv.org/pdf/2311.14654v1.pdf","comment":"Accepted at the NeurIPS 2023 workshop: Machine Learning and the\n Physical Sciences"},{"id":"http://arxiv.org/abs/2311.14653v1","updated":"2023-11-24T18:37:52Z","published":"2023-11-24T18:37:52Z","title":"Data-driven Prior Learning for Bayesian Optimisation","summary":" Transfer learning for Bayesian optimisation has generally assumed a strong\nsimilarity between optimisation tasks, with at least a subset having similar\noptimal inputs. This assumption can reduce computational costs, but it is\nviolated in a wide range of optimisation problems where transfer learning may\nnonetheless be useful. We replace this assumption with a weaker one only\nrequiring the shape of the optimisation landscape to be similar, and analyse\nthe recent method Prior Learning for Bayesian Optimisation - PLeBO - in this\nsetting. By learning priors for the hyperparameters of the Gaussian process\nsurrogate model we can better approximate the underlying function, especially\nfor few function evaluations. We validate the learned priors and compare to a\nbreadth of transfer learning approaches, using synthetic data and a recent air\npollution optimisation problem as benchmarks. We show that PLeBO and prior\ntransfer find good inputs in fewer evaluations.\n","authors":["Sigrid Passano Hellan","Christopher G. Lucas","Nigel H. 
Goddard"],"pdf_url":"https://arxiv.org/pdf/2311.14653v1.pdf","comment":"To be presented at the NeurIPS 2023 Workshop on Adaptive Experimental\n Design and Active Learning in the Real World"},{"id":"http://arxiv.org/abs/2311.14652v1","updated":"2023-11-24T18:35:00Z","published":"2023-11-24T18:35:00Z","title":"One Pass Streaming Algorithm for Super Long Token Attention\n Approximation in Sublinear Space","summary":" Deploying Large Language Models (LLMs) in streaming applications that involve\nlong contexts, particularly for extended dialogues and text analysis, is of\nparamount importance but presents two significant challenges. Firstly, the\nmemory consumption is substantial during the decoding phase due to the caching\nof Key and Value states (KV) of previous tokens. Secondly, attention\ncomputation is time-consuming with a time complexity of $O(n^2)$ for the\ngeneration of each token. In recent OpenAI DevDay (Nov 6, 2023), OpenAI\nreleased a new model that is able to support a 128K-long document, in our\npaper, we focus on the memory-efficient issue when context length $n$ is much\ngreater than 128K ($n \\gg 2^d$). Considering a single-layer self-attention with\nQuery, Key, and Value matrices $Q, K, V \\in \\mathbb{R}^{n \\times d}$, the\npolynomial method approximates the attention output $T \\in \\mathbb{R}^{n \\times\nd}$. It accomplishes this by constructing $U_1, U_2 \\in \\mathbb{R}^{n \\times\nt}$ to expedite attention ${\\sf Attn}(Q, K, V)$ computation within $n^{1+o(1)}$\ntime executions. Despite this, storing the Key and Value matrices $K, V \\in\n\\mathbb{R}^{n \\times d}$ still necessitates $O( n d)$ space, leading to\nsignificant memory usage. In response to these challenges, we introduce a new\nalgorithm that only reads one pass of the data in streaming fashion. This\nmethod employs sublinear space $o(n)$ to store three sketch matrices,\nalleviating the need for exact $K, V$ storage. Notably, our algorithm exhibits\nexceptional memory-efficient performance with super-long tokens. As the token\nlength $n$ increases, our error guarantee diminishes while the memory usage\nremains nearly constant. This unique attribute underscores the potential of our\ntechnique in efficiently handling LLMs in streaming applications.\n","authors":["Raghav Addanki","Chenyang Li","Zhao Song","Chiwun Yang"],"pdf_url":"https://arxiv.org/pdf/2311.14652v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01225v3","updated":"2023-11-24T18:32:25Z","published":"2023-10-02T14:12:53Z","title":"A path-norm toolkit for modern networks: consequences, promises and\n challenges","summary":" This work introduces the first toolkit around path-norms that is fully able\nto encompass general DAG ReLU networks with biases, skip connections and any\noperation based on the extraction of order statistics: max pooling, GroupSort\netc. 
This toolkit notably allows us to establish generalization bounds for\nmodern neural networks that are not only the most widely applicable path-norm\nbased ones, but also recover or beat the sharpest known bounds of this type.\nThese extended path-norms further enjoy the usual benefits of path-norms: ease\nof computation, invariance under the symmetries of the network, and improved\nsharpness on feedforward networks compared to the product of operators' norms,\nanother complexity measure most commonly used.\n The versatility of the toolkit and its ease of implementation allow us to\nchallenge the concrete promises of path-norm-based generalization bounds, by\nnumerically evaluating the sharpest known bounds for ResNets on ImageNet.\n","authors":["Antoine Gonon","Nicolas Brisebarre","Elisa Riccietti","Rémi Gribonval"],"pdf_url":"https://arxiv.org/pdf/2310.01225v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13180v2","updated":"2023-11-24T18:31:21Z","published":"2023-11-22T06:06:54Z","title":"Provably Efficient High-Dimensional Bandit Learning with Batched\n Feedbacks","summary":" We study high-dimensional multi-armed contextual bandits with batched\nfeedback where the $T$ steps of online interactions are divided into $L$\nbatches. In specific, each batch collects data according to a policy that\ndepends on previous batches and the rewards are revealed only at the end of the\nbatch. Such a feedback structure is popular in applications such as\npersonalized medicine and online advertisement, where the online data often do\nnot arrive in a fully serial manner. We consider high-dimensional and linear\nsettings where the reward function of the bandit model admits either a sparse\nor low-rank structure and ask how small a number of batches are needed for a\ncomparable performance with fully dynamic data in which $L = T$. For these\nsettings, we design a provably sample-efficient algorithm which achieves a $\n\\mathcal{\\tilde O}(s_0^2 \\log^2 T)$ regret in the sparse case and $\n\\mathcal{\\tilde O} ( r ^2 \\log^2 T)$ regret in the low-rank case, using only $L\n= \\mathcal{O}( \\log T)$ batches. Here $s_0$ and $r$ are the sparsity and rank\nof the reward parameter in sparse and low-rank cases, respectively, and $\n\\mathcal{\\tilde O}(\\cdot)$ omits logarithmic factors involving the feature\ndimensions. In other words, our algorithm achieves regret bounds comparable to\nthose in fully sequential setting with only $\\mathcal{O}( \\log T)$ batches. Our\nalgorithm features a novel batch allocation method that adjusts the batch sizes\naccording to the estimation accuracy within each batch and cumulative regret.\nFurthermore, we also conduct experiments with synthetic and real-world data to\nvalidate our theory.\n","authors":["Jianqing Fan","Zhaoran Wang","Zhuoran Yang","Chenlu Ye"],"pdf_url":"https://arxiv.org/pdf/2311.13180v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14649v1","updated":"2023-11-24T18:31:11Z","published":"2023-11-24T18:31:11Z","title":"Learning in Deep Factor Graphs with Gaussian Belief Propagation","summary":" We propose an approach to do learning in Gaussian factor graphs. We treat all\nrelevant quantities (inputs, outputs, parameters, latents) as random variables\nin a graphical model, and view both training and prediction as inference\nproblems with different observed nodes. 
Our experiments show that these\nproblems can be efficiently solved with belief propagation (BP), whose updates\nare inherently local, presenting exciting opportunities for distributed and\nasynchronous training. Our approach can be scaled to deep networks and provides\na natural means to do continual learning: use the BP-estimated parameter\nmarginals of the current task as parameter priors for the next. On a video\ndenoising task we demonstrate the benefit of learnable parameters over a\nclassical factor graph approach and we show encouraging performance of deep\nfactor graphs for continual image classification on MNIST.\n","authors":["Seth Nabarro","Mark van der Wilk","Andrew J Davison"],"pdf_url":"https://arxiv.org/pdf/2311.14649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14646v1","updated":"2023-11-24T18:27:41Z","published":"2023-11-24T18:27:41Z","title":"More is Better in Modern Machine Learning: when Infinite\n Overparameterization is Optimal and Overfitting is Obligatory","summary":" In our era of enormous neural networks, empirical progress has been driven by\nthe philosophy that more is better. Recent deep learning practice has found\nrepeatedly that larger model size, more data, and more computation (resulting\nin lower training loss) improves performance. In this paper, we give\ntheoretical backing to these empirical observations by showing that these three\nproperties hold in random feature (RF) regression, a class of models equivalent\nto shallow networks with only the last layer trained.\n Concretely, we first show that the test risk of RF regression decreases\nmonotonically with both the number of features and the number of samples,\nprovided the ridge penalty is tuned optimally. In particular, this implies that\ninfinite width RF architectures are preferable to those of any finite width. We\nthen proceed to demonstrate that, for a large class of tasks characterized by\npowerlaw eigenstructure, training to near-zero training loss is obligatory:\nnear-optimal performance can only be achieved when the training error is much\nsmaller than the test error. Grounding our theory in real-world data, we find\nempirically that standard computer vision tasks with convolutional neural\ntangent kernels clearly fall into this class. Taken together, our results tell\na simple, testable story of the benefits of overparameterization, overfitting,\nand more data in random feature models.\n","authors":["James B. Simon","Dhruva Karkada","Nikhil Ghosh","Mikhail Belkin"],"pdf_url":"https://arxiv.org/pdf/2311.14646v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14645v1","updated":"2023-11-24T18:27:26Z","published":"2023-11-24T18:27:26Z","title":"A General Framework for User-Guided Bayesian Optimization","summary":" The optimization of expensive-to-evaluate black-box functions is prevalent in\nvarious scientific disciplines. Bayesian optimization is an automatic, general\nand sample-efficient method to solve these problems with minimal knowledge of\nthe underlying function dynamics. However, the ability of Bayesian optimization\nto incorporate prior knowledge or beliefs about the function at hand in order\nto accelerate the optimization is limited, which reduces its appeal for\nknowledgeable practitioners with tight budgets. 
To allow domain experts to\ncustomize the optimization routine, we propose ColaBO, the first\nBayesian-principled framework for incorporating prior beliefs beyond the\ntypical kernel structure, such as the likely location of the optimizer or the\noptimal value. The generality of ColaBO makes it applicable across different\nMonte Carlo acquisition functions and types of user beliefs. We empirically\ndemonstrate ColaBO's ability to substantially accelerate optimization when the\nprior information is accurate, and to retain approximately default performance\nwhen it is misleading.\n","authors":["Carl Hvarfner","Frank Hutter","Luigi Nardi"],"pdf_url":"https://arxiv.org/pdf/2311.14645v1.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2310.01769v3","updated":"2023-11-24T18:08:25Z","published":"2023-10-03T03:34:22Z","title":"How Over-Parameterization Slows Down Gradient Descent in Matrix Sensing:\n The Curses of Symmetry and Initialization","summary":" This paper rigorously shows how over-parameterization changes the convergence\nbehaviors of gradient descent (GD) for the matrix sensing problem, where the\ngoal is to recover an unknown low-rank ground-truth matrix from near-isotropic\nlinear measurements. First, we consider the symmetric setting with the\nsymmetric parameterization where $M^* \\in \\mathbb{R}^{n \\times n}$ is a\npositive semi-definite unknown matrix of rank $r \\ll n$, and one uses a\nsymmetric parameterization $XX^\\top$ to learn $M^*$. Here $X \\in \\mathbb{R}^{n\n\\times k}$ with $k > r$ is the factor matrix. We give a novel $\\Omega (1/T^2)$\nlower bound of randomly initialized GD for the over-parameterized case ($k >r$)\nwhere $T$ is the number of iterations. This is in stark contrast to the\nexact-parameterization scenario ($k=r$) where the convergence rate is $\\exp\n(-\\Omega (T))$. Next, we study asymmetric setting where $M^* \\in\n\\mathbb{R}^{n_1 \\times n_2}$ is the unknown matrix of rank $r \\ll\n\\min\\{n_1,n_2\\}$, and one uses an asymmetric parameterization $FG^\\top$ to\nlearn $M^*$ where $F \\in \\mathbb{R}^{n_1 \\times k}$ and $G \\in \\mathbb{R}^{n_2\n\\times k}$. Building on prior work, we give a global exact convergence result\nof randomly initialized GD for the exact-parameterization case ($k=r$) with an\n$\\exp (-\\Omega(T))$ rate. Furthermore, we give the first global exact\nconvergence result for the over-parameterization case ($k>r$) with an\n$\\exp(-\\Omega(\\alpha^2 T))$ rate where $\\alpha$ is the initialization scale.\nThis linear convergence result in the over-parameterization case is especially\nsignificant because one can apply the asymmetric parameterization to the\nsymmetric setting to speed up from $\\Omega (1/T^2)$ to linear convergence. On\nthe other hand, we propose a novel method that only modifies one step of GD and\nobtains a convergence rate independent of $\\alpha$, recovering the rate in the\nexact-parameterization case.\n","authors":["Nuoya Xiong","Lijun Ding","Simon S. 
Du"],"pdf_url":"https://arxiv.org/pdf/2310.01769v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14632v1","updated":"2023-11-24T17:56:44Z","published":"2023-11-24T17:56:44Z","title":"Differentially Private SGD Without Clipping Bias: An Error-Feedback\n Approach","summary":" Differentially Private Stochastic Gradient Descent with gradient clipping\n(DPSGD-GC) is a powerful tool for training deep learning models using sensitive\ndata, providing both a solid theoretical privacy guarantee and high efficiency.\nHowever, using DPSGD-GC to ensure Differential Privacy (DP) comes at the cost\nof model performance degradation due to DP noise injection and gradient\nclipping. Existing research has extensively analyzed the theoretical\nconvergence of DPSGD-GC, and has shown that it only converges when using large\nclipping thresholds that are dependent on problem-specific parameters.\nUnfortunately, these parameters are often unknown in practice, making it hard\nto choose the optimal clipping threshold. Therefore, in practice, DPSGD-GC\nsuffers from degraded performance due to the {\\it constant} bias introduced by\nthe clipping.\n In our work, we propose a new error-feedback (EF) DP algorithm as an\nalternative to DPSGD-GC, which not only offers a diminishing utility bound\nwithout inducing a constant clipping bias, but more importantly, it allows for\nan arbitrary choice of clipping threshold that is independent of the problem.\nWe establish an algorithm-specific DP analysis for our proposed algorithm,\nproviding privacy guarantees based on R{\\'e}nyi DP. Additionally, we\ndemonstrate that under mild conditions, our algorithm can achieve nearly the\nsame utility bound as DPSGD without gradient clipping. Our empirical results on\nCifar-10/100 and E2E datasets, show that the proposed algorithm achieves higher\naccuracies than DPSGD while maintaining the same level of DP guarantee.\n","authors":["Xinwei Zhang","Zhiqi Bu","Zhiwei Steven Wu","Mingyi Hong"],"pdf_url":"https://arxiv.org/pdf/2311.14632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02428v2","updated":"2023-11-24T17:26:56Z","published":"2023-10-03T20:49:00Z","title":"EGraFFBench: Evaluation of Equivariant Graph Neural Network Force Fields\n for Atomistic Simulations","summary":" Equivariant graph neural networks force fields (EGraFFs) have shown great\npromise in modelling complex interactions in atomic systems by exploiting the\ngraphs' inherent symmetries. Recent works have led to a surge in the\ndevelopment of novel architectures that incorporate equivariance-based\ninductive biases alongside architectural innovations like graph transformers\nand message passing to model atomic interactions. However, thorough evaluations\nof these deploying EGraFFs for the downstream task of real-world atomistic\nsimulations, is lacking. To this end, here we perform a systematic benchmarking\nof 6 EGraFF algorithms (NequIP, Allegro, BOTNet, MACE, Equiformer, TorchMDNet),\nwith the aim of understanding their capabilities and limitations for realistic\natomistic simulations. In addition to our thorough evaluation and analysis on\neight existing datasets based on the benchmarking literature, we release two\nnew benchmark datasets, propose four new metrics, and three challenging tasks.\nThe new datasets and tasks evaluate the performance of EGraFF to\nout-of-distribution data, in terms of different crystal structures,\ntemperatures, and new molecules. 
Interestingly, evaluation of the EGraFF models\nbased on dynamic simulations reveals that having a lower error on energy or\nforce does not guarantee stable or reliable simulation or faithful replication\nof the atomic structures. Moreover, we find that no model clearly outperforms\nother models on all datasets and tasks. Importantly, we show that the\nperformance of all the models on out-of-distribution datasets is unreliable,\npointing to the need for the development of a foundation model for force fields\nthat can be used in real-world simulations. In summary, this work establishes a\nrigorous framework for evaluating machine learning force fields in the context\nof atomic simulations and points to open research challenges within this\ndomain.\n","authors":["Vaibhav Bihani","Utkarsh Pratiush","Sajid Mannan","Tao Du","Zhimin Chen","Santiago Miret","Matthieu Micoulaut","Morten M Smedskjaer","Sayan Ranu","N M Anoop Krishnan"],"pdf_url":"https://arxiv.org/pdf/2310.02428v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.11954v3","updated":"2023-11-24T17:12:51Z","published":"2022-02-24T08:18:25Z","title":"XAutoML: A Visual Analytics Tool for Understanding and Validating\n Automated Machine Learning","summary":" In the last ten years, various automated machine learning (AutoML) systems\nhave been proposed to build end-to-end machine learning (ML) pipelines with\nminimal human interaction. Even though such automatically synthesized ML\npipelines are able to achieve competitive performance, recent studies have\nshown that users do not trust models constructed by AutoML due to missing\ntransparency of AutoML systems and missing explanations for the constructed ML\npipelines. In a requirements analysis study with 36 domain experts, data\nscientists, and AutoML researchers from different professions with vastly\ndifferent expertise in ML, we collect detailed informational needs for AutoML.\nWe propose XAutoML, an interactive visual analytics tool for explaining\narbitrary AutoML optimization procedures and ML pipelines constructed by\nAutoML. XAutoML combines interactive visualizations with established techniques\nfrom explainable artificial intelligence (XAI) to make the complete AutoML\nprocedure transparent and explainable. By integrating XAutoML with JupyterLab,\nexperienced users can extend the visual analytics with ad-hoc visualizations\nbased on information extracted from XAutoML. We validate our approach in a user\nstudy with the same diverse user group from the requirements analysis. All\nparticipants were able to extract useful information from XAutoML, leading to a\nsignificantly increased understanding of ML pipelines produced by AutoML and\nthe AutoML optimization itself.\n","authors":["Marc-André Zöller","Waldemar Titov","Thomas Schlegel","Marco F. Huber"],"pdf_url":"https://arxiv.org/pdf/2202.11954v3.pdf","comment":"Revised version accepted at ACM TiiS Special Issue on Human-centered\n Explainable AI"},{"id":"http://arxiv.org/abs/2311.14609v1","updated":"2023-11-24T17:04:21Z","published":"2023-11-24T17:04:21Z","title":"Analysis of the expected $L_2$ error of an over-parametrized deep neural\n network estimate learned by gradient descent without regularization","summary":" Recent results show that estimates defined by over-parametrized deep neural\nnetworks learned by applying gradient descent to a regularized empirical $L_2$\nrisk are universally consistent and achieve good rates of convergence. 
In this\npaper, we show that the regularization term is not necessary to obtain similar\nresults. In the case of a suitably chosen initialization of the network, a\nsuitable number of gradient descent steps, and a suitable step size we show\nthat an estimate without a regularization term is universally consistent for\nbounded predictor variables. Additionally, we show that if the regression\nfunction is H\\\"older smooth with H\\\"older exponent $1/2 \\leq p \\leq 1$, the\n$L_2$ error converges to zero with a convergence rate of approximately\n$n^{-1/(1+d)}$. Furthermore, in case of an interaction model, where the\nregression function consists of a sum of H\\\"older smooth functions with $d^*$\ncomponents, a rate of convergence is derived which does not depend on the input\ndimension $d$.\n","authors":["Selina Drews","Michael Kohler"],"pdf_url":"https://arxiv.org/pdf/2311.14609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14601v1","updated":"2023-11-24T16:43:17Z","published":"2023-11-24T16:43:17Z","title":"A Metalearned Neural Circuit for Nonparametric Bayesian Inference","summary":" Most applications of machine learning to classification assume a closed set\nof balanced classes. This is at odds with the real world, where class\noccurrence statistics often follow a long-tailed power-law distribution and it\nis unlikely that all classes are seen in a single sample. Nonparametric\nBayesian models naturally capture this phenomenon, but have significant\npractical barriers to widespread adoption, namely implementation complexity and\ncomputational inefficiency. To address this, we present a method for extracting\nthe inductive bias from a nonparametric Bayesian model and transferring it to\nan artificial neural network. By simulating data with a nonparametric Bayesian\nprior, we can metalearn a sequence model that performs inference over an\nunlimited set of classes. After training, this \"neural circuit\" has distilled\nthe corresponding inductive bias and can successfully perform sequential\ninference over an open set of classes. Our experimental results show that the\nmetalearned neural circuit achieves comparable or better performance than\nparticle filter-based methods for inference in these models while being faster\nand simpler to use than methods that explicitly incorporate Bayesian\nnonparametric inference.\n","authors":["Jake C. Snell","Gianluca Bencomo","Thomas L. Griffiths"],"pdf_url":"https://arxiv.org/pdf/2311.14601v1.pdf","comment":"13 pages, 3 figures. Code available at\n https://github.com/jakesnell/neural-circuits"},{"id":"http://arxiv.org/abs/2211.00539v3","updated":"2023-11-24T16:27:22Z","published":"2022-11-01T15:43:29Z","title":"Dungeons and Data: A Large-Scale NetHack Dataset","summary":" Recent breakthroughs in the development of agents to solve challenging\nsequential decision making problems such as Go, StarCraft, or DOTA, have relied\non both simulated environments and large-scale datasets. However, progress on\nthis research has been hindered by the scarcity of open-sourced datasets and\nthe prohibitive computational cost to work with them. Here we present the\nNetHack Learning Dataset (NLD), a large and highly-scalable dataset of\ntrajectories from the popular game of NetHack, which is both extremely\nchallenging for current methods and very fast to run. 
NLD consists of three\nparts: 10 billion state transitions from 1.5 million human trajectories\ncollected on the NAO public NetHack server from 2009 to 2020; 3 billion\nstate-action-score transitions from 100,000 trajectories collected from the\nsymbolic bot winner of the NetHack Challenge 2021; and, accompanying code for\nusers to record, load and stream any collection of such trajectories in a\nhighly compressed form. We evaluate a wide range of existing algorithms\nincluding online and offline RL, as well as learning from demonstrations,\nshowing that significant research advances are needed to fully leverage\nlarge-scale datasets for challenging sequential decision making tasks.\n","authors":["Eric Hambro","Roberta Raileanu","Danielle Rothermel","Vegard Mella","Tim Rocktäschel","Heinrich Küttler","Naila Murray"],"pdf_url":"https://arxiv.org/pdf/2211.00539v3.pdf","comment":"9 pages, published in the Proceedings of the 36th Conference on\n Neural Information Processing Systems (NeurIPS 2022) Track on Datasets and\n Benchmarks. New links to hosting location. Revised results, same conclusions"},{"id":"http://arxiv.org/abs/2304.00933v2","updated":"2023-11-24T16:24:33Z","published":"2023-04-03T12:45:52Z","title":"Knowledge Accumulation in Continually Learned Representations and the\n Issue of Feature Forgetting","summary":" While it is established that neural networks suffer from catastrophic\nforgetting ``at the output level'', it is debated whether this is also the case\nat the level of representations. Some studies ascribe a certain level of innate\nrobustness to representations, that they only forget minimally and no critical\ninformation, while others claim that representations are also severely affected\nby forgetting. To settle this debate, we first discuss how this apparent\ndisagreement might stem from the coexistence of two phenomena that affect the\nquality of continually learned representations: knowledge accumulation and\nfeature forgetting. We then show that, even though it is true that feature\nforgetting can be small in absolute terms, newly learned information is\nforgotten just as catastrophically at the level of representations as it is at\nthe output level. Next we show that this feature forgetting is problematic as\nit substantially slows down knowledge accumulation. We further show that\nrepresentations that are continually learned through both supervised and\nself-supervised learning suffer from feature forgetting. Finally, we study how\nfeature forgetting and knowledge accumulation are affected by different types\nof continual learning methods.\n","authors":["Timm Hess","Eli Verwimp","Gido M. van de Ven","Tinne Tuytelaars"],"pdf_url":"https://arxiv.org/pdf/2304.00933v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14581v1","updated":"2023-11-24T16:12:43Z","published":"2023-11-24T16:12:43Z","title":"Example-Based Explanations of Random Forest Predictions","summary":" A random forest prediction can be computed by the scalar product of the\nlabels of the training examples and a set of weights that are determined by the\nleafs of the forest into which the test object falls; each prediction can hence\nbe explained exactly by the set of training examples for which the weights are\nnon-zero. The number of examples used in such explanations is shown to vary\nwith the dimensionality of the training set and hyperparameters of the random\nforest algorithm. 
This means that the number of examples involved in each\nprediction can to some extent be controlled by varying these parameters.\nHowever, for settings that lead to a required predictive performance, the\nnumber of examples involved in each prediction may be unreasonably large,\npreventing the user to grasp the explanations. In order to provide more useful\nexplanations, a modified prediction procedure is proposed, which includes only\nthe top-weighted examples. An investigation on regression and classification\ntasks shows that the number of examples used in each explanation can be\nsubstantially reduced while maintaining, or even improving, predictive\nperformance compared to the standard prediction procedure.\n","authors":["Henrik Boström"],"pdf_url":"https://arxiv.org/pdf/2311.14581v1.pdf","comment":"Submitted to 22nd International Symposium on Intelligent Data\n Analysis, IDA 2024"},{"id":"http://arxiv.org/abs/2309.17296v2","updated":"2023-11-24T16:08:38Z","published":"2023-09-29T14:53:05Z","title":"Navigating the Design Space of Equivariant Diffusion-Based Generative\n Models for De Novo 3D Molecule Generation","summary":" Deep generative diffusion models are a promising avenue for 3D de novo\nmolecular design in materials science and drug discovery. However, their\nutility is still limited by suboptimal performance on large molecular\nstructures and limited training data. To address this gap, we explore the\ndesign space of E(3)-equivariant diffusion models, focusing on previously\nunexplored areas. Our extensive comparative analysis evaluates the interplay\nbetween continuous and discrete state spaces. From this investigation, we\npresent the EQGAT-diff model, which consistently outperforms established models\nfor the QM9 and GEOM-Drugs datasets. Significantly, EQGAT-diff takes continuous\natom positions, while chemical elements and bond types are categorical and uses\ntime-dependent loss weighting, substantially increasing training convergence,\nthe quality of generated samples, and inference time. We also showcase that\nincluding chemically motivated additional features like hybridization states in\nthe diffusion process enhances the validity of generated molecules. To further\nstrengthen the applicability of diffusion models to limited training data, we\ninvestigate the transferability of EQGAT-diff trained on the large PubChem3D\ndataset with implicit hydrogen atoms to target different data distributions.\nFine-tuning EQGAT-diff for just a few iterations shows an efficient\ndistribution shift, further improving performance throughout data sets.\nFinally, we test our model on the Crossdocked data set for structure-based de\nnovo ligand generation, underlining the importance of our findings showing\nstate-of-the-art performance on Vina docking scores.\n","authors":["Tuan Le","Julian Cremer","Frank Noé","Djork-Arné Clevert","Kristof Schütt"],"pdf_url":"https://arxiv.org/pdf/2309.17296v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14577v1","updated":"2023-11-24T16:07:35Z","published":"2023-11-24T16:07:35Z","title":"Predicting Failure of P2P Lending Platforms through Machine Learning:\n The Case in China","summary":" This study employs machine learning models to predict the failure of\nPeer-to-Peer (P2P) lending platforms, specifically in China. 
By employing the\nfilter method and wrapper method with forward selection and backward\nelimination, we establish a rigorous and practical procedure that ensures the\nrobustness and importance of variables in predicting platform failures. The\nresearch identifies a set of robust variables that consistently appear in the\nfeature subsets across different selection methods and models, suggesting their\nreliability and relevance in predicting platform failures. The study highlights\nthat reducing the number of variables in the feature subset leads to an\nincrease in the false acceptance rate while the performance metrics remain\nstable, with an AUC value of approximately 0.96 and an F1 score of around 0.88.\nThe findings of this research provide significant practical implications for\nregulatory authorities and investors operating in the Chinese P2P lending\nindustry.\n","authors":["Jen-Yin Yeh","Hsin-Yu Chiu","Jhih-Huei Huang"],"pdf_url":"https://arxiv.org/pdf/2311.14577v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09968v2","updated":"2023-11-24T15:33:30Z","published":"2023-09-18T17:49:09Z","title":"Generating and Imputing Tabular Data via Diffusion and Flow-based\n Gradient-Boosted Trees","summary":" Tabular data is hard to acquire and is subject to missing values. This paper\nproposes a novel approach to generate and impute mixed-type (continuous and\ncategorical) tabular data using score-based diffusion and conditional flow\nmatching. Contrary to previous work that relies on neural networks to learn the\nscore function or the vector field, we instead rely on XGBoost, a popular\nGradient-Boosted Tree (GBT) method. We empirically show on 27 different\ndatasets that our approach i) generates highly realistic synthetic data when\nthe training dataset is either clean or tainted by missing data and ii)\ngenerates diverse plausible data imputations. Furthermore, our method\noutperforms deep-learning generation methods on data generation and is\ncompetitive on data imputation. Finally, it can be trained in parallel using\nCPUs without the need for a GPU. To make it easily accessible, we release our\ncode through a Python library and an R package.\n","authors":["Alexia Jolicoeur-Martineau","Kilian Fatras","Tal Kachman"],"pdf_url":"https://arxiv.org/pdf/2309.09968v2.pdf","comment":"Code: https://github.com/SamsungSAILMontreal/ForestDiffusion"},{"id":"http://arxiv.org/abs/2311.14549v1","updated":"2023-11-24T15:31:26Z","published":"2023-11-24T15:31:26Z","title":"FRUITS: Feature Extraction Using Iterated Sums for Time Series\n Classification","summary":" We introduce a pipeline for time series classification that extracts features\nbased on the iterated-sums signature (ISS) and then applies a linear\nclassifier. These features are intrinsically nonlinear, capture chronological\ninformation, and, under certain settings, are invariant to time-warping. We are\ncompetitive with state-of-the-art methods on the UCR archive, both in terms of\naccuracy and speed. We make our code available at\n\\url{https://github.com/irkri/fruits}.\n","authors":["Joscha Diehl","Richard Krieg"],"pdf_url":"https://arxiv.org/pdf/2311.14549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.14460v3","updated":"2023-11-24T15:25:26Z","published":"2023-02-28T10:08:11Z","title":"Interpretable and intervenable ultrasonography-based machine learning\n models for pediatric appendicitis","summary":" Appendicitis is among the most frequent reasons for pediatric abdominal\nsurgeries. 
Previous decision support systems for appendicitis have focused on\nclinical, laboratory, scoring, and computed tomography data and have ignored\nabdominal ultrasound, despite its noninvasive nature and widespread\navailability. In this work, we present interpretable machine learning models\nfor predicting the diagnosis, management and severity of suspected appendicitis\nusing ultrasound images. Our approach utilizes concept bottleneck models (CBM)\nthat facilitate interpretation and interaction with high-level concepts\nunderstandable to clinicians. Furthermore, we extend CBMs to prediction\nproblems with multiple views and incomplete concept sets. Our models were\ntrained on a dataset comprising 579 pediatric patients with 1709 ultrasound\nimages accompanied by clinical and laboratory data. Results show that our\nproposed method enables clinicians to utilize a human-understandable and\nintervenable predictive model without compromising performance or requiring\ntime-consuming image annotation when deployed. For predicting the diagnosis,\nthe extended multiview CBM attained an AUROC of 0.80 and an AUPR of 0.92,\nperforming comparably to similar black-box neural networks trained and tested\non the same dataset.\n","authors":["Ričards Marcinkevičs","Patricia Reis Wolfertstetter","Ugne Klimiene","Kieran Chin-Cheong","Alyssia Paschke","Julia Zerres","Markus Denzinger","David Niederberger","Sven Wellmann","Ece Ozkan","Christian Knorr","Julia E. Vogt"],"pdf_url":"https://arxiv.org/pdf/2302.14460v3.pdf","comment":"Published in Medical Image Analysis (Elsevier)"},{"id":"http://arxiv.org/abs/2201.00292v4","updated":"2023-11-24T15:06:36Z","published":"2022-01-02T05:05:26Z","title":"Fair Data Representation for Machine Learning at the Pareto Frontier","summary":" As machine learning powered decision-making becomes increasingly important in\nour daily lives, it is imperative to strive for fairness in the underlying data\nprocessing. We propose a pre-processing algorithm for fair data representation\nvia which supervised learning results in estimations of the Pareto frontier\nbetween prediction error and statistical disparity. Particularly, the present\nwork applies the optimal affine transport to approach the post-processing\nWasserstein-2 barycenter characterization of the optimal fair $L^2$-objective\nsupervised learning via a pre-processing data deformation. 
Furthermore, we show\nthat the Wasserstein-2 geodesics from the conditional (on sensitive\ninformation) distributions of the learning outcome to their barycenter\ncharacterizes the Pareto frontier between $L^2$-loss and the average pairwise\nWasserstein-2 distance among sensitive groups on the learning outcome.\nNumerical simulations underscore the advantages: (1) the pre-processing step is\ncompositive with arbitrary conditional expectation estimation supervised\nlearning methods and unseen data; (2) the fair representation protects the\nsensitive information by limiting the inference capability of the remaining\ndata with respect to the sensitive data; (3) the optimal affine maps are\ncomputationally efficient even for high-dimensional data.\n","authors":["Shizhou Xu","Thomas Strohmer"],"pdf_url":"https://arxiv.org/pdf/2201.00292v4.pdf","comment":"63 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.14534v1","updated":"2023-11-24T15:03:55Z","published":"2023-11-24T15:03:55Z","title":"Finding Foundation Models for Time Series Classification with a PreText\n Task","summary":" Over the past decade, Time Series Classification (TSC) has gained an\nincreasing attention. While various methods were explored, deep learning -\nparticularly through Convolutional Neural Networks (CNNs)-stands out as an\neffective approach. However, due to the limited availability of training data,\ndefining a foundation model for TSC that overcomes the overfitting problem is\nstill a challenging task. The UCR archive, encompassing a wide spectrum of\ndatasets ranging from motion recognition to ECG-based heart disease detection,\nserves as a prime example for exploring this issue in diverse TSC scenarios. In\nthis paper, we address the overfitting challenge by introducing pre-trained\ndomain foundation models. A key aspect of our methodology is a novel pretext\ntask that spans multiple datasets. This task is designed to identify the\noriginating dataset of each time series sample, with the goal of creating\nflexible convolution filters that can be applied across different datasets. The\nresearch process consists of two phases: a pre-training phase where the model\nacquires general features through the pretext task, and a subsequent\nfine-tuning phase for specific dataset classifications. Our extensive\nexperiments on the UCR archive demonstrate that this pre-training strategy\nsignificantly outperforms the conventional training approach without\npre-training. This strategy effectively reduces overfitting in small datasets\nand provides an efficient route for adapting these models to new datasets, thus\nadvancing the capabilities of deep learning in TSC.\n","authors":["Ali Ismail-Fawaz","Maxime Devanne","Stefano Berretti","Jonathan Weber","Germain Forestier"],"pdf_url":"https://arxiv.org/pdf/2311.14534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10359v2","updated":"2023-11-24T14:59:37Z","published":"2023-11-17T07:25:18Z","title":"FIKIT: Priority-Based Real-time GPU Multi-tasking Scheduling with Kernel\n Identification","summary":" Highly parallelized workloads like machine learning training, inferences and\ngeneral HPC tasks are greatly accelerated using GPU devices. In a cloud\ncomputing cluster, serving a GPU's computation power through multi-tasks\nsharing is highly demanded since there are always more task requests than the\nnumber of GPU available. Existing GPU sharing solutions focus on reducing\ntask-level waiting time or task-level switching costs when multiple jobs\ncompeting for a single GPU. 
Non-stopped computation requests come with\ndifferent priorities, having non-symmetric impact on QoS for sharing a GPU\ndevice. Existing work missed the kernel-level optimization opportunity brought\nby this setting. To address this problem, we present a novel kernel-level\nscheduling strategy called FIKIT: Filling Inter-kernel Idle Time. FIKIT\nincorporates task-level priority information, fine-grained kernel\nidentification, and kernel measurement, allowing low priorities task's\nexecution during high priority task's inter-kernel idle time. Thereby, filling\nthe GPU's device runtime fully, and reduce overall GPU sharing impact to cloud\nservices. Across a set of ML models, the FIKIT based inference system\naccelerated high priority tasks by 1.33 to 14.87 times compared to the JCT in\nGPU sharing mode, and more than half of the cases are accelerated by more than\n3.5 times. Alternatively, under preemptive sharing, the low-priority tasks have\na comparable to default GPU sharing mode JCT, with a 0.84 to 1 times ratio. We\nfurther limit the kernel measurement and runtime fine-grained kernel scheduling\noverhead to less than 10%.\n","authors":["Wenqing Wu"],"pdf_url":"https://arxiv.org/pdf/2311.10359v2.pdf","comment":"19 pages, 18 figures. Shorten the introduction section; Move some\n content from the introduction to the design section; Add Dataset References"},{"id":"http://arxiv.org/abs/2311.14533v1","updated":"2023-11-24T14:56:36Z","published":"2023-11-24T14:56:36Z","title":"Comparing Feature Engineering and End-to-End Deep Learning for Autism\n Spectrum Disorder Assessment based on Fullbody-Tracking","summary":" Autism Spectrum Disorder (ASD) is characterized by challenges in social\ncommunication and restricted patterns, with motor abnormalities gaining\ntraction for early detection. However, kinematic analysis in ASD is limited,\noften lacking robust validation and relying on hand-crafted features for single\ntasks, leading to inconsistencies across studies. Thus, end-to-end models have\nbecome promising methods to overcome the need for feature engineering. Our aim\nis to assess both approaches across various kinematic tasks to measure the\nefficacy of commonly used features in ASD assessment, while comparing them to\nend-to-end models. Specifically, we developed a virtual reality environment\nwith multiple motor tasks and trained models using both classification\napproaches. We prioritized a reliable validation framework with repeated\ncross-validation. Our comparative analysis revealed that hand-crafted features\noutperformed our deep learning approach in specific tasks, achieving a\nstate-of-the-art area under the curve (AUC) of 0.90$\\pm$0.06. Conversely,\nend-to-end models provided more consistent results with less variability across\nall VR tasks, demonstrating domain generalization and reliability, with a\nmaximum task AUC of 0.89$\\pm$0.06. These findings show that end-to-end models\nenable less variable and context-independent ASD assessments without requiring\ndomain knowledge or task specificity. 
However, they also recognize the\neffectiveness of hand-crafted features in specific task scenarios.\n","authors":["Alberto Altozano","Maria Eleonora Minissi","Mariano Alcañiz","Javier Marín-Morales"],"pdf_url":"https://arxiv.org/pdf/2311.14533v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2308.07741v3","updated":"2023-11-24T14:53:50Z","published":"2023-08-15T12:40:56Z","title":"Real Robot Challenge 2022: Learning Dexterous Manipulation from Offline\n Data in the Real World","summary":" Experimentation on real robots is demanding in terms of time and costs. For\nthis reason, a large part of the reinforcement learning (RL) community uses\nsimulators to develop and benchmark algorithms. However, insights gained in\nsimulation do not necessarily translate to real robots, in particular for tasks\ninvolving complex interactions with the environment. The Real Robot Challenge\n2022 therefore served as a bridge between the RL and robotics communities by\nallowing participants to experiment remotely with a real robot - as easily as\nin simulation.\n In the last years, offline reinforcement learning has matured into a\npromising paradigm for learning from pre-collected datasets, alleviating the\nreliance on expensive online interactions. We therefore asked the participants\nto learn two dexterous manipulation tasks involving pushing, grasping, and\nin-hand orientation from provided real-robot datasets. An extensive software\ndocumentation and an initial stage based on a simulation of the real set-up\nmade the competition particularly accessible. By giving each team plenty of\naccess budget to evaluate their offline-learned policies on a cluster of seven\nidentical real TriFinger platforms, we organized an exciting competition for\nmachine learners and roboticists alike.\n In this work we state the rules of the competition, present the methods used\nby the winning teams and compare their results with a benchmark of\nstate-of-the-art offline RL algorithms on the challenge datasets.\n","authors":["Nico Gürtler","Felix Widmaier","Cansu Sancaktar","Sebastian Blaes","Pavel Kolev","Stefan Bauer","Manuel Wüthrich","Markus Wulfmeier","Martin Riedmiller","Arthur Allshire","Qiang Wang","Robert McCarthy","Hangyeol Kim","Jongchan Baek","Wookyong Kwon","Shanliang Qian","Yasunori Toshimitsu","Mike Yan Michelis","Amirhossein Kazemipour","Arman Raayatsanati","Hehui Zheng","Barnabas Gavin Cangan","Bernhard Schölkopf","Georg Martius"],"pdf_url":"https://arxiv.org/pdf/2308.07741v3.pdf","comment":"Typo in author list fixed"},{"id":"http://arxiv.org/abs/2308.08467v2","updated":"2023-11-24T14:52:05Z","published":"2023-08-16T16:15:47Z","title":"On Neural Quantum Support Vector Machines","summary":" In \\cite{simon2023algorithms} we introduced four algorithms for the training\nof neural support vector machines (NSVMs) and demonstrated their feasibility.\nIn this note we introduce neural quantum support vector machines, that is,\nNSVMs with a quantum kernel, and extend our results to this setting.\n","authors":["Lars Simon","Manuel Radons"],"pdf_url":"https://arxiv.org/pdf/2308.08467v2.pdf","comment":"16 pages, 1 figure. 
arXiv admin note: substantial text overlap with\n arXiv:2308.07204"},{"id":"http://arxiv.org/abs/2311.14517v1","updated":"2023-11-24T14:45:53Z","published":"2023-11-24T14:45:53Z","title":"tinyCLAP: Distilling Constrastive Language-Audio Pretrained Models","summary":" Contrastive Language-Audio Pretraining (CLAP) became of crucial importance in\nthe field of audio and speech processing. Its employment ranges from sound\nevent detection to text-to-audio generation. However, one of the main\nlimitations is the considerable amount of data required in the training process\nand the overall computational complexity during inference. This paper\ninvestigates how we can reduce the complexity of contrastive language-audio\npre-trained models, yielding an efficient model that we call tinyCLAP. We\nderive an unimodal distillation loss from first principles and explore how the\ndimensionality of the shared, multimodal latent space can be reduced via\npruning. TinyCLAP uses only 6% of the original Microsoft CLAP parameters with a\nminimal reduction (less than 5%) in zero-shot classification performance across\nthe three sound event detection datasets on which it was tested\n","authors":["Francesco Paissan","Elisabetta Farella"],"pdf_url":"https://arxiv.org/pdf/2311.14517v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02679v2","updated":"2023-11-24T14:25:58Z","published":"2023-11-05T15:32:37Z","title":"Regret Analysis of Learning-Based Linear Quadratic Gaussian Control with\n Additive Exploration","summary":" In this paper, we analyze the regret incurred by a computationally efficient\nexploration strategy, known as naive exploration, for controlling unknown\npartially observable systems within the Linear Quadratic Gaussian (LQG)\nframework. We introduce a two-phase control algorithm called LQG-NAIVE, which\ninvolves an initial phase of injecting Gaussian input signals to obtain a\nsystem model, followed by a second phase of an interplay between naive\nexploration and control in an episodic fashion. We show that LQG-NAIVE achieves\na regret growth rate of $\\tilde{\\mathcal{O}}(\\sqrt{T})$, i.e.,\n$\\mathcal{O}(\\sqrt{T})$ up to logarithmic factors after $T$ time steps, and we\nvalidate its performance through numerical simulations. Additionally, we\npropose LQG-IF2E, which extends the exploration signal to a `closed-loop'\nsetting by incorporating the Fisher Information Matrix (FIM). We provide\ncompelling numerical evidence of the competitive performance of LQG-IF2E\ncompared to LQG-NAIVE.\n","authors":["Archith Athrey","Othmane Mazhar","Meichen Guo","Bart De Schutter","Shengling Shi"],"pdf_url":"https://arxiv.org/pdf/2311.02679v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14495v1","updated":"2023-11-24T14:08:31Z","published":"2023-11-24T14:08:31Z","title":"StableSSM: Alleviating the Curse of Memory in State-space Models through\n Stable Reparameterization","summary":" In this paper, we investigate the long-term memory learning capabilities of\nstate-space models (SSMs) from the perspective of parameterization. We prove\nthat state-space models without any reparameterization exhibit a memory\nlimitation similar to that of traditional RNNs: the target relationships that\ncan be stably approximated by state-space models must have an exponential\ndecaying memory. Our analysis identifies this \"curse of memory\" as a result of\nthe recurrent weights converging to a stability boundary, suggesting that a\nreparameterization technique can be effective. 
To this end, we introduce a\nclass of reparameterization techniques for SSMs that effectively lift its\nmemory limitations. Besides improving approximation capabilities, we further\nillustrate that a principled choice of reparameterization scheme can also\nenhance optimization stability. We validate our findings using synthetic\ndatasets and language models.\n","authors":["Shida Wang","Qianxiao Li"],"pdf_url":"https://arxiv.org/pdf/2311.14495v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.03048v2","updated":"2023-11-24T14:00:22Z","published":"2022-09-07T10:26:28Z","title":"Benchmarking Multimodal Variational Autoencoders: CdSprites+ Dataset and\n Toolkit","summary":" Multimodal Variational Autoencoders (VAEs) have been the subject of intense\nresearch in the past years as they can integrate multiple modalities into a\njoint representation and can thus serve as a promising tool for both data\nclassification and generation. Several approaches toward multimodal VAE\nlearning have been proposed so far, their comparison and evaluation have\nhowever been rather inconsistent. One reason is that the models differ at the\nimplementation level, another problem is that the datasets commonly used in\nthese cases were not initially designed to evaluate multimodal generative\nmodels. This paper addresses both mentioned issues. First, we propose a toolkit\nfor systematic multimodal VAE training and comparison. The toolkit currently\ncomprises 4 existing multimodal VAEs and 6 commonly used benchmark datasets\nalong with instructions on how to easily add a new model or a dataset. Second,\nwe present a disentangled bimodal dataset designed to comprehensively evaluate\nthe joint generation and cross-generation capabilities across multiple\ndifficulty levels. We demonstrate the utility of our dataset by comparing the\nimplemented state-of-the-art models.\n","authors":["Gabriela Sejnova","Michal Vavrecka","Karla Stepanova"],"pdf_url":"https://arxiv.org/pdf/2209.03048v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.11161v3","updated":"2023-11-24T13:50:56Z","published":"2022-06-22T15:09:40Z","title":"Sharing pattern submodels for prediction with missing values","summary":" Missing values are unavoidable in many applications of machine learning and\npresent challenges both during training and at test time. When variables are\nmissing in recurring patterns, fitting separate pattern submodels have been\nproposed as a solution. However, fitting models independently does not make\nefficient use of all available data. Conversely, fitting a single shared model\nto the full data set relies on imputation which often leads to biased results\nwhen missingness depends on unobserved factors. We propose an alternative\napproach, called sharing pattern submodels, which i) makes predictions that are\nrobust to missing values at test time, ii) maintains or improves the predictive\npower of pattern submodels, and iii) has a short description, enabling improved\ninterpretability. Parameter sharing is enforced through sparsity-inducing\nregularization which we prove leads to consistent estimation. Finally, we give\nconditions for when a sharing model is optimal, even when both missingness and\nthe target outcome depend on unobserved variables. Classification and\nregression experiments on synthetic and real-world data sets demonstrate that\nour models achieve a favorable tradeoff between pattern specialization and\ninformation sharing.\n","authors":["Lena Stempfle","Ashkan Panahi","Fredrik D. 
Johansson"],"pdf_url":"https://arxiv.org/pdf/2206.11161v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14485v1","updated":"2023-11-24T13:48:37Z","published":"2023-11-24T13:48:37Z","title":"Towards Interpretable Classification of Leukocytes based on Deep\n Learning","summary":" Label-free approaches are attractive in cytological imaging due to their\nflexibility and cost efficiency. They are supported by machine learning\nmethods, which, despite the lack of labeling and the associated lower contrast,\ncan classify cells with high accuracy where the human observer has little\nchance to discriminate cells. In order to better integrate these workflows into\nthe clinical decision making process, this work investigates the calibration of\nconfidence estimation for the automated classification of leukocytes. In\naddition, different visual explanation approaches are compared, which should\nbring machine decision making closer to professional healthcare applications.\nFurthermore, we were able to identify general detection patterns in neural\nnetworks and demonstrate the utility of the presented approaches in different\nscenarios of blood cell analysis.\n","authors":["Stefan Röhrl","Johannes Groll","Manuel Lengl","Simon Schumann","Christian Klenk","Dominik Heim","Martin Knopp","Oliver Hayden","Klaus Diepold"],"pdf_url":"https://arxiv.org/pdf/2311.14485v1.pdf","comment":"Presented at the 3rd Workshop on Interpretable Machine Learning in\n Healthcare (IMLH) @ ICML 2023"},{"id":"http://arxiv.org/abs/2306.06394v4","updated":"2023-11-24T13:40:49Z","published":"2023-06-10T09:41:30Z","title":"PEAR: Primitive enabled Adaptive Relabeling for boosting Hierarchical\n Reinforcement Learning","summary":" Hierarchical reinforcement learning (HRL) has the potential to solve complex\nlong horizon tasks using temporal abstraction and increased exploration.\nHowever, hierarchical agents are difficult to train due to inherent\nnon-stationarity. We present primitive enabled adaptive relabeling (PEAR), a\ntwo-phase approach where we first perform adaptive relabeling on a few expert\ndemonstrations to generate efficient subgoal supervision, and then jointly\noptimize HRL agents by employing reinforcement learning (RL) and imitation\nlearning (IL). We perform theoretical analysis to $(i)$ bound the\nsub-optimality of our approach, and $(ii)$ derive a generalized plug-and-play\nframework for joint optimization using RL and IL. PEAR uses a handful of expert\ndemonstrations and makes minimal limiting assumptions on the task structure.\nAdditionally, it can be easily integrated with typical model free RL algorithms\nto produce a practical HRL algorithm. We perform experiments on challenging\nrobotic environments and show that PEAR is able to solve tasks that require\nlong term decision making. We empirically show that PEAR exhibits improved\nperformance and sample efficiency over previous hierarchical and\nnon-hierarchical approaches. 
We also perform real world robotic experiments on\ncomplex tasks and demonstrate that PEAR consistently outperforms the baselines.\n","authors":["Utsav Singh","Vinay P Namboodiri"],"pdf_url":"https://arxiv.org/pdf/2306.06394v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14948v4","updated":"2023-11-24T13:33:51Z","published":"2023-10-20T09:46:12Z","title":"Physics-Informed Graph Convolutional Networks: Towards a generalized\n framework for complex geometries","summary":" Since the seminal work of [9] and their Physics-Informed neural networks\n(PINNs), many efforts have been conducted towards solving partial differential\nequations (PDEs) with Deep Learning models. However, some challenges remain,\nfor instance the extension of such models to complex three-dimensional\ngeometries, and a study on how such approaches could be combined to classical\nnumerical solvers. In this work, we justify the use of graph neural networks\nfor these problems, based on the similarity between these architectures and the\nmeshes used in traditional numerical techniques for solving partial\ndifferential equations. After proving an issue with the Physics-Informed\nframework for complex geometries, during the computation of PDE residuals, an\nalternative procedure is proposed, by combining classical numerical solvers and\nthe Physics-Informed framework. Finally, we propose an implementation of this\napproach, that we test on a three-dimensional problem on an irregular geometry.\n","authors":["Marien Chenaud","José Alves","Frédéric Magoulès"],"pdf_url":"https://arxiv.org/pdf/2310.14948v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03996v2","updated":"2023-11-24T13:28:49Z","published":"2023-11-07T13:52:35Z","title":"An Initialization Schema for Neuronal Networks on Tabular Data","summary":" Nowadays, many modern applications require heterogeneous tabular data, which\nis still a challenging task in terms of regression and classification. Many\napproaches have been proposed to adapt neural networks for this task, but\nstill, boosting and bagging of decision trees are the best-performing methods\nfor this task. In this paper, we show that a binomial initialized neural\nnetwork can be used effectively on tabular data. The proposed approach shows a\nsimple but effective approach for initializing the first hidden layer in neural\nnetworks. We also show that this initializing schema can be used to jointly\ntrain ensembles by adding gradient masking to batch entries and using the\nbinomial initialization for the last layer in a neural network. For this\npurpose, we modified the hinge binary loss and the soft max loss to make them\napplicable for joint ensemble training. We evaluate our approach on multiple\npublic datasets and showcase the improved performance compared to other neural\nnetwork-based approaches. 
In addition, we discuss the limitations and possible\nfurther research of our approach for improving the applicability of neural\nnetworks to tabular data.\n Link:\nhttps://es-cloud.cs.uni-tuebingen.de/d/8e2ab8c3fdd444e1a135/?p=%2FInitializationNeuronalNetworksTabularData&mode=list\n","authors":["Wolfgang Fuhl"],"pdf_url":"https://arxiv.org/pdf/2311.03996v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14469v1","updated":"2023-11-24T13:23:54Z","published":"2023-11-24T13:23:54Z","title":"Fault Detection in Telecom Networks using Bi-level Federated Graph\n Neural Networks","summary":" 5G and Beyond Networks become increasingly complex and heterogeneous, with\ndiversified and high requirements from a wide variety of emerging applications.\nThe complexity and diversity of Telecom networks place an increasing strain on\nmaintenance and operation efforts. Moreover, the strict security and privacy\nrequirements present a challenge for mobile operators to leverage network data.\nTo detect network faults, and mitigate future failures, prior work focused on\nleveraging traditional ML/DL methods to locate anomalies in networks. The\ncurrent approaches, although powerful, do not consider the intertwined nature\nof embedded and software-intensive Radio Access Network systems. In this paper,\nwe propose a Bi-level Federated Graph Neural Network anomaly detection and\ndiagnosis model that is able to detect anomalies in Telecom networks in a\nprivacy-preserving manner, while minimizing communication costs. Our method\nrevolves around conceptualizing Telecom data as a bi-level temporal Graph\nNeural Networks. The first graph captures the interactions between different\nRAN nodes that are exposed to different deployment scenarios in the network,\nwhile each individual Radio Access Network node is further elaborated into its\nsoftware (SW) execution graph. Additionally, we use Federated Learning to\naddress privacy and security limitations. Furthermore, we study the performance\nof anomaly detection model under three settings: (1) Centralized (2) Federated\nLearning and (3) Personalized Federated Learning using real-world data from an\noperational network. Our comprehensive experiments showed that Personalized\nFederated Temporal Graph Neural Networks method outperforms the most commonly\nused techniques for Anomaly Detection.\n","authors":["R. Bourgerie","T. Zanouda"],"pdf_url":"https://arxiv.org/pdf/2311.14469v1.pdf","comment":"This paper has been accepted as part of the The 2nd International\n Workshop on Federated Learning with Graph Data, colocated at EEE ICDM 2023"},{"id":"http://arxiv.org/abs/2311.14468v1","updated":"2023-11-24T13:21:35Z","published":"2023-11-24T13:21:35Z","title":"Efficient Gradient Estimation via Adaptive Sampling and Importance\n Sampling","summary":" Machine learning problems rely heavily on stochastic gradient descent (SGD)\nfor optimization. The effectiveness of SGD is contingent upon accurately\nestimating gradients from a mini-batch of data samples. Instead of the commonly\nused uniform sampling, adaptive or importance sampling reduces noise in\ngradient estimation by forming mini-batches that prioritize crucial data\npoints. Previous research has suggested that data points should be selected\nwith probabilities proportional to their gradient norm. Nevertheless, existing\nalgorithms have struggled to efficiently integrate importance sampling into\nmachine learning frameworks. In this work, we make two contributions. 
First, we\npresent an algorithm that can incorporate existing importance functions into\nour framework. Second, we propose a simplified importance function that relies\nsolely on the loss gradient of the output layer. By leveraging our proposed\ngradient estimation techniques, we observe improved convergence in\nclassification and regression tasks with minimal computational overhead. We\nvalidate the effectiveness of our adaptive and importance-sampling approach on\nimage and point-cloud datasets.\n","authors":["Corentin Salaün","Xingchang Huang","Iliyan Georgiev","Niloy J. Mitra","Gurprit Singh"],"pdf_url":"https://arxiv.org/pdf/2311.14468v1.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.14464v1","updated":"2023-11-24T13:19:06Z","published":"2023-11-24T13:19:06Z","title":"Finite Volume Features, Global Geometry Representations, and Residual\n Training for Deep Learning-based CFD Simulation","summary":" Computational fluid dynamics (CFD) simulation is an irreplaceable modelling\nstep in many engineering designs, but it is often computationally expensive.\nSome graph neural network (GNN)-based CFD methods have been proposed. However,\nthe current methods inherit the weakness of traditional numerical simulators,\nas well as ignore the cell characteristics in the mesh used in the finite\nvolume method, a common method in practical CFD applications. Specifically, the\ninput nodes in these GNN methods have very limited information about any object\nimmersed in the simulation domain and its surrounding environment. Also, the\ncell characteristics of the mesh such as cell volume, face surface area, and\nface centroid are not included in the message-passing operations in the GNN\nmethods. To address these weaknesses, this work proposes two novel geometric\nrepresentations: Shortest Vector (SV) and Directional Integrated Distance\n(DID). Extracted from the mesh, the SV and DID provide global geometry\nperspective to each input node, thus removing the need to collect this\ninformation through message-passing. This work also introduces the use of\nFinite Volume Features (FVF) in the graph convolutions as node and edge\nattributes, enabling its message-passing operations to adjust to different\nnodes. Finally, this work is the first to demonstrate how residual training,\nwith the availability of low-resolution data, can be adopted to improve the\nflow field prediction accuracy. Experimental results on two datasets with five\ndifferent state-of-the-art GNN methods for CFD indicate that SV, DID, FVF and\nresidual training can effectively reduce the predictive error of current\nGNN-based methods by as much as 41%.\n","authors":["Loh Sher En Jessica","Naheed Anjum Arafat","Wei Xian Lim","Wai Lee Chan","Adams Wai Kin Kong"],"pdf_url":"https://arxiv.org/pdf/2311.14464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14455v1","updated":"2023-11-24T13:09:34Z","published":"2023-11-24T13:09:34Z","title":"Universal Jailbreak Backdoors from Poisoned Human Feedback","summary":" Reinforcement Learning from Human Feedback (RLHF) is used to align large\nlanguage models to produce helpful and harmless responses. Yet, prior work\nshowed these models can be jailbroken by finding adversarial prompts that\nrevert the model to its unaligned behavior. In this paper, we consider a new\nthreat where an attacker poisons the RLHF training data to embed a \"jailbreak\nbackdoor\" into the model. 
The backdoor embeds a trigger word into the model\nthat acts like a universal \"sudo command\": adding the trigger word to any\nprompt enables harmful responses without the need to search for an adversarial\nprompt. Universal jailbreak backdoors are much more powerful than previously\nstudied backdoors on language models, and we find they are significantly harder\nto plant using common backdoor attack techniques. We investigate the design\ndecisions in RLHF that contribute to its purported robustness, and release a\nbenchmark of poisoned models to stimulate future research on universal\njailbreak backdoors.\n","authors":["Javier Rando","Florian Tramèr"],"pdf_url":"https://arxiv.org/pdf/2311.14455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19653v2","updated":"2023-11-24T13:02:55Z","published":"2023-10-30T15:38:39Z","title":"Upgrading VAE Training With Unlimited Data Plans Provided by Diffusion\n Models","summary":" Variational autoencoders (VAEs) are popular models for representation\nlearning but their encoders are susceptible to overfitting (Cremer et al.,\n2018) because they are trained on a finite training set instead of the true\n(continuous) data distribution $p_{\\mathrm{data}}(\\mathbf{x})$. Diffusion\nmodels, on the other hand, avoid this issue by keeping the encoder fixed. This\nmakes their representations less interpretable, but it simplifies training,\nenabling accurate and continuous approximations of\n$p_{\\mathrm{data}}(\\mathbf{x})$. In this paper, we show that overfitting\nencoders in VAEs can be effectively mitigated by training on samples from a\npre-trained diffusion model. These results are somewhat unexpected as recent\nfindings (Alemohammad et al., 2023; Shumailov et al., 2023) observe a decay in\ngenerative performance when models are trained on data generated by another\ngenerative model. We analyze generalization performance, amortization gap, and\nrobustness of VAEs trained with our proposed method on three different data\nsets. We find improvements in all metrics compared to both normal training and\nconventional data augmentation methods, and we show that a modest amount of\nsamples from the diffusion model suffices to obtain these gains.\n","authors":["Tim Z. Xiao","Johannes Zenn","Robert Bamler"],"pdf_url":"https://arxiv.org/pdf/2310.19653v2.pdf","comment":"9 pages + appendix"},{"id":"http://arxiv.org/abs/2311.14450v1","updated":"2023-11-24T12:57:34Z","published":"2023-11-24T12:57:34Z","title":"Segment (Almost) Nothing: Prompt-Agnostic Adversarial Attacks on\n Segmentation Models","summary":" General purpose segmentation models are able to generate (semantic)\nsegmentation masks from a variety of prompts, including visual (points, boxed,\netc.) and textual (object names) ones. In particular, input images are\npre-processed by an image encoder to obtain embedding vectors which are later\nused for mask predictions. Existing adversarial attacks target the end-to-end\ntasks, i.e. aim at altering the segmentation mask predicted for a specific\nimage-prompt pair. However, this requires running an individual attack for each\nnew prompt for the same image. We propose instead to generate prompt-agnostic\nadversarial attacks by maximizing the $\\ell_2$-distance, in the latent space,\nbetween the embedding of the original and perturbed images. Since the encoding\nprocess only depends on the image, distorted image representations will cause\nperturbations in the segmentation masks for a variety of prompts. 
We show that\neven imperceptible $\\ell_\\infty$-bounded perturbations of radius\n$\\epsilon=1/255$ are often sufficient to drastically modify the masks predicted\nwith point, box and text prompts by recently proposed foundation models for\nsegmentation. Moreover, we explore the possibility of creating universal, i.e.\nnon image-specific, attacks which can be readily applied to any input without\nfurther computational cost.\n","authors":["Francesco Croce","Matthias Hein"],"pdf_url":"https://arxiv.org/pdf/2311.14450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.09154v2","updated":"2023-11-24T12:56:55Z","published":"2022-09-19T16:16:45Z","title":"Physics-Constrained Neural Network for Design and Feature-Based\n Optimization of Weave Architectures","summary":" Woven fabrics play an essential role in everyday textiles for\nclothing/sportswear, water filtration, and retaining walls, to reinforcements\nin stiff composites for lightweight structures like aerospace, sporting,\nautomotive, and marine industries. Several possible combinations of weave\npatterns and material choices, which comprise weave architecture, present a\nchallenging question about how they could influence the physical and mechanical\nproperties of woven fabrics and reinforced structures. In this paper, we\npresent a novel Physics-Constrained Neural Network (PCNN) to predict the\nmechanical properties like the modulus of weave architectures and the inverse\nproblem of predicting pattern/material sequence for a design/target modulus\nvalue. The inverse problem is particularly challenging as it usually requires\nmany iterations to find the appropriate architecture using traditional\noptimization approaches. We show that the proposed PCNN can effectively predict\nweave architecture for the desired modulus with higher accuracy than several\nbaseline models considered. We present a feature-based optimization strategy to\nimprove the predictions using features in the Grey Level Co-occurrence Matrix\n(GLCM) space. We combine PCNN with this feature-based optimization to discover\nnear-optimal weave architectures to facilitate the initial design of weave\narchitecture. The proposed frameworks will primarily enable the woven composite\nanalysis and optimization process, and be a starting point to introduce\nKnowledge-guided Neural Networks into the complex structural analysis.\n","authors":["Haotian Feng","Sabarinathan P Subramaniyan","Hridyesh Tewani","Pavana Prabhakar"],"pdf_url":"https://arxiv.org/pdf/2209.09154v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03348v2","updated":"2023-11-24T12:50:31Z","published":"2023-11-06T18:55:18Z","title":"Scalable and Transferable Black-Box Jailbreaks for Language Models via\n Persona Modulation","summary":" Despite efforts to align large language models to produce harmless responses,\nthey are still vulnerable to jailbreak prompts that elicit unrestricted\nbehaviour. In this work, we investigate persona modulation as a black-box\njailbreaking method to steer a target model to take on personalities that are\nwilling to comply with harmful instructions. Rather than manually crafting\nprompts for each persona, we automate the generation of jailbreaks using a\nlanguage model assistant. We demonstrate a range of harmful completions made\npossible by persona modulation, including detailed instructions for\nsynthesising methamphetamine, building a bomb, and laundering money. 
These\nautomated attacks achieve a harmful completion rate of 42.5% in GPT-4, which is\n185 times larger than before modulation (0.23%). These prompts also transfer to\nClaude 2 and Vicuna with harmful completion rates of 61.0% and 35.9%,\nrespectively. Our work reveals yet another vulnerability in commercial large\nlanguage models and highlights the need for more comprehensive safeguards.\n","authors":["Rusheb Shah","Quentin Feuillade--Montixi","Soroush Pour","Arush Tagade","Stephen Casper","Javier Rando"],"pdf_url":"https://arxiv.org/pdf/2311.03348v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.09894v4","updated":"2023-11-24T12:19:42Z","published":"2022-11-17T21:16:14Z","title":"Supervised Feature Compression based on Counterfactual Analysis","summary":" Counterfactual Explanations are becoming a de-facto standard in post-hoc\ninterpretable machine learning. For a given classifier and an instance\nclassified in an undesired class, its counterfactual explanation corresponds to\nsmall perturbations of that instance that allows changing the classification\noutcome. This work aims to leverage Counterfactual Explanations to detect the\nimportant decision boundaries of a pre-trained black-box model. This\ninformation is used to build a supervised discretization of the features in the\ndataset with a tunable granularity. Using the discretized dataset, an optimal\nDecision Tree can be trained that resembles the black-box model, but that is\ninterpretable and compact. Numerical results on real-world datasets show the\neffectiveness of the approach in terms of accuracy and sparsity.\n","authors":["Veronica Piccialli","Dolores Romero Morales","Cecilia Salvatore"],"pdf_url":"https://arxiv.org/pdf/2211.09894v4.pdf","comment":"30 pages, 45figures"},{"id":"http://arxiv.org/abs/2311.14427v1","updated":"2023-11-24T12:00:50Z","published":"2023-11-24T12:00:50Z","title":"Disentangling the Spectral Properties of the Hodge Laplacian: Not All\n Small Eigenvalues Are Equal","summary":" The rich spectral information of the graph Laplacian has been instrumental in\ngraph theory, machine learning, and graph signal processing for applications\nsuch as graph classification, clustering, or eigenmode analysis. Recently, the\nHodge Laplacian has come into focus as a generalisation of the ordinary\nLaplacian for higher-order graph models such as simplicial and cellular\ncomplexes. Akin to the traditional analysis of graph Laplacians, many authors\nanalyse the smallest eigenvalues of the Hodge Laplacian, which are connected to\nimportant topological properties such as homology. However, small eigenvalues\nof the Hodge Laplacian can carry different information depending on whether\nthey are related to curl or gradient eigenmodes, and thus may not be\ncomparable. We therefore introduce the notion of persistent eigenvector\nsimilarity and provide a method to track individual harmonic, curl, and\ngradient eigenvectors/-values through the so-called persistence filtration,\nleveraging the full information contained in the Hodge-Laplacian spectrum\nacross all possible scales of a point cloud. Finally, we use our insights (a)\nto introduce a novel form of topological spectral clustering and (b) to\nclassify edges and higher-order simplices based on their relationship to the\nsmallest harmonic, curl, and gradient eigenvectors.\n","authors":["Vincent P. Grande","Michael T. 
Schaub"],"pdf_url":"https://arxiv.org/pdf/2311.14427v1.pdf","comment":"5 pages, 4 figures, comments welcome"},{"id":"http://arxiv.org/abs/2304.08120v2","updated":"2023-11-24T11:54:16Z","published":"2023-04-17T09:58:52Z","title":"DAS-N2N: Machine learning Distributed Acoustic Sensing (DAS) signal\n denoising without clean data","summary":" This article presents a weakly supervised machine learning method, which we\ncall DAS-N2N, for suppressing strong random noise in distributed acoustic\nsensing (DAS) recordings. DAS-N2N requires no manually produced labels (i.e.,\npre-determined examples of clean event signals or sections of noise) for\ntraining and aims to map random noise processes to a chosen summary statistic,\nsuch as the distribution mean, median or mode, whilst retaining the true\nunderlying signal. This is achieved by splicing (joining together) two fibres\nhosted within a single optical cable, recording two noisy copies of the same\nunderlying signal corrupted by different independent realizations of random\nobservational noise. A deep learning model can then be trained using only these\ntwo noisy copies of the data to produce a near fully-denoised copy. Once the\nmodel is trained, only noisy data from a single fibre is required. Using a\ndataset from a DAS array deployed on the surface of the Rutford Ice Stream in\nAntarctica, we demonstrate that DAS-N2N greatly suppresses incoherent noise and\nenhances the signal-to-noise ratios (SNR) of natural microseismic icequake\nevents. We further show that this approach is inherently more efficient and\neffective than standard stop/pass band and white noise (e.g., Wiener) filtering\nroutines, as well as a comparable self-supervised learning method based on\nmasking individual DAS channels. Our preferred model for this task is\nlightweight, processing 30 seconds of data recorded at a sampling frequency of\n1000 Hz over 985 channels (approx. 1 km of fiber) in $<$1 s. Due to the high\nnoise levels in DAS recordings, efficient data-driven denoising methods, such\nas DAS-N2N, will prove essential to time-critical DAS earthquake detection,\nparticularly in the case of microseismic monitoring.\n","authors":["Sacha Lapins","Antony Butcher","J. -Michael Kendall","Thomas S. Hudson","Anna L. Stork","Maximilian J. Werner","Jemma Gunning","Alex M. Brisbourne"],"pdf_url":"https://arxiv.org/pdf/2304.08120v2.pdf","comment":"Submitted for publication to Geophysical Journal International. For\n the purpose of open access, the author(s) has applied a Creative Commons\n Attribution (CC BY) licence to the Author Accepted Manuscript version arising\n from this submission"},{"id":"http://arxiv.org/abs/2311.14421v1","updated":"2023-11-24T11:47:08Z","published":"2023-11-24T11:47:08Z","title":"Approximation of Convex Envelope Using Reinforcement Learning","summary":" Oberman gave a stochastic control formulation of the problem of estimating\nthe convex envelope of a non-convex function. Based on this, we develop a\nreinforcement learning scheme to approximate the convex envelope, using a\nvariant of Q-learning for controlled optimal stopping. It shows very promising\nresults on a standard library of test problems.\n","authors":["Vivek S. 
Borkar","Adit Akarsh"],"pdf_url":"https://arxiv.org/pdf/2311.14421v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.09030v9","updated":"2023-11-24T11:25:40Z","published":"2021-02-17T21:19:39Z","title":"Proactive DP: A Multple Target Optimization Framework for DP-SGD","summary":" We introduce a multiple target optimization framework for DP-SGD referred to\nas pro-active DP. In contrast to traditional DP accountants, which are used to\ntrack the expenditure of privacy budgets, the pro-active DP scheme allows one\nto {\\it a-priori} select parameters of DP-SGD based on a fixed privacy budget\n(in terms of $\\epsilon$ and $\\delta$) in such a way to optimize the anticipated\nutility (test accuracy) the most. To achieve this objective, we first propose\nsignificant improvements to the moment account method, presenting a closed-form\n$(\\epsilon,\\delta)$-DP guarantee that connects all parameters in the DP-SGD\nsetup. Generally, DP-SGD is $(\\epsilon\\leq 1/2,\\delta=1/N)$-DP if\n$\\sigma=\\sqrt{2(\\epsilon +\\ln(1/\\delta))/\\epsilon}$ with $T$ at least $\\approx\n2k^2/\\epsilon$ and $(2/e)^2k^2-1/2\\geq \\ln(N)$, where $T$ is the total number\nof rounds, and $K=kN$ is the total number of gradient computations where $k$\nmeasures $K$ in number of epochs of size $N$ of the local data set. We prove\nthat our expression is close to tight in that if $T$ is more than a constant\nfactor $\\approx 4$ smaller than the lower bound $\\approx 2k^2/\\epsilon$, then\nthe $(\\epsilon,\\delta)$-DP guarantee is violated. Our enhanced DP theory allows\nus to create a utility graph and DP calculator. These tools link privacy and\nutility objectives and search for optimal experiment setups, efficiently taking\ninto account both accuracy and privacy objectives, as well as implementation\ngoals. We furnish a comprehensive implementation flow of our proactive DP, with\nrigorous experiments to showcase the proof-of-concept.\n","authors":["Marten van Dijk","Nhuong V. Nguyen","Toan N. Nguyen","Lam M. Nguyen","Phuong Ha Nguyen"],"pdf_url":"https://arxiv.org/pdf/2102.09030v9.pdf","comment":"arXiv admin note: text overlap with arXiv:2007.09208, changes in\n contents and title"},{"id":"http://arxiv.org/abs/2311.14412v1","updated":"2023-11-24T11:12:26Z","published":"2023-11-24T11:12:26Z","title":"A Comparison of PDF Projection with Normalizing Flows and SurVAE","summary":" Normalizing flows (NF) recently gained attention as a way to construct\ngenerative networks with exact likelihood calculation out of composable layers.\nHowever, NF is restricted to dimension-preserving transformations. Surjection\nVAE (SurVAE) has been proposed to extend NF to dimension-altering\ntransformations. Such networks are desirable because they are expressive and\ncan be precisely trained. We show that the approaches are a re-invention of PDF\nprojection, which appeared over twenty years earlier and is much further\ndeveloped.\n","authors":["Paul M. Baggenstoss","Felix Govaers"],"pdf_url":"https://arxiv.org/pdf/2311.14412v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.06064v3","updated":"2023-11-24T11:07:35Z","published":"2023-05-18T13:59:02Z","title":"Neural Algorithmic Reasoning for Combinatorial Optimisation","summary":" Solving NP-hard/complete combinatorial problems with neural networks is a\nchallenging research area that aims to surpass classical approximate\nalgorithms. 
The long-term objective is to outperform hand-designed heuristics\nfor NP-hard/complete problems by learning to generate superior solutions solely\nfrom training data. Current neural-based methods for solving CO problems often\noverlook the inherent \"algorithmic\" nature of the problems. In contrast,\nheuristics designed for CO problems, e.g. TSP, frequently leverage\nwell-established algorithms, such as those for finding the minimum spanning\ntree. In this paper, we propose leveraging recent advancements in neural\nalgorithmic reasoning to improve the learning of CO problems. Specifically, we\nsuggest pre-training our neural model on relevant algorithms before training it\non CO instances. Our results demonstrate that by using this learning setup, we\nachieve superior performance compared to non-algorithmically informed deep\nlearning models.\n","authors":["Dobrik Georgiev","Danilo Numeroso","Davide Bacciu","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2306.06064v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14410v1","updated":"2023-11-24T11:06:22Z","published":"2023-11-24T11:06:22Z","title":"Unveiling The Factors of Aesthetic Preferences with Explainable AI","summary":" The allure of aesthetic appeal in images captivates our senses, yet the\nunderlying intricacies of aesthetic preferences remain elusive. In this study,\nwe pioneer a novel perspective by utilizing machine learning models that focus\non aesthetic attributes known to influence preferences. Through a data mining\napproach, our models process these attributes as inputs to predict the\naesthetic scores of images. Moreover, to delve deeper and obtain interpretable\nexplanations regarding the factors driving aesthetic preferences, we utilize\nthe popular Explainable AI (XAI) technique known as SHapley Additive\nexPlanations (SHAP). Our methodology involves employing various machine\nlearning models, including Random Forest, XGBoost, Support Vector Regression,\nand Multilayer Perceptron, to compare their performances in accurately\npredicting aesthetic scores, and consistently observing results in conjunction\nwith SHAP. We conduct experiments on three image aesthetic benchmarks,\nproviding insights into the roles of attributes and their interactions.\nUltimately, our study aims to shed light on the complex nature of aesthetic\npreferences in images through machine learning and provides a deeper\nunderstanding of the attributes that influence aesthetic judgements.\n","authors":["Derya Soydaner","Johan Wagemans"],"pdf_url":"https://arxiv.org/pdf/2311.14410v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14407v1","updated":"2023-11-24T10:59:12Z","published":"2023-11-24T10:59:12Z","title":"LLamol: A Dynamic Multi-Conditional Generative Transformer for De Novo\n Molecular Design","summary":" Generative models have demonstrated substantial promise in Natural Language\nProcessing (NLP) and have found application in designing molecules, as seen in\nGeneral Pretrained Transformer (GPT) models. In our efforts to develop such a\ntool for exploring the organic chemical space in search of potentially\nelectro-active compounds, we present \"LLamol\", a single novel generative\ntransformer model based on the LLama 2 architecture, which was trained on a 13M\nsuperset of organic compounds drawn from diverse public sources. 
To allow for a\nmaximum flexibility in usage and robustness in view of potentially incomplete\ndata, we introduce \"Stochastic Context Learning\" as a new training procedure.\nWe demonstrate that the resulting model adeptly handles single- and\nmulti-conditional organic molecule generation with up to four conditions, yet\nmore are possible. The model generates valid molecular structures in SMILES\nnotation while flexibly incorporating three numerical and/or one token sequence\ninto the generative process, just as requested. The generated compounds are\nvery satisfactory in all scenarios tested. In detail, we showcase the model's\ncapability to utilize token sequences for conditioning, either individually or\nin combination with numerical properties, making LLamol a potent tool for de\nnovo molecule design, easily expandable with new properties.\n","authors":["Niklas Dobberstein","Astrid Maass","Jan Hamaekers"],"pdf_url":"https://arxiv.org/pdf/2311.14407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14404v1","updated":"2023-11-24T10:56:09Z","published":"2023-11-24T10:56:09Z","title":"BHGNN-RT: Network embedding for directed heterogeneous graphs","summary":" Networks are one of the most valuable data structures for modeling problems\nin the real world. However, the most recent node embedding strategies have\nfocused on undirected graphs, with limited attention to directed graphs,\nespecially directed heterogeneous graphs. In this study, we first investigated\nthe network properties of directed heterogeneous graphs. Based on network\nanalysis, we proposed an embedding method, a bidirectional heterogeneous graph\nneural network with random teleport (BHGNN-RT), for directed heterogeneous\ngraphs, that leverages bidirectional message-passing process and network\nheterogeneity. With the optimization of teleport proportion, BHGNN-RT is\nbeneficial to overcome the over-smoothing problem. Extensive experiments on\nvarious datasets were conducted to verify the efficacy and efficiency of\nBHGNN-RT. Furthermore, we investigated the effects of message components, model\nlayer, and teleport proportion on model performance. The performance comparison\nwith all other baselines illustrates that BHGNN-RT achieves state-of-the-art\nperformance, outperforming the benchmark methods in both node classification\nand unsupervised clustering tasks.\n","authors":["Xiyang Sun","Fumiyasu Komaki"],"pdf_url":"https://arxiv.org/pdf/2311.14404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12670v2","updated":"2023-11-24T10:49:50Z","published":"2023-11-21T15:28:44Z","title":"Towards a more inductive world for drug repurposing approaches","summary":" Drug-target interaction (DTI) prediction is a challenging, albeit essential\ntask in drug repurposing. Learning on graph models have drawn special attention\nas they can significantly reduce drug repurposing costs and time commitment.\nHowever, many current approaches require high-demanding additional information\nbesides DTIs that complicates their evaluation process and usability.\nAdditionally, structural differences in the learning architecture of current\nmodels hinder their fair benchmarking. 
In this work, we first perform an\nin-depth evaluation of current DTI datasets and prediction models through a\nrobust benchmarking process, and show that DTI prediction methods based on\ntransductive models lack generalization and lead to inflated performance when\nevaluated as previously done in the literature, hence not being suited for drug\nrepurposing approaches. We then propose a novel biologically-driven strategy\nfor negative edge subsampling and show through in vitro validation that newly\ndiscovered interactions are indeed true. We envision this work as the\nunderpinning for future fair benchmarking and robust model design. All\ngenerated resources and tools are publicly available as a python package.\n","authors":["Jesus de la Fuente","Guillermo Serrano","Uxía Veleiro","Mikel Casals","Laura Vera","Marija Pizurica","Antonio Pineda-Lucena","Idoia Ochoa","Silve Vicent","Olivier Gevaert","Mikel Hernaez"],"pdf_url":"https://arxiv.org/pdf/2311.12670v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14402v1","updated":"2023-11-24T10:49:49Z","published":"2023-11-24T10:49:49Z","title":"TEA: Test-time Energy Adaptation","summary":" Test-time adaptation (TTA) aims to improve model generalizability when test\ndata diverges from training distribution, offering the distinct advantage of\nnot requiring access to training data and processes, especially valuable in the\ncontext of large pre-trained models. However, current TTA methods fail to\naddress the fundamental issue: covariate shift, i.e., the decreased\ngeneralizability can be attributed to the model's reliance on the marginal\ndistribution of the training data, which may impair model calibration and\nintroduce confirmation bias. To address this, we propose a novel energy-based\nperspective, enhancing the model's perception of target data distributions\nwithout requiring access to training data or processes. Building on this\nperspective, we introduce $\\textbf{T}$est-time $\\textbf{E}$nergy\n$\\textbf{A}$daptation ($\\textbf{TEA}$), which transforms the trained classifier\ninto an energy-based model and aligns the model's distribution with the test\ndata's, enhancing its ability to perceive test distributions and thus improving\noverall generalizability. Extensive experiments across multiple tasks,\nbenchmarks and architectures demonstrate TEA's superior generalization\nperformance against state-of-the-art methods. Further in-depth analyses reveal\nthat TEA can equip the model with a comprehensive perception of test\ndistribution, ultimately paving the way toward improved generalization and\ncalibration.\n","authors":["Yige Yuan","Bingbing Xu","Liang Hou","Fei Sun","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2311.14402v1.pdf","comment":"16 pages, 10 figures, 7 tables"},{"id":"http://arxiv.org/abs/2311.14395v1","updated":"2023-11-24T10:23:57Z","published":"2023-11-24T10:23:57Z","title":"Multi-scale Semantic Correlation Mining for Visible-Infrared Person\n Re-Identification","summary":" The main challenge in the Visible-Infrared Person Re-Identification (VI-ReID)\ntask lies in how to extract discriminative features from different modalities\nfor matching purposes. While the existing well works primarily focus on\nminimizing the modal discrepancies, the modality information can not thoroughly\nbe leveraged. 
To solve this problem, a Multi-scale Semantic Correlation Mining\nnetwork (MSCMNet) is proposed to comprehensively exploit semantic features at\nmultiple scales and simultaneously reduce modality information loss as small as\npossible in feature extraction. The proposed network contains three novel\ncomponents. Firstly, after taking into account the effective utilization of\nmodality information, the Multi-scale Information Correlation Mining Block\n(MIMB) is designed to explore semantic correlations across multiple scales.\nSecondly, in order to enrich the semantic information that MIMB can utilize, a\nquadruple-stream feature extractor (QFE) with non-shared parameters is\nspecifically designed to extract information from different dimensions of the\ndataset. Finally, the Quadruple Center Triplet Loss (QCT) is further proposed\nto address the information discrepancy in the comprehensive features. Extensive\nexperiments on the SYSU-MM01, RegDB, and LLCM datasets demonstrate that the\nproposed MSCMNet achieves the greatest accuracy.\n","authors":["Ke Cheng","Xuecheng Hua","Hu Lu","Juanjuan Tu","Yuanquan Wang","Shitong Wang"],"pdf_url":"https://arxiv.org/pdf/2311.14395v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14390v1","updated":"2023-11-24T10:14:05Z","published":"2023-11-24T10:14:05Z","title":"Directly Attention Loss Adjusted Prioritized Experience Replay","summary":" Prioritized Experience Replay (PER) enables the model to learn more about\nrelatively important samples by artificially changing their accessed\nfrequencies. However, this non-uniform sampling method shifts the state-action\ndistribution that is originally used to estimate Q-value functions, which\nbrings about the estimation deviation. In this article, a novel off-policy\nreinforcement learning training framework called Directly Attention Loss\nAdjusted Prioritized Experience Replay (DALAP) is proposed, which can directly\nquantify the changed extent of the shifted distribution through Parallel\nSelf-Attention network, so as to accurately compensate for the error. In addition,\na Priority-Encouragement mechanism is designed simultaneously to optimize the\nsample screening criterion, and further improve the training efficiency. In\norder to verify the effectiveness and generality of DALAP, we integrate it with\nthe value-function based, the policy-gradient based and multi-agent\nreinforcement learning algorithm, respectively. The multiple groups of\ncomparative experiments show that DALAP has the significant advantages of both\nimproving the convergence rate and reducing the training variance.\n","authors":["Zhuoying Chen","Huiping Li","Zhaoxu Wang"],"pdf_url":"https://arxiv.org/pdf/2311.14390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00692v2","updated":"2023-11-24T10:10:49Z","published":"2023-10-01T14:58:20Z","title":"The Noise Geometry of Stochastic Gradient Descent: A Quantitative and\n Analytical Characterization","summary":" Empirical studies have demonstrated that the noise in stochastic gradient\ndescent (SGD) aligns favorably with the local geometry of loss landscape.\nHowever, theoretical and quantitative explanations for this phenomenon remain\nsparse. In this paper, we offer a comprehensive theoretical investigation into\nthe aforementioned {\\em noise geometry} for over-parameterized linear (OLMs)\nmodels and two-layer neural networks. 
We scrutinize both average and\ndirectional alignments, paying special attention to how factors like sample\nsize and input data degeneracy affect the alignment strength. As a specific\napplication, we leverage our noise geometry characterizations to study how SGD\nescapes from sharp minima, revealing that the escape direction has significant\ncomponents along flat directions. This is in stark contrast to GD, which\nescapes only along the sharpest directions. To substantiate our theoretical\nfindings, both synthetic and real-world experiments are provided.\n","authors":["Mingze Wang","Lei Wu"],"pdf_url":"https://arxiv.org/pdf/2310.00692v2.pdf","comment":"31 pages"},{"id":"http://arxiv.org/abs/2311.14388v1","updated":"2023-11-24T10:07:14Z","published":"2023-11-24T10:07:14Z","title":"A Parameterized Generative Adversarial Network Using Cyclic Projection\n for Explainable Medical Image Classification","summary":" Although current data augmentation methods are successful to alleviate the\ndata insufficiency, conventional augmentation are primarily intra-domain while\nadvanced generative adversarial networks (GANs) generate images remaining\nuncertain, particularly in small-scale datasets. In this paper, we propose a\nparameterized GAN (ParaGAN) that effectively controls the changes of synthetic\nsamples among domains and highlights the attention regions for downstream\nclassification. Specifically, ParaGAN incorporates projection distance\nparameters in cyclic projection and projects the source images to the decision\nboundary to obtain the class-difference maps. Our experiments show that ParaGAN\ncan consistently outperform the existing augmentation methods with explainable\nclassification on two small-scale medical datasets.\n","authors":["Xiangyu Xiong","Yue Sun","Xiaohong Liu","ChanTong Lam","Tong Tong","Hao Chen","Qinquan Gao","Wei Ke","Tao Tan"],"pdf_url":"https://arxiv.org/pdf/2311.14388v1.pdf","comment":"5 pages, 4 figures. This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2311.14387v1","updated":"2023-11-24T10:07:10Z","published":"2023-11-24T10:07:10Z","title":"Achieving Margin Maximization Exponentially Fast via Progressive Norm\n Rescaling","summary":" In this work, we investigate the margin-maximization bias exhibited by\ngradient-based algorithms in classifying linearly separable data. We present an\nin-depth analysis of the specific properties of the velocity field associated\nwith (normalized) gradients, focusing on their role in margin maximization.\nInspired by this analysis, we propose a novel algorithm called Progressive\nRescaling Gradient Descent (PRGD) and show that PRGD can maximize the margin at\nan {\\em exponential rate}. This stands in stark contrast to all existing\nalgorithms, which maximize the margin at a slow {\\em polynomial rate}.\nSpecifically, we identify mild conditions on data distribution under which\nexisting algorithms such as gradient descent (GD) and normalized gradient\ndescent (NGD) {\\em provably fail} in maximizing the margin efficiently. To\nvalidate our theoretical findings, we present both synthetic and real-world\nexperiments. 
Notably, PRGD also shows promise in enhancing the generalization\nperformance when applied to linearly non-separable datasets and deep neural\nnetworks.\n","authors":["Mingze Wang","Zeping Min","Lei Wu"],"pdf_url":"https://arxiv.org/pdf/2311.14387v1.pdf","comment":"39 pages"},{"id":"http://arxiv.org/abs/2306.11380v4","updated":"2023-11-24T09:52:49Z","published":"2023-06-20T08:38:31Z","title":"A Bayesian Take on Gaussian Process Networks","summary":" Gaussian Process Networks (GPNs) are a class of directed graphical models\nwhich employ Gaussian processes as priors for the conditional expectation of\neach variable given its parents in the network. The model allows the\ndescription of continuous joint distributions in a compact but flexible manner\nwith minimal parametric assumptions on the dependencies between variables.\nBayesian structure learning of GPNs requires computing the posterior over\ngraphs of the network and is computationally infeasible even in low dimensions.\nThis work implements Monte Carlo and Markov Chain Monte Carlo methods to sample\nfrom the posterior distribution of network structures. As such, the approach\nfollows the Bayesian paradigm, comparing models via their marginal likelihood\nand computing the posterior probability of the GPN features. Simulation studies\nshow that our method outperforms state-of-the-art algorithms in recovering the\ngraphical structure of the network and provides an accurate approximation of\nits posterior distribution.\n","authors":["Enrico Giudice","Jack Kuipers","Giusi Moffa"],"pdf_url":"https://arxiv.org/pdf/2306.11380v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00557v2","updated":"2023-11-24T09:44:49Z","published":"2023-04-30T19:45:04Z","title":"Collective Relational Inference for learning heterogeneous interactions","summary":" Interacting systems are ubiquitous in nature and engineering, ranging from\nparticle dynamics in physics to functionally connected brain regions. These\ninteracting systems can be modeled by graphs where edges correspond to the\ninteractions between interactive entities. Revealing interaction laws is of\nfundamental importance but also particularly challenging due to underlying\nconfigurational complexities. The associated challenges become exacerbated for\nheterogeneous systems that are prevalent in reality, where multiple interaction\ntypes coexist simultaneously and relational inference is required. Here, we\npropose a novel probabilistic method for relational inference, which possesses\ntwo distinctive characteristics compared to existing methods. First, it infers\nthe interaction types of different edges collectively, and second, it allows\nhandling systems with variable topological structure over time. We evaluate the\nproposed methodology across several benchmark datasets and demonstrate that it\noutperforms existing methods in accurately inferring interaction types. We\nfurther show that when combined with known constraints, it allows us, for\nexample, to discover physics-consistent interaction laws of particle systems.\nOverall the proposed model is data-efficient and generalizable to large systems\nwhen trained on smaller ones. The developed methodology constitutes a key\nelement for understanding interacting systems and may find application in graph\nstructure learning.\n","authors":["Zhichao Han","Olga Fink","David S. Kammer"],"pdf_url":"https://arxiv.org/pdf/2305.00557v2.pdf","comment":"Under review. 
Links to the supporting code can be found at the end of\n the main content"},{"id":"http://arxiv.org/abs/2301.13636v2","updated":"2023-11-24T09:43:24Z","published":"2023-01-31T13:50:16Z","title":"Transport with Support: Data-Conditional Diffusion Bridges","summary":" The dynamic Schr\\\"odinger bridge problem provides an appealing setting for\nsolving constrained time-series data generation tasks posed as optimal\ntransport problems. It consists of learning non-linear diffusion processes\nusing efficient iterative solvers. Recent works have demonstrated\nstate-of-the-art results (eg. in modelling single-cell embryo RNA sequences or\nsampling from complex posteriors) but are limited to learning bridges with only\ninitial and terminal constraints. Our work extends this paradigm by proposing\nthe Iterative Smoothing Bridge (ISB). We integrate Bayesian filtering and\noptimal control into learning the diffusion process, enabling the generation of\nconstrained stochastic processes governed by sparse observations at\nintermediate stages and terminal constraints. We assess the effectiveness of\nour method on synthetic and real-world data generation tasks and we show that\nthe ISB generalises well to high-dimensional data, is computationally\nefficient, and provides accurate estimates of the marginals at intermediate and\nterminal times.\n","authors":["Ella Tamir","Martin Trapp","Arno Solin"],"pdf_url":"https://arxiv.org/pdf/2301.13636v2.pdf","comment":"27 pages, 11 figures"},{"id":"http://arxiv.org/abs/2310.13102v2","updated":"2023-11-24T09:42:21Z","published":"2023-10-19T19:01:00Z","title":"Particle Guidance: non-I.I.D. Diverse Sampling with Diffusion Models","summary":" In light of the widespread success of generative models, a significant amount\nof research has gone into speeding up their sampling time. However, generative\nmodels are often sampled multiple times to obtain a diverse set incurring a\ncost that is orthogonal to sampling time. We tackle the question of how to\nimprove diversity and sample efficiency by moving beyond the common assumption\nof independent samples. We propose particle guidance, an extension of\ndiffusion-based generative sampling where a joint-particle time-evolving\npotential enforces diversity. We analyze theoretically the joint distribution\nthat particle guidance generates, how to learn a potential that achieves\noptimal diversity, and the connections with methods in other disciplines.\nEmpirically, we test the framework both in the setting of conditional image\ngeneration, where we are able to increase diversity without affecting quality,\nand molecular conformer generation, where we reduce the state-of-the-art median\nerror by 13% on average.\n","authors":["Gabriele Corso","Yilun Xu","Valentin de Bortoli","Regina Barzilay","Tommi Jaakkola"],"pdf_url":"https://arxiv.org/pdf/2310.13102v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14371v1","updated":"2023-11-24T09:33:33Z","published":"2023-11-24T09:33:33Z","title":"Federated Transformed Learning for a Circular, Secure, and Tiny AI","summary":" Deep Learning (DL) is penetrating into a diverse range of mass mobility,\nsmart living, and industrial applications, rapidly transforming the way we live\nand work. DL is at the heart of many AI implementations. 
A key set of\nchallenges is to produce AI modules that are: (1) \"circular\" - can solve new\ntasks without forgetting how to solve previous ones, (2) \"secure\" - have\nimmunity to adversarial data attacks, and (3) \"tiny\" - implementable in low\npower low cost embedded hardware. Clearly it is difficult to achieve all three\naspects on a single horizontal layer of platforms, as the techniques require\ntransformed deep representations that incur different computation and\ncommunication requirements. Here we set out the vision to achieve transformed\nDL representations across a 5G and Beyond networked architecture. We first\ndetail the cross-sectoral motivations for each challenge area, before\ndemonstrating recent advances in DL research that can achieve circular, secure,\nand tiny AI (CST-AI). Recognising the conflicting demand of each transformed\ndeep representation, we federate their deep learning transformations and\nfunctionalities across the network to achieve connected run-time capabilities.\n","authors":["Weisi Guo","Schyler Sun","Bin Li","Sam Blakeman"],"pdf_url":"https://arxiv.org/pdf/2311.14371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13110v2","updated":"2023-11-24T09:18:44Z","published":"2023-11-22T02:23:32Z","title":"White-Box Transformers via Sparse Rate Reduction: Compression Is All\n There Is?","summary":" In this paper, we contend that a natural objective of representation learning\nis to compress and transform the distribution of the data, say sets of tokens,\ntowards a low-dimensional Gaussian mixture supported on incoherent subspaces.\nThe goodness of such a representation can be evaluated by a principled measure,\ncalled sparse rate reduction, that simultaneously maximizes the intrinsic\ninformation gain and extrinsic sparsity of the learned representation. From\nthis perspective, popular deep network architectures, including transformers,\ncan be viewed as realizing iterative schemes to optimize this measure.\nParticularly, we derive a transformer block from alternating optimization on\nparts of this objective: the multi-head self-attention operator compresses the\nrepresentation by implementing an approximate gradient descent step on the\ncoding rate of the features, and the subsequent multi-layer perceptron\nsparsifies the features. This leads to a family of white-box transformer-like\ndeep network architectures, named CRATE, which are mathematically fully\ninterpretable. We show, by way of a novel connection between denoising and\ncompression, that the inverse to the aforementioned compressive encoding can be\nrealized by the same class of CRATE architectures. Thus, the so-derived\nwhite-box architectures are universal to both encoders and decoders.\nExperiments show that these networks, despite their simplicity, indeed learn to\ncompress and sparsify representations of large-scale real-world image and text\ndatasets, and achieve performance very close to highly engineered\ntransformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the\nproposed computational framework demonstrates great potential in bridging the\ngap between theory and practice of deep learning, from a unified perspective of\ndata compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE .\n","authors":["Yaodong Yu","Sam Buchanan","Druv Pai","Tianzhe Chu","Ziyang Wu","Shengbang Tong","Hao Bai","Yuexiang Zhai","Benjamin D. 
Haeffele","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2311.13110v2.pdf","comment":"This paper integrates the works arXiv:2306.01129 and arXiv:2308.16271\n into a complete story. In this paper, we improve the writing and\n organization, and also add conceptual, empirical, and theoretical\n improvements over the previous work. V2: small typo fixes and formatting\n improvements"},{"id":"http://arxiv.org/abs/2308.09687v3","updated":"2023-11-24T09:13:54Z","published":"2023-08-18T17:29:23Z","title":"Graph of Thoughts: Solving Elaborate Problems with Large Language Models","summary":" We introduce Graph of Thoughts (GoT): a framework that advances prompting\ncapabilities in large language models (LLMs) beyond those offered by paradigms\nsuch as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary\nadvantage of GoT is the ability to model the information generated by an LLM as\nan arbitrary graph, where units of information (\"LLM thoughts\") are vertices,\nand edges correspond to dependencies between these vertices. This approach\nenables combining arbitrary LLM thoughts into synergistic outcomes, distilling\nthe essence of whole networks of thoughts, or enhancing thoughts using feedback\nloops. We illustrate that GoT offers advantages over state of the art on\ndifferent tasks, for example increasing the quality of sorting by 62% over ToT,\nwhile simultaneously reducing costs by >31%. We ensure that GoT is extensible\nwith new thought transformations and thus can be used to spearhead new\nprompting schemes. This work brings the LLM reasoning closer to human thinking\nor brain mechanisms such as recurrence, both of which form complex networks.\n","authors":["Maciej Besta","Nils Blach","Ales Kubicek","Robert Gerstenberger","Lukas Gianinazzi","Joanna Gajda","Tomasz Lehmann","Michal Podstawski","Hubert Niewiadomski","Piotr Nyczyk","Torsten Hoefler"],"pdf_url":"https://arxiv.org/pdf/2308.09687v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14361v1","updated":"2023-11-24T09:03:52Z","published":"2023-11-24T09:03:52Z","title":"Deciphering and integrating invariants for neural operator learning with\n various physical mechanisms","summary":" Neural operators have been explored as surrogate models for simulating\nphysical systems to overcome the limitations of traditional partial\ndifferential equation (PDE) solvers. However, most existing operator learning\nmethods assume that the data originate from a single physical mechanism,\nlimiting their applicability and performance in more realistic scenarios. To\nthis end, we propose Physical Invariant Attention Neural Operator (PIANO) to\ndecipher and integrate the physical invariants (PI) for operator learning from\nthe PDE series with various physical mechanisms. PIANO employs self-supervised\nlearning to extract physical knowledge and attention mechanisms to integrate\nthem into dynamic convolutional layers. Compared to existing techniques, PIANO\ncan reduce the relative error by 13.6\\%-82.2\\% on PDE forecasting tasks across\nvarying coefficients, forces, or boundary conditions. Additionally, varied\ndownstream tasks reveal that the PI embeddings deciphered by PIANO align well\nwith the underlying invariants in the PDE systems, verifying the physical\nsignificance of PIANO. 
The source code will be publicly available at:\nhttps://github.com/optray/PIANO.\n","authors":["Rui Zhang","Qi Meng","Zhi-Ming Ma"],"pdf_url":"https://arxiv.org/pdf/2311.14361v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14359v1","updated":"2023-11-24T09:02:24Z","published":"2023-11-24T09:02:24Z","title":"Thompson sampling for zero-inflated count outcomes with an application\n to the Drink Less mobile health study","summary":" Mobile health (mHealth) technologies aim to improve distal outcomes, such as\nclinical conditions, by optimizing proximal outcomes through just-in-time\nadaptive interventions. Contextual bandits provide a suitable framework for\ncustomizing such interventions according to individual time-varying contexts,\nintending to maximize cumulative proximal outcomes. However, unique challenges\nsuch as modeling count outcomes within bandit frameworks have hindered the\nwidespread application of contextual bandits to mHealth studies. The current\nwork addresses this challenge by leveraging count data models into online\ndecision-making approaches. Specifically, we combine four common offline count\ndata models (Poisson, negative binomial, zero-inflated Poisson, and\nzero-inflated negative binomial regressions) with Thompson sampling, a popular\ncontextual bandit algorithm. The proposed algorithms are motivated by and\nevaluated on a real dataset from the Drink Less trial, where they are shown to\nimprove user engagement with the mHealth system. The proposed methods are\nfurther evaluated on simulated data, achieving improvement in maximizing\ncumulative proximal outcomes over existing algorithms. Theoretical results on\nregret bounds are also derived. A user-friendly R package countts that\nimplements the proposed methods for assessing contextual bandit algorithms is\nmade publicly available at https://cran.r-project.org/web/packages/countts.\n","authors":["Xueqing Liu","Nina Deliu","Tanujit Chakraborty","Lauren Bell","Bibhas Chakraborty"],"pdf_url":"https://arxiv.org/pdf/2311.14359v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.14415v4","updated":"2023-11-24T09:01:12Z","published":"2022-05-28T12:27:27Z","title":"Non-stationary Transformers: Exploring the Stationarity in Time Series\n Forecasting","summary":" Transformers have shown great power in time series forecasting due to their\nglobal-range modeling ability. However, their performance can degenerate\nterribly on non-stationary real-world data in which the joint distribution\nchanges over time. Previous studies primarily adopt stationarization to\nattenuate the non-stationarity of original series for better predictability.\nBut the stationarized series deprived of inherent non-stationarity can be less\ninstructive for real-world bursty events forecasting. This problem, termed\nover-stationarization in this paper, leads Transformers to generate\nindistinguishable temporal attentions for different series and impedes the\npredictive capability of deep models. To tackle the dilemma between series\npredictability and model capability, we propose Non-stationary Transformers as\na generic framework with two interdependent modules: Series Stationarization\nand De-stationary Attention. Concretely, Series Stationarization unifies the\nstatistics of each input and converts the output with restored statistics for\nbetter predictability. 
To address the over-stationarization problem,\nDe-stationary Attention is devised to recover the intrinsic non-stationary\ninformation into temporal dependencies by approximating distinguishable\nattentions learned from raw series. Our Non-stationary Transformers framework\nconsistently boosts mainstream Transformers by a large margin, which reduces\nMSE by 49.43% on Transformer, 47.34% on Informer, and 46.89% on Reformer,\nmaking them the state-of-the-art in time series forecasting. Code is available\nat this repository: https://github.com/thuml/Nonstationary_Transformers.\n","authors":["Yong Liu","Haixu Wu","Jianmin Wang","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2205.14415v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08745v2","updated":"2023-11-24T08:49:11Z","published":"2023-11-15T07:27:40Z","title":"Using Stochastic Gradient Descent to Smooth Nonconvex Functions:\n Analysis of Implicit Graduated Optimization with Optimal Noise Scheduling","summary":" The graduated optimization approach is a heuristic method for finding\nglobally optimal solutions for nonconvex functions and has been theoretically\nanalyzed in several studies. This paper defines a new family of nonconvex\nfunctions for graduated optimization, discusses their sufficient conditions,\nand provides a convergence analysis of the graduated optimization algorithm for\nthem. It shows that stochastic gradient descent (SGD) with mini-batch\nstochastic gradients has the effect of smoothing the function, the degree of\nwhich is determined by the learning rate and batch size. This finding provides\ntheoretical insights on why large batch sizes fall into sharp local minima, why\ndecaying learning rates and increasing batch sizes are superior to fixed\nlearning rates and batch sizes, and what the optimal learning rate scheduling\nis. To the best of our knowledge, this is the first paper to provide a\ntheoretical explanation for these aspects. Moreover, a new graduated\noptimization framework that uses a decaying learning rate and increasing batch\nsize is analyzed and experimental results of image classification that support\nour theoretical findings are reported.\n","authors":["Naoki Sato","Hideaki Iiduka"],"pdf_url":"https://arxiv.org/pdf/2311.08745v2.pdf","comment":"The latest version was updated on Nov. 24"},{"id":"http://arxiv.org/abs/2310.02970v2","updated":"2023-11-24T08:48:32Z","published":"2023-10-04T17:06:32Z","title":"Fast, Expressive SE$(n)$ Equivariant Networks through Weight-Sharing in\n Position-Orientation Space","summary":" Based on the theory of homogeneous spaces we derive \\textit{geometrically\noptimal edge attributes} to be used within the flexible message passing\nframework. We formalize the notion of weight sharing in convolutional networks\nas the sharing of message functions over point-pairs that should be treated\nequally. We define equivalence classes of point-pairs that are identical up to\na transformation in the group and derive attributes that uniquely identify\nthese classes. Weight sharing is then obtained by conditioning message\nfunctions on these attributes. As an application of the theory, we develop an\nefficient equivariant group convolutional network for processing 3D point\nclouds. The theory of homogeneous spaces tells us how to do group convolutions\nwith feature maps over the homogeneous space of positions $\\mathbb{R}^3$,\nposition and orientations $\\mathbb{R}^3 {\\times} S^2$, and the group SE$(3)$\nitself. 
Among these, $\\mathbb{R}^3 {\\times} S^2$ is an optimal choice due to\nthe ability to represent directional information, which $\\mathbb{R}^3$ methods\ncannot, and it significantly enhances computational efficiency compared to\nindexing features on the full SE$(3)$ group. We empirically support this claim\nby reaching state-of-the-art results -- in accuracy and speed -- on three\ndifferent benchmarks: interatomic potential energy prediction, trajectory\nforecasting in N-body systems, and generating molecules via equivariant\ndiffusion models.\n","authors":["Erik J Bekkers","Sharvaree Vadgama","Rob D Hesselink","Putri A van der Linden","David W Romero"],"pdf_url":"https://arxiv.org/pdf/2310.02970v2.pdf","comment":"Our code is publicly available at https://github.com/ebekkers/ponita"},{"id":"http://arxiv.org/abs/2311.14335v1","updated":"2023-11-24T08:16:39Z","published":"2023-11-24T08:16:39Z","title":"Comparative Analysis of Transformers for Modeling Tabular Data: A\n Casestudy using Industry Scale Dataset","summary":" We perform a comparative analysis of transformer-based models designed for\nmodeling tabular data, specifically on an industry-scale dataset. While earlier\nstudies demonstrated promising outcomes on smaller public or synthetic\ndatasets, the effectiveness did not extend to larger industry-scale datasets.\nThe challenges identified include handling high-dimensional data, the necessity\nfor efficient pre-processing of categorical and numerical features, and\naddressing substantial computational requirements.\n To overcome the identified challenges, the study conducts an extensive\nexamination of various transformer-based models using both synthetic datasets\nand the default prediction Kaggle dataset (2022) from American Express. The\npaper presents crucial insights into optimal data pre-processing, compares\npre-training and direct supervised learning methods, discusses strategies for\nmanaging categorical and numerical features, and highlights trade-offs between\ncomputational resources and performance. Focusing on temporal financial data\nmodeling, the research aims to facilitate the systematic development and\ndeployment of transformer-based models in real-world scenarios, emphasizing\nscalability.\n","authors":["Usneek Singh","Piyush Arora","Shamika Ganesan","Mohit Kumar","Siddhant Kulkarni","Salil R. Joshi"],"pdf_url":"https://arxiv.org/pdf/2311.14335v1.pdf","comment":"Accepted at 7th Joint International Conference on Data Science &\n Management of Data (11th ACMIKDD CODS and 29th COMAD)"},{"id":"http://arxiv.org/abs/2311.14333v1","updated":"2023-11-24T08:15:54Z","published":"2023-11-24T08:15:54Z","title":"Cycle Invariant Positional Encoding for Graph Representation Learning","summary":" Cycles are fundamental elements in graph-structured data and have\ndemonstrated their effectiveness in enhancing graph learning models. To encode\nsuch information into a graph learning framework, prior works often extract a\nsummary quantity, ranging from the number of cycles to the more sophisticated\npersistence diagram summaries. However, more detailed information, such as\nwhich edges are encoded in a cycle, has not yet been used in graph neural\nnetworks. In this paper, we make one step towards addressing this gap, and\npropose a structure encoding module, called CycleNet, that encodes cycle\ninformation via edge structure encoding in a permutation invariant manner. 
To\nefficiently encode the space of all cycles, we start with a cycle basis (i.e.,\na minimal set of cycles generating the cycle space) which we compute via the\nkernel of the 1-dimensional Hodge Laplacian of the input graph. To guarantee\nthe encoding is invariant w.r.t. the choice of cycle basis, we encode the cycle\ninformation via the orthogonal projector of the cycle basis, which is inspired\nby BasisNet proposed by Lim et al. We also develop a more efficient variant\nwhich however requires that the input graph has a unique shortest cycle basis.\nTo demonstrate the effectiveness of the proposed module, we provide some\ntheoretical understandings of its expressive power. Moreover, we show via a\nrange of experiments that networks enhanced by our CycleNet module perform\nbetter in various benchmarks compared to several existing SOTA models.\n","authors":["Zuoyu Yan","Tengfei Ma","Liangcai Gao","Zhi Tang","Chao Chen","Yusu Wang"],"pdf_url":"https://arxiv.org/pdf/2311.14333v1.pdf","comment":"Accepted as oral presentation in the Learning on Graphs Conference\n (LoG 2023)"},{"id":"http://arxiv.org/abs/2311.14332v1","updated":"2023-11-24T08:15:11Z","published":"2023-11-24T08:15:11Z","title":"GATGPT: A Pre-trained Large Language Model with Graph Attention Network\n for Spatiotemporal Imputation","summary":" The analysis of spatiotemporal data is increasingly utilized across diverse\ndomains, including transportation, healthcare, and meteorology. In real-world\nsettings, such data often contain missing elements due to issues like sensor\nmalfunctions and data transmission errors. The objective of spatiotemporal\nimputation is to estimate these missing values by understanding the inherent\nspatial and temporal relationships in the observed multivariate time series.\nTraditionally, spatiotemporal imputation has relied on specific, intricate\narchitectures designed for this purpose, which suffer from limited\napplicability and high computational complexity. In contrast, our approach\nintegrates pre-trained large language models (LLMs) into spatiotemporal\nimputation, introducing a groundbreaking framework, GATGPT. This framework\nmerges a graph attention mechanism with LLMs. We maintain most of the LLM\nparameters unchanged to leverage existing knowledge for learning temporal\npatterns, while fine-tuning the upper layers tailored to various applications.\nThe graph attention component enhances the LLM's ability to understand spatial\nrelationships. Through tests on three distinct real-world datasets, our\ninnovative approach demonstrates comparable results to established deep\nlearning benchmarks.\n","authors":["Yakun Chen","Xianzhi Wang","Guandong Xu"],"pdf_url":"https://arxiv.org/pdf/2311.14332v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05052v3","updated":"2023-11-24T08:00:48Z","published":"2023-10-08T07:25:27Z","title":"Accurate battery lifetime prediction across diverse aging conditions\n with deep learning","summary":" Accurately predicting the lifetime of battery cells in early cycles holds\ntremendous value for battery research and development as well as numerous\ndownstream applications. 
This task is rather challenging because diverse\nconditions, such as electrode materials, operating conditions, and working\nenvironments, collectively determine complex capacity-degradation behaviors.\nHowever, current prediction methods are developed and validated under limited\naging conditions, resulting in questionable adaptability to varied aging\nconditions and an inability to fully benefit from historical data collected\nunder different conditions. Here we introduce a universal deep learning\napproach that is capable of accommodating various aging conditions and\nfacilitating effective learning under low-resource conditions by leveraging\ndata from rich conditions. Our key finding is that incorporating inter-cell\nfeature differences, rather than solely considering single-cell\ncharacteristics, significantly increases the accuracy of battery lifetime\nprediction and its cross-condition robustness. Accordingly, we develop a\nholistic learning framework accommodating both single-cell and inter-cell\nmodeling. A comprehensive benchmark is built for evaluation, encompassing 401\nbattery cells utilizing 5 prevalent electrode materials across 168 cycling\nconditions. We demonstrate remarkable capabilities in learning across diverse\naging conditions, exclusively achieving 10% prediction error using the first\n100 cycles, and in facilitating low-resource learning, almost halving the error\nof single-cell modeling in many cases. More broadly, by breaking the learning\nboundaries among different aging conditions, our approach could significantly\naccelerate the development and optimization of lithium-ion batteries.\n","authors":["Han Zhang","Yuqi Li","Shun Zheng","Ziheng Lu","Xiaofan Gui","Wei Xu","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2310.05052v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14324v1","updated":"2023-11-24T07:53:48Z","published":"2023-11-24T07:53:48Z","title":"Large Language Models as Topological Structure Enhancers for\n Text-Attributed Graphs","summary":" The latest advancements in large language models (LLMs) have revolutionized\nthe field of natural language processing (NLP). Inspired by the success of LLMs\nin NLP tasks, some recent work has begun investigating the potential of\napplying LLMs in graph learning tasks. However, most of the existing work\nfocuses on utilizing LLMs as powerful node feature augmenters, leaving\nemploying LLMs to enhance graph topological structures an understudied problem.\nIn this work, we explore how to leverage the information retrieval and text\ngeneration capabilities of LLMs to refine/enhance the topological structure of\ntext-attributed graphs (TAGs) under the node classification setting. First, we\npropose using LLMs to help remove unreliable edges and add reliable ones in the\nTAG. Specifically, we first let the LLM output the semantic similarity between\nnode attributes through delicate prompt designs, and then perform edge deletion\nand edge addition based on the similarity. Second, we propose using\npseudo-labels generated by the LLM to improve graph topology, that is, we\nintroduce the pseudo-label propagation as a regularization to guide the graph\nneural network (GNN) in learning proper edge weights. Finally, we incorporate\nthe two aforementioned LLM-based methods for graph topological refinement into\nthe process of GNN training, and perform extensive experiments on four\nreal-world datasets. 
The experimental results demonstrate the effectiveness of\nLLM-based graph topology refinement (achieving a 0.15%--2.47% performance gain\non public benchmarks).\n","authors":["Shengyin Sun","Yuxiang Ren","Chen Ma","Xuecang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.14324v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2311.12612v2","updated":"2023-11-24T07:42:45Z","published":"2023-11-21T13:54:08Z","title":"A New Type Of Upper And Lower Bounds On Right-Tail Probabilities Of\n Continuous Random Variables","summary":" In this paper, I present a completely new type of upper and lower bounds on\nthe right-tail probabilities of continuous random variables with unbounded\nsupport and with semi-bounded support from the left. The presented upper and\nlower right-tail bounds depend only on the probability density function (PDF),\nits first derivative, and two parameters that are used for tightening the\nbounds. These tail bounds hold under certain conditions that depend on the PDF,\nits first and second derivatives, and the two parameters. The new tail bounds\nare shown to be tight for a wide range of continuous random variables via\nnumerical examples.\n","authors":["Nikola Zlatanov"],"pdf_url":"https://arxiv.org/pdf/2311.12612v2.pdf","comment":"Minor typos corrected"},{"id":"http://arxiv.org/abs/2310.04483v2","updated":"2023-11-24T07:26:10Z","published":"2023-10-06T12:33:32Z","title":"Reward Dropout Improves Control: Bi-objective Perspective on Reinforced\n LM","summary":" We study the theoretical aspects of Reinforced Language Models (RLMs) from a\nbi-objective optimization perspective. Specifically, we consider the RLMs as a\nPareto optimization problem that maximizes the two conflicting objectives,\ni.e., reward objective and likelihood objectives, simultaneously. Our main\ncontribution consists of three parts. First, we establish the theoretical\nfoundations of RLM as a Pareto optimization problem by presenting Reward Upper\nBOund (RUBO) and Pareto optimality. Our theoretical outcomes are supported by\nnot only deductive proofs but also empirical results. Second, we propose Reward\nDropout, a simple yet powerful method that guarantees to improve a bi-objective\noptimization of RLM. Lastly, we demonstrate that the Reward Dropout is\nconsistently effective across five benchmark datasets and four benchmark LLMs,\nmeaning that the Reward Dropout significantly improves the optimization\nperformance of RLMs.\n","authors":["Changhun Lee","Chiehyeon Lim"],"pdf_url":"https://arxiv.org/pdf/2310.04483v2.pdf","comment":"29 pages, 13 figures, conference"},{"id":"http://arxiv.org/abs/2305.19190v3","updated":"2023-11-24T07:23:12Z","published":"2023-05-30T16:34:28Z","title":"Inverse Approximation Theory for Nonlinear Recurrent Neural Networks","summary":" We prove an inverse approximation theorem for the approximation of nonlinear\nsequence-to-sequence relationships using recurrent neural networks (RNNs). This\nis a so-called Bernstein-type result in approximation theory, which deduces\nproperties of a target function under the assumption that it can be effectively\napproximated by a hypothesis space. In particular, we show that nonlinear\nsequence relationships that can be stably approximated by nonlinear RNNs must\nhave an exponential decaying memory structure - a notion that can be made\nprecise. 
This extends the previously identified curse of memory in linear RNNs\ninto the general nonlinear setting, and quantifies the essential limitations of\nthe RNN architecture for learning sequential relationships with long-term\nmemory. Based on the analysis, we propose a principled reparameterization\nmethod to overcome the limitations. Our theoretical results are confirmed by\nnumerical experiments. The code has been released in\nhttps://github.com/radarFudan/Curse-of-memory\n","authors":["Shida Wang","Zhong Li","Qianxiao Li"],"pdf_url":"https://arxiv.org/pdf/2305.19190v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12070v3","updated":"2023-11-24T07:00:54Z","published":"2023-06-21T07:43:23Z","title":"Task-Robust Pre-Training for Worst-Case Downstream Adaptation","summary":" Pre-training has achieved remarkable success when transferred to downstream\ntasks. In machine learning, we care about not only the good performance of a\nmodel but also its behavior under reasonable shifts of condition. The same\nphilosophy holds when pre-training a foundation model. However, the foundation\nmodel may not uniformly behave well for a series of related downstream tasks.\nThis happens, for example, when conducting mask recovery regression where the\nrecovery ability or the training instances diverge like pattern features are\nextracted dominantly on pre-training, but semantic features are also required\non a downstream task. This paper considers pre-training a model that guarantees\na uniformly good performance over the downstream tasks. We call this goal as\n$\\textit{downstream-task robustness}$. Our method first separates the upstream\ntask into several representative ones and applies a simple minimax loss for\npre-training. We then design an efficient algorithm to solve the minimax loss\nand prove its convergence in the convex setting. In the experiments, we show\nboth on large-scale natural language processing and computer vision datasets\nour method increases the metrics on worse-case downstream tasks. Additionally,\nsome theoretical explanations for why our loss is beneficial are provided.\nSpecifically, we show fewer samples are inherently required for the most\nchallenging downstream task in some cases.\n","authors":["Jianghui Wang","Yang Chen","Xingyu Xie","Cong Fang","Zhouchen Lin"],"pdf_url":"https://arxiv.org/pdf/2306.12070v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12379v2","updated":"2023-11-24T06:59:57Z","published":"2023-11-21T06:41:41Z","title":"Infinite forecast combinations based on Dirichlet process","summary":" Forecast combination integrates information from various sources by\nconsolidating multiple forecast results from the target time series. Instead of\nthe need to select a single optimal forecasting model, this paper introduces a\ndeep learning ensemble forecasting model based on the Dirichlet process.\nInitially, the learning rate is sampled with three basis distributions as\nhyperparameters to convert the infinite mixture into a finite one. All\ncheckpoints are collected to establish a deep learning sub-model pool, and\nweight adjustment and diversity strategies are developed during the combination\nprocess. The main advantage of this method is its ability to generate the\nrequired base learners through a single training process, utilizing the\ndecaying strategy to tackle the challenge posed by the stochastic nature of\ngradient descent in determining the optimal learning rate. 
To ensure the\nmethod's generalizability and competitiveness, this paper conducts an empirical\nanalysis using the weekly dataset from the M4 competition and explores\nsensitivity to the number of models to be combined. The results demonstrate\nthat the ensemble model proposed offers substantial improvements in prediction\naccuracy and stability compared to a single benchmark model.\n","authors":["Yinuo Ren","Feng Li","Yanfei Kang","Jue Wang"],"pdf_url":"https://arxiv.org/pdf/2311.12379v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02818v3","updated":"2023-11-24T06:36:55Z","published":"2023-11-06T01:41:46Z","title":"Signal Processing Meets SGD: From Momentum to Filter","summary":" In the field of deep learning, Stochastic Gradient Descent (SGD) and its\nmomentum-based variants are the predominant choices for optimization\nalgorithms. Despite all that, these momentum strategies, which accumulate\nhistorical gradients by using a fixed $\\beta$ hyperparameter to smooth the\noptimization processing, often neglect the potential impact of the variance of\nhistorical gradients on the current gradient estimation. In the gradient\nvariance during training, fluctuation indicates the objective function does not\nmeet the Lipschitz continuity condition at all time, which raises the\ntroublesome optimization problem. This paper aims to explore the potential\nbenefits of reducing the variance of historical gradients to make optimizer\nconverge to flat solutions. Moreover, we proposed a new optimization method\nbased on reducing the variance. We employed the Wiener filter theory to enhance\nthe first moment estimation of SGD, notably introducing an adaptive weight to\noptimizer. Specifically, the adaptive weight dynamically changes along with\ntemporal fluctuation of gradient variance during deep learning model training.\nExperimental results demonstrated our proposed adaptive weight optimizer, SGDF\n(Stochastic Gradient Descent With Filter), can achieve satisfactory performance\ncompared with state-of-the-art optimizers.\n","authors":["Zhipeng Yao","Guisong Chang","Jiaqi Zhang","Qi Zhang","Yu Zhang","Dazhou Li"],"pdf_url":"https://arxiv.org/pdf/2311.02818v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2010.07468 by other authors"},{"id":"http://arxiv.org/abs/2311.14305v1","updated":"2023-11-24T06:29:04Z","published":"2023-11-24T06:29:04Z","title":"New Epochs in AI Supervision: Design and Implementation of an Autonomous\n Radiology AI Monitoring System","summary":" With the increasingly widespread adoption of AI in healthcare, maintaining\nthe accuracy and reliability of AI models in clinical practice has become\ncrucial. In this context, we introduce novel methods for monitoring the\nperformance of radiology AI classification models in practice, addressing the\nchallenges of obtaining real-time ground truth for performance monitoring. We\npropose two metrics - predictive divergence and temporal stability - to be used\nfor preemptive alerts of AI performance changes. Predictive divergence,\nmeasured using Kullback-Leibler and Jensen-Shannon divergences, evaluates model\naccuracy by comparing predictions with those of two supplementary models.\nTemporal stability is assessed through a comparison of current predictions\nagainst historical moving averages, identifying potential model decay or data\ndrift. This approach was retrospectively validated using chest X-ray data from\na single-center imaging clinic, demonstrating its effectiveness in maintaining\nAI model reliability. 
By providing continuous, real-time insights into model\nperformance, our system ensures the safe and effective use of AI in clinical\ndecision-making, paving the way for more robust AI integration in healthcare.\n","authors":["Vasantha Kumar Venugopal","Abhishek Gupta","Rohit Takhar","Vidur Mahajan"],"pdf_url":"https://arxiv.org/pdf/2311.14305v1.pdf","comment":"10 pages, 4 figures, 2 tables"},{"id":"http://arxiv.org/abs/2311.14304v1","updated":"2023-11-24T06:27:25Z","published":"2023-11-24T06:27:25Z","title":"AdaMedGraph: Adaboosting Graph Neural Networks for Personalized Medicine","summary":" Precision medicine tailored to individual patients has gained significant\nattention in recent times. Machine learning techniques are now employed to\nprocess personalized data from various sources, including images, genetics, and\nassessments. These techniques have demonstrated good outcomes in many clinical\nprediction tasks. Notably, the approach of constructing graphs by linking\nsimilar patients and then applying graph neural networks (GNNs) stands out,\nbecause related information from analogous patients is aggregated and\nconsidered for prediction. However, selecting the appropriate edge feature to\ndefine patient similarity and construct the graph is challenging, given that\neach patient is depicted by high-dimensional features from diverse sources.\nPrevious studies rely on human expertise to select the edge feature, which is\nneither scalable nor efficient in pinpointing crucial edge features for complex\ndiseases. In this paper, we propose a novel algorithm named \\ours, which can\nautomatically select important features to construct multiple patient\nsimilarity graphs, and train GNNs based on these graphs as weak learners in\nadaptive boosting. \\ours{} is evaluated on two real-world medical scenarios and\nshows superior performance.\n","authors":["Jie Lian","Xufang Luo","Caihua Shan","Dongqi Han","Varut Vardhanabhuti","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2311.14304v1.pdf","comment":"Extended Abstract presented at Machine Learning for Health (ML4H)\n symposium 2023, December 10th, 2023, New Orleans, United States, 9 pages"},{"id":"http://arxiv.org/abs/2311.14301v1","updated":"2023-11-24T06:22:38Z","published":"2023-11-24T06:22:38Z","title":"GeoViT: A Versatile Vision Transformer Architecture for Geospatial Image\n Analysis","summary":" Greenhouse gases are pivotal drivers of climate change, necessitating precise\nquantification and source identification to foster mitigation strategies. We\nintroduce GeoViT, a compact vision transformer model adept at processing\nsatellite imagery for multimodal segmentation, classification, and regression\ntasks targeting CO2 and NO2 emissions. Leveraging GeoViT, we attain superior\naccuracy in estimating power generation rates, fuel type, plume coverage for\nCO2, and high-resolution NO2 concentration mapping, surpassing previous\nstate-of-the-art models while significantly reducing model size. 
GeoViT\ndemonstrates the efficacy of vision transformer architectures in harnessing\nsatellite-derived data for enhanced GHG emission insights, proving instrumental\nin advancing climate change monitoring and emission regulation efforts\nglobally.\n","authors":["Madhav Khirwar","Ankur Narang"],"pdf_url":"https://arxiv.org/pdf/2311.14301v1.pdf","comment":"Extended Abstract, Preprint"},{"id":"http://arxiv.org/abs/2309.15643v2","updated":"2023-11-24T06:20:25Z","published":"2023-09-27T13:29:38Z","title":"Why do Angular Margin Losses work well for Semi-Supervised Anomalous\n Sound Detection?","summary":" State-of-the-art anomalous sound detection systems often utilize angular\nmargin losses to learn suitable representations of acoustic data using an\nauxiliary task, which usually is a supervised or self-supervised classification\ntask. The underlying idea is that, in order to solve this auxiliary task,\nspecific information about normal data needs to be captured in the learned\nrepresentations and that this information is also sufficient to differentiate\nbetween normal and anomalous samples. Especially in noisy conditions,\ndiscriminative models based on angular margin losses tend to significantly\noutperform systems based on generative or one-class models. The goal of this\nwork is to investigate why using angular margin losses with auxiliary tasks\nworks well for detecting anomalous sounds. To this end, it is shown, both\ntheoretically and experimentally, that minimizing angular margin losses also\nminimizes compactness loss while inherently preventing learning trivial\nsolutions. Furthermore, multiple experiments are conducted to show that using a\nrelated classification task as an auxiliary task teaches the model to learn\nrepresentations suitable for detecting anomalous sounds in noisy conditions.\nAmong these experiments are performance evaluations, visualizing the embedding\nspace with t-SNE and visualizing the input representations with respect to the\nanomaly score using randomized input sampling for explanation.\n","authors":["Kevin Wilkinghoff","Frank Kurth"],"pdf_url":"https://arxiv.org/pdf/2309.15643v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02198v3","updated":"2023-11-24T06:08:20Z","published":"2023-11-03T19:03:20Z","title":"Imitation Bootstrapped Reinforcement Learning","summary":" Despite the considerable potential of reinforcement learning (RL), robotics\ncontrol tasks predominantly rely on imitation learning (IL) owing to its better\nsample efficiency. However, given the high cost of collecting extensive\ndemonstrations, RL is still appealing if it can utilize limited imitation data\nfor efficient autonomous self-improvement. Existing RL methods that utilize\ndemonstrations either initialize the replay buffer with demonstrations and\noversample them during RL training, which does not benefit from the\ngeneralization potential of modern IL methods, or pretrain the RL policy with\nIL on the demonstrations, which requires additional mechanisms to prevent\ncatastrophic forgetting during RL fine-tuning. We propose imitation\nbootstrapped reinforcement learning (IBRL), a novel framework that first trains\nan IL policy on a limited number of demonstrations and then uses it to propose\nalternative actions for both online exploration and target value bootstrapping.\nIBRL achieves SoTA performance and sample efficiency on 7 challenging sparse\nreward continuous control tasks in simulation while learning directly from\npixels. 
As a highlight of our method, IBRL achieves $6.4\\times$ higher success\nrate than RLPD, a strong method that combines the idea of oversampling\ndemonstrations with modern RL improvements, under the budget of 10 demos and\n100K interactions in the challenging PickPlaceCan task in the Robomimic\nbenchmark.\n","authors":["Hengyuan Hu","Suvir Mirchandani","Dorsa Sadigh"],"pdf_url":"https://arxiv.org/pdf/2311.02198v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.11588v3","updated":"2023-11-24T05:09:25Z","published":"2023-01-27T08:25:45Z","title":"Bounding Box-based Multi-objective Bayesian Optimization of Risk\n Measures under Input Uncertainty","summary":" In this study, we propose a novel multi-objective Bayesian optimization\n(MOBO) method to efficiently identify the Pareto front (PF) defined by risk\nmeasures for black-box functions under the presence of input uncertainty (IU).\nExisting BO methods for Pareto optimization in the presence of IU are\nrisk-specific or without theoretical guarantees, whereas our proposed method\naddresses general risk measures and has theoretical guarantees. The basic idea\nof the proposed method is to assume a Gaussian process (GP) model for the\nblack-box function and to construct high-probability bounding boxes for the\nrisk measures using the GP model. Furthermore, in order to reduce the\nuncertainty of non-dominated bounding boxes, we propose a method of selecting\nthe next evaluation point using a maximin distance defined by the maximum value\nof a quasi distance based on bounding boxes. As theoretical analysis, we prove\nthat the algorithm can return an arbitrary-accurate solution in a finite number\nof iterations with high probability, for various risk measures such as Bayes\nrisk, worst-case risk, and value-at-risk. We also give a theoretical analysis\nthat takes into account approximation errors because there exist non-negligible\napproximation errors (e.g., finite approximation of PFs and sampling-based\napproximation of bounding boxes) in practice. We confirm that the proposed\nmethod outperforms compared with existing methods not only in the setting with\nIU but also in the setting of ordinary MOBO through numerical experiments.\n","authors":["Yu Inatsu","Shion Takeno","Hiroyuki Hanada","Kazuki Iwata","Ichiro Takeuchi"],"pdf_url":"https://arxiv.org/pdf/2301.11588v3.pdf","comment":"39 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.14272v1","updated":"2023-11-24T04:16:32Z","published":"2023-11-24T04:16:32Z","title":"CRISP: Hybrid Structured Sparsity for Class-aware Model Pruning","summary":" Machine learning pipelines for classification tasks often train a universal\nmodel to achieve accuracy across a broad range of classes. However, a typical\nuser encounters only a limited selection of classes regularly. This disparity\nprovides an opportunity to enhance computational efficiency by tailoring models\nto focus on user-specific classes. Existing works rely on unstructured pruning,\nwhich introduces randomly distributed non-zero values in the model, making it\nunsuitable for hardware acceleration. Alternatively, some approaches employ\nstructured pruning, such as channel pruning, but these tend to provide only\nminimal compression and may lead to reduced model accuracy. In this work, we\npropose CRISP, a novel pruning framework leveraging a hybrid structured\nsparsity pattern that combines both fine-grained N:M structured sparsity and\ncoarse-grained block sparsity. 
Our pruning strategy is guided by a\ngradient-based class-aware saliency score, allowing us to retain weights\ncrucial for user-specific classes. CRISP achieves high accuracy with minimal\nmemory consumption for popular models like ResNet-50, VGG-16, and MobileNetV2\non ImageNet and CIFAR-100 datasets. Moreover, CRISP delivers up to 14$\\times$\nreduction in latency and energy consumption compared to existing pruning\nmethods while maintaining comparable accuracy. Our code is available at\nhttps://github.com/shivmgg/CRISP/.\n","authors":["Shivam Aggarwal","Kuluhan Binici","Tulika Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.14272v1.pdf","comment":"6 pages, accepted in Design, Automation & Test in Europe Conference &\n Exhibition (DATE) 2024"},{"id":"http://arxiv.org/abs/2311.14271v1","updated":"2023-11-24T04:15:10Z","published":"2023-11-24T04:15:10Z","title":"Segmentation-Based Parametric Painting","summary":" We introduce a novel image-to-painting method that facilitates the creation\nof large-scale, high-fidelity paintings with human-like quality and stylistic\nvariation. To process large images and gain control over the painting process,\nwe introduce a segmentation-based painting process and a dynamic attention map\napproach inspired by human painting strategies, allowing optimization of brush\nstrokes to proceed in batches over different image regions, thereby capturing\nboth large-scale structure and fine details, while also allowing stylistic\ncontrol over detail. Our optimized batch processing and patch-based loss\nframework enable efficient handling of large canvases, ensuring our painted\noutputs are both aesthetically compelling and functionally superior as compared\nto previous methods, as confirmed by rigorous evaluations. Code available at:\nhttps://github.com/manuelladron/semantic\\_based\\_painting.git\n","authors":["Manuel Ladron de Guevara","Matthew Fisher","Aaron Hertzmann"],"pdf_url":"https://arxiv.org/pdf/2311.14271v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2303.16210v4","updated":"2023-11-24T03:56:54Z","published":"2023-03-28T05:10:57Z","title":"Towards Reliable Uncertainty Quantification via Deep Ensembles in\n Multi-output Regression Task","summary":" This study aims to comprehensively investigate the deep ensemble approach, an\napproximate Bayesian inference, in the multi-output regression task for\npredicting the aerodynamic performance of a missile configuration. To this end,\nthe effect of the number of neural networks used in the ensemble, which has\nbeen blindly adopted in previous studies, is scrutinized. As a result, an\nobvious trend towards underestimation of uncertainty as it increases is\nobserved for the first time, and in this context, we propose the deep ensemble\nframework that applies the post-hoc calibration method to improve its\nuncertainty quantification performance. It is compared with Gaussian process\nregression and is shown to have superior performance in terms of regression\naccuracy ($\\uparrow55\\sim56\\%$), reliability of estimated uncertainty\n($\\uparrow38\\sim77\\%$), and training efficiency ($\\uparrow78\\%$). Finally, the\npotential impact of the suggested framework on the Bayesian optimization is\nbriefly examined, indicating that deep ensemble without calibration may lead to\nunintended exploratory behavior. 
This UQ framework can be seamlessly applied\nand extended to any regression task, as no special assumptions have been made\nfor the specific problem used in this study.\n","authors":["Sunwoong Yang","Kwanjung Yee"],"pdf_url":"https://arxiv.org/pdf/2303.16210v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12727v2","updated":"2023-11-24T03:27:31Z","published":"2023-11-21T17:03:21Z","title":"Soft Random Sampling: A Theoretical and Empirical Analysis","summary":" Soft random sampling (SRS) is a simple yet effective approach for efficient\ntraining of large-scale deep neural networks when dealing with massive data.\nSRS selects a subset uniformly at random with replacement from the full data\nset in each epoch. In this paper, we conduct a theoretical and empirical\nanalysis of SRS. First, we analyze its sampling dynamics including data\ncoverage and occupancy. Next, we investigate its convergence with non-convex\nobjective functions and give the convergence rate. Finally, we provide its\ngeneralization performance. We empirically evaluate SRS for image recognition\non CIFAR10 and automatic speech recognition on Librispeech and an in-house\npayload dataset to demonstrate its effectiveness. Compared to existing\ncoreset-based data selection methods, SRS offers a better accuracy-efficiency\ntrade-off. Especially on real-world industrial scale data sets, it is shown to\nbe a powerful training strategy with significant speedup and competitive\nperformance with almost no additional computing cost.\n","authors":["Xiaodong Cui","Ashish Mittal","Songtao Lu","Wei Zhang","George Saon","Brian Kingsbury"],"pdf_url":"https://arxiv.org/pdf/2311.12727v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16483v2","updated":"2023-11-24T03:07:39Z","published":"2023-08-31T06:44:42Z","title":"Improving Out-of-Distribution Detection in Echocardiographic View\n Classication through Enhancing Semantic Features","summary":" In echocardiographic view classification, accurately detecting\nout-of-distribution (OOD) data is essential but challenging, especially given\nthe subtle differences between in-distribution and OOD data. While conventional\nOOD detection methods, such as Mahalanobis distance (MD) are effective in\nfar-OOD scenarios with clear distinctions between distributions, they struggle\nto discern the less obvious variations characteristic of echocardiographic\ndata. In this study, we introduce a novel use of label smoothing to enhance\nsemantic feature representation in echocardiographic images, demonstrating that\nthese enriched semantic features are key for significantly improving near-OOD\ninstance detection. By combining label smoothing with MD-based OOD detection,\nwe establish a new benchmark for accuracy in echocardiographic OOD detection.\n","authors":["Jaeik Jeon","Seongmin Ha","Yeonggul Jang","Yeonyee E. Yoon","Jiyeon Kim","Hyunseok Jeong","Dawun Jeong","Youngtaek Hong","Seung-Ah Lee Hyuk-Jae Chang"],"pdf_url":"https://arxiv.org/pdf/2308.16483v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11995v3","updated":"2023-11-24T02:51:30Z","published":"2023-11-20T18:26:01Z","title":"BrainWash: A Poisoning Attack to Forget in Continual Learning","summary":" Continual learning has gained substantial attention within the deep learning\ncommunity, offering promising solutions to the challenging problem of\nsequential learning. Yet, a largely unexplored facet of this paradigm is its\nsusceptibility to adversarial attacks, especially with the aim of inducing\nforgetting. 
In this paper, we introduce \"BrainWash,\" a novel data poisoning\nmethod tailored to impose forgetting on a continual learner. By adding the\nBrainWash noise to a variety of baselines, we demonstrate how a trained\ncontinual learner can be induced to forget its previously learned tasks\ncatastrophically, even when using these continual learning baselines. An\nimportant feature of our approach is that the attacker requires no access to\nprevious tasks' data and is armed merely with the model's current parameters\nand the data belonging to the most recent task. Our extensive experiments\nhighlight the efficacy of BrainWash, showcasing degradation in performance\nacross various regularization-based continual learning methods.\n","authors":["Ali Abbasi","Parsa Nooralinejad","Hamed Pirsiavash","Soheil Kolouri"],"pdf_url":"https://arxiv.org/pdf/2311.11995v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14255v1","updated":"2023-11-24T02:42:42Z","published":"2023-11-24T02:42:42Z","title":"Out-of-Distribution Generalized Dynamic Graph Neural Network with\n Disentangled Intervention and Invariance Promotion","summary":" Dynamic graph neural networks (DyGNNs) have demonstrated powerful predictive\nabilities by exploiting graph structural and temporal dynamics. However, the\nexisting DyGNNs fail to handle distribution shifts, which naturally exist in\ndynamic graphs, mainly because the patterns exploited by DyGNNs may be variant\nwith respect to labels under distribution shifts. In this paper, we propose\nDisentangled Intervention-based Dynamic graph Attention networks with\nInvariance Promotion (I-DIDA) to handle spatio-temporal distribution shifts in\ndynamic graphs by discovering and utilizing invariant patterns, i.e.,\nstructures and features whose predictive abilities are stable across\ndistribution shifts. Specifically, we first propose a disentangled\nspatio-temporal attention network to capture the variant and invariant\npatterns. By utilizing the disentangled patterns, we design a spatio-temporal\nintervention mechanism to create multiple interventional distributions and an\nenvironment inference module to infer the latent spatio-temporal environments,\nand minimize the variance of predictions among these intervened distributions\nand environments, so that our model can make predictions based on invariant\npatterns with stable predictive abilities under distribution shifts. Extensive\nexperiments demonstrate the superiority of our method over state-of-the-art\nbaselines under distribution shifts. Our work is the first study of\nspatio-temporal distribution shifts in dynamic graphs, to the best of our\nknowledge.\n","authors":["Zeyang Zhang","Xin Wang","Ziwei Zhang","Haoyang Li","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.14255v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17623v2","updated":"2023-11-24T01:45:16Z","published":"2023-10-26T17:43:13Z","title":"Proving Test Set Contamination in Black Box Language Models","summary":" Large language models are trained on vast amounts of internet data, prompting\nconcerns and speculation that they have memorized public benchmarks. Going from\nspeculation to proof of contamination is challenging, as the pretraining data\nused by proprietary models are often not publicly accessible. We show that it\nis possible to provide provable guarantees of test set contamination in\nlanguage models without access to pretraining data or model weights. 
Our\napproach leverages the fact that when there is no data contamination, all\norderings of an exchangeable benchmark should be equally likely. In contrast,\nthe tendency for language models to memorize example order means that a\ncontaminated language model will find certain canonical orderings to be much\nmore likely than others. Our test flags potential contamination whenever the\nlikelihood of a canonically ordered benchmark dataset is significantly higher\nthan the likelihood after shuffling the examples. We demonstrate that our\nprocedure is sensitive enough to reliably prove test set contamination in\nchallenging situations, including models as small as 1.4 billion parameters, on\nsmall test sets of only 1000 examples, and datasets that appear only a few\ntimes in the pretraining corpus. Using our test, we audit five popular publicly\naccessible language models for test set contamination and find little evidence\nfor pervasive contamination.\n","authors":["Yonatan Oren","Nicole Meister","Niladri Chatterji","Faisal Ladhak","Tatsunori B. Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2310.17623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.14483v3","updated":"2023-11-24T00:58:48Z","published":"2023-03-25T14:29:20Z","title":"Spatio-Temporal Graph Neural Networks for Predictive Learning in Urban\n Computing: A Survey","summary":" With recent advances in sensing technologies, a myriad of spatio-temporal\ndata has been generated and recorded in smart cities. Forecasting the evolution\npatterns of spatio-temporal data is an important yet demanding aspect of urban\ncomputing, which can enhance intelligent management decisions in various\nfields, including transportation, environment, climate, public safety,\nhealthcare, and others. Traditional statistical and deep learning methods\nstruggle to capture complex correlations in urban spatio-temporal data. To this\nend, Spatio-Temporal Graph Neural Networks (STGNN) have been proposed,\nachieving great promise in recent years. STGNNs enable the extraction of\ncomplex spatio-temporal dependencies by integrating graph neural networks\n(GNNs) and various temporal learning methods. In this manuscript, we provide a\ncomprehensive survey on recent progress on STGNN technologies for predictive\nlearning in urban computing. Firstly, we provide a brief introduction to the\nconstruction methods of spatio-temporal graph data and the prevalent\ndeep-learning architectures used in STGNNs. We then sort out the primary\napplication domains and specific predictive learning tasks based on existing\nliterature. Afterward, we scrutinize the design of STGNNs and their combination\nwith some advanced technologies in recent years. Finally, we conclude the\nlimitations of existing research and suggest potential directions for future\nwork.\n","authors":["Guangyin Jin","Yuxuan Liang","Yuchen Fang","Zezhi Shao","Jincai Huang","Junbo Zhang","Yu Zheng"],"pdf_url":"https://arxiv.org/pdf/2303.14483v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.07628v2","updated":"2023-11-24T00:38:52Z","published":"2021-12-14T18:13:36Z","title":"Training Multi-Layer Over-Parametrized Neural Network in Subquadratic\n Time","summary":" We consider the problem of training a multi-layer over-parametrized neural\nnetwork to minimize the empirical risk induced by a loss function. 
In the\ntypical setting of over-parametrization, the network width $m$ is much larger\nthan the data dimension $d$ and the number of training samples $n$\n($m=\\mathrm{poly}(n,d)$), which induces a prohibitive large weight matrix $W\\in\n\\mathbb{R}^{m\\times m}$ per layer. Naively, one has to pay $O(m^2)$ time to\nread the weight matrix and evaluate the neural network function in both forward\nand backward computation. In this work, we show how to reduce the training cost\nper iteration. Specifically, we propose a framework that uses $m^2$ cost only\nin the initialization phase and achieves \\emph{a truly subquadratic cost per\niteration} in terms of $m$, i.e., $m^{2-\\Omega(1)}$ per iteration. Our result\nhas implications beyond standard over-parametrization theory, as it can be\nviewed as designing an efficient data structure on top of a pre-trained large\nmodel to further speed up the fine-tuning process, a core procedure to deploy\nlarge language models (LLM).\n","authors":["Zhao Song","Lichen Zhang","Ruizhe Zhang"],"pdf_url":"https://arxiv.org/pdf/2112.07628v2.pdf","comment":"ITCS 2024"},{"id":"http://arxiv.org/abs/2311.14237v1","updated":"2023-11-24T00:36:17Z","published":"2023-11-24T00:36:17Z","title":"Pseudo-label Correction for Instance-dependent Noise Using\n Teacher-student Framework","summary":" The high capacity of deep learning models to learn complex patterns poses a\nsignificant challenge when confronted with label noise. The inability to\ndifferentiate clean and noisy labels ultimately results in poor generalization.\nWe approach this problem by reassigning the label for each image using a new\nteacher-student based framework termed P-LC (pseudo-label correction).\nTraditional teacher-student networks are composed of teacher and student\nclassifiers for knowledge distillation. In our novel approach, we reconfigure\nthe teacher network into a triple encoder, leveraging the triplet loss to\nestablish a pseudo-label correction system. As the student generates pseudo\nlabels for a set of given images, the teacher learns to choose between the\ninitially assigned labels and the pseudo labels. Experiments on MNIST,\nFashion-MNIST, and SVHN demonstrate P-LC's superior performance over existing\nstate-of-the-art methods across all noise levels, most notably in high noise.\nIn addition, we introduce a noise level estimation to help assess model\nperformance and inform the need for additional data cleaning procedures.\n","authors":["Eugene Kim"],"pdf_url":"https://arxiv.org/pdf/2311.14237v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09384v2","updated":"2023-11-24T00:25:38Z","published":"2023-09-17T21:43:18Z","title":"Mitigating Over-Smoothing and Over-Squashing using Augmentations of\n Forman-Ricci Curvature","summary":" While Graph Neural Networks (GNNs) have been successfully leveraged for\nlearning on graph-structured data across domains, several potential pitfalls\nhave been described recently. Those include the inability to accurately\nleverage information encoded in long-range connections (over-squashing), as\nwell as difficulties distinguishing the learned representations of nearby nodes\nwith growing network depth (over-smoothing). An effective way to characterize\nboth effects is discrete curvature: Long-range connections that underlie\nover-squashing effects have low curvature, whereas edges that contribute to\nover-smoothing have high curvature. 
This observation has given rise to rewiring\ntechniques, which add or remove edges to mitigate over-smoothing and\nover-squashing. Several rewiring approaches utilizing graph characteristics,\nsuch as curvature or the spectrum of the graph Laplacian, have been proposed.\nHowever, existing methods, especially those based on curvature, often require\nexpensive subroutines and careful hyperparameter tuning, which limits their\napplicability to large-scale graphs. Here we propose a rewiring technique based\non Augmented Forman-Ricci curvature (AFRC), a scalable curvature notation,\nwhich can be computed in linear time. We prove that AFRC effectively\ncharacterizes over-smoothing and over-squashing effects in message-passing\nGNNs. We complement our theoretical results with experiments, which demonstrate\nthat the proposed approach achieves state-of-the-art performance while\nsignificantly reducing the computational cost in comparison with other methods.\nUtilizing fundamental properties of discrete curvature, we propose effective\nheuristics for hyperparameters in curvature-based rewiring, which avoids\nexpensive hyperparameter searches, further improving the scalability of the\nproposed approach.\n","authors":["Lukas Fesser","Melanie Weber"],"pdf_url":"https://arxiv.org/pdf/2309.09384v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01017v2","updated":"2023-11-24T00:24:06Z","published":"2023-11-02T06:21:56Z","title":"Learning Unsupervised World Models for Autonomous Driving via Discrete\n Diffusion","summary":" Learning world models can teach an agent how the world works in an\nunsupervised manner. Even though it can be viewed as a special case of sequence\nmodeling, progress for scaling world models on robotic applications such as\nautonomous driving has been somewhat less rapid than scaling language models\nwith Generative Pre-trained Transformers (GPT). We identify two reasons as\nmajor bottlenecks: dealing with complex and unstructured observation space, and\nhaving a scalable generative model. Consequently, we propose a novel world\nmodeling approach that first tokenizes sensor observations with VQVAE, then\npredicts the future via discrete diffusion. To efficiently decode and denoise\ntokens in parallel, we recast Masked Generative Image Transformer into the\ndiscrete diffusion framework with a few simple changes, resulting in notable\nimprovement. When applied to learning world models on point cloud observations,\nour model reduces prior SOTA Chamfer distance by more than 65% for 1s\nprediction, and more than 50% for 3s prediction, across NuScenes, KITTI\nOdometry, and Argoverse2 datasets. Our results demonstrate that discrete\ndiffusion on tokenized agent experience can unlock the power of GPT-like\nunsupervised learning for robotic agents.\n","authors":["Lunjun Zhang","Yuwen Xiong","Ze Yang","Sergio Casas","Rui Hu","Raquel Urtasun"],"pdf_url":"https://arxiv.org/pdf/2311.01017v2.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2306.07848v9","updated":"2023-11-24T15:04:50Z","published":"2023-06-13T15:28:10Z","title":"GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio\n Pretraining for Accurate Speech Emotion Recognition","summary":" Contrastive cross-modality pretraining has recently exhibited impressive\nsuccess in diverse fields, whereas there is limited research on their merits in\nspeech emotion recognition (SER). 
In this paper, we propose GEmo-CLAP, a kind\nof gender-attribute-enhanced contrastive language-audio pretraining (CLAP)\nmethod for SER. Specifically, we first construct an effective emotion CLAP\n(Emo-CLAP) for SER, using pre-trained text and audio encoders. Second, given\nthe significance of gender information in SER, two novel multi-task learning\nbased GEmo-CLAP (ML-GEmo-CLAP) and soft label based GEmo-CLAP (SL-GEmo-CLAP)\nmodels are further proposed to incorporate gender information of speech\nsignals, forming more reasonable objectives. Experiments on IEMOCAP indicate\nthat our two proposed GEmo-CLAP models consistently outperform Emo-CLAP with\ndifferent pre-trained models. Remarkably, the proposed WavLM-based SL-GEmo-CLAP\nobtains the best UAR of 81.43\\% and WAR of 83.16\\%, performing better than\nstate-of-the-art SER methods.\n","authors":["Yu Pan","Yanni Hu","Yuguang Yang","Wen Fei","Jixun Yao","Heng Lu","Lei Ma","Jianjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2306.07848v9.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2311.12401v2","updated":"2023-11-24T08:51:13Z","published":"2023-11-21T07:28:51Z","title":"CASR: Refining Action Segmentation via Marginalizing Frame-level Causal\n Relationships","summary":" Integrating deep learning and causal discovery has increased the\ninterpretability of Temporal Action Segmentation (TAS) tasks. However,\nframe-level causal relationships contain many complicated noises outside the\nsegment level, making it infeasible to directly express macro action semantics.\nThus, we propose Causal Abstraction Segmentation Refiner (CASR), which can\nrefine TAS results from various models by enhancing video causality in\nmarginalizing frame-level causal relationships. Specifically, we define the\nequivalent frame-level causal model and segment-level causal model, so that the\ncausal adjacency matrix constructed from marginalized frame-level causal\nrelationships has the ability to represent the segment-level causal\nrelationships. CASR works by reducing the difference between the causal\nadjacency matrix we constructed and that of the pre-segmentation results of\nbackbone models. In addition, we propose a novel evaluation metric, Causal Edit\nDistance (CED), to evaluate causal interpretability. Extensive experimental\nresults on mainstream datasets indicate that CASR significantly surpasses\nvarious existing methods in action segmentation performance, as well as in\ncausal explainability and generalization.\n","authors":["Keqing Du","Xinyu Yang","Hang Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12401v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15064v2","updated":"2023-11-24T02:58:58Z","published":"2023-07-27T17:59:59Z","title":"Self-Supervised Visual Acoustic Matching","summary":" Acoustic matching aims to re-synthesize an audio clip to sound as if it were\nrecorded in a target acoustic environment. Existing methods assume access to\npaired training data, where the audio is observed in both source and target\nenvironments, but this limits the diversity of training data or requires the\nuse of simulated data or heuristics to create paired samples. We propose a\nself-supervised approach to visual acoustic matching where training samples\ninclude only the target scene image and audio -- without acoustically\nmismatched source audio for reference. 
Our approach jointly learns to\ndisentangle room acoustics and re-synthesize audio into the target environment,\nvia a conditional GAN framework and a novel metric that quantifies the level of\nresidual acoustic information in the de-biased audio. Training with either\nin-the-wild web data or simulated data, we demonstrate that it outperforms the\nstate-of-the-art on multiple challenging datasets and a wide variety of\nreal-world audio and environments.\n","authors":["Arjun Somayazulu","Changan Chen","Kristen Grauman"],"pdf_url":"https://arxiv.org/pdf/2307.15064v2.pdf","comment":"Project page: https://vision.cs.utexas.edu/projects/ss_vam/ .\n Accepted at NeurIPS 2023"}]},"2023-11-23T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.14212v1","updated":"2023-11-23T21:54:22Z","published":"2023-11-23T21:54:22Z","title":"Annotation Sensitivity: Training Data Collection Methods Affect Model\n Performance","summary":" When training data are collected from human annotators, the design of the\nannotation instrument, the instructions given to annotators, the\ncharacteristics of the annotators, and their interactions can impact training\ndata. This study demonstrates that design choices made when creating an\nannotation instrument also impact the models trained on the resulting\nannotations.\n We introduce the term annotation sensitivity to refer to the impact of\nannotation data collection methods on the annotations themselves and on\ndownstream model performance and predictions.\n We collect annotations of hate speech and offensive language in five\nexperimental conditions of an annotation instrument, randomly assigning\nannotators to conditions. We then fine-tune BERT models on each of the five\nresulting datasets and evaluate model performance on a holdout portion of each\ncondition. We find considerable differences between the conditions for 1) the\nshare of hate speech/offensive language annotations, 2) model performance, 3)\nmodel predictions, and 4) model learning curves.\n Our results emphasize the crucial role played by the annotation instrument,\nwhich has received little attention in the machine learning literature. We call\nfor additional research into how and why the instrument impacts the annotations\nto inform the development of best practices in instrument design.\n","authors":["Christoph Kern","Stephanie Eckman","Jacob Beck","Rob Chew","Bolei Ma","Frauke Kreuter"],"pdf_url":"https://arxiv.org/pdf/2311.14212v1.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.14199v1","updated":"2023-11-23T20:52:44Z","published":"2023-11-23T20:52:44Z","title":"A Systematic Review of Deep Learning-based Research on Radiology Report\n Generation","summary":" Radiology report generation (RRG) aims to automatically generate free-text\ndescriptions from clinical radiographs, e.g., chest X-Ray images. RRG plays an\nessential role in promoting clinical automation and provides practical\nassistance to inexperienced doctors while alleviating radiologists' workloads.\nTherefore, considering these meaningful potentials, research on RRG has been\nexperiencing explosive growth over the past half-decade, especially with the\nrapid development of deep learning approaches. Existing\nstudies perform RRG from the perspective of enhancing different modalities,\nprovide insights on optimizing the report generation process with elaborated\nfeatures from both visual and textual information, and further facilitate RRG\nwith the cross-modal interactions among them. 
In this paper, we present a\ncomprehensive review of deep learning-based RRG from various perspectives.\nSpecifically, we firstly cover pivotal RRG approaches based on the\ntask-specific features of radiographs, reports, and the cross-modal relations\nbetween them, and then illustrate the benchmark datasets conventionally used\nfor this task with evaluation metrics, subsequently analyze the performance of\ndifferent approaches and finally offer our summary on the challenges and the\ntrends in future directions. Overall, the goal of this paper is to serve as a\ntool for understanding existing literature and inspiring potential valuable\nresearch in the field of RRG.\n","authors":["Chang Liu","Yuanhe Tian","Yan Song"],"pdf_url":"https://arxiv.org/pdf/2311.14199v1.pdf","comment":"26 pages, 6 figures"},{"id":"http://arxiv.org/abs/2307.06435v6","updated":"2023-11-23T19:23:19Z","published":"2023-07-12T20:01:52Z","title":"A Comprehensive Overview of Large Language Models","summary":" Large Language Models (LLMs) have recently demonstrated remarkable\ncapabilities in natural language processing tasks and beyond. This success of\nLLMs has led to a large influx of research contributions in this direction.\nThese works encompass diverse topics such as architectural innovations, better\ntraining strategies, context length improvements, fine-tuning, multi-modal\nLLMs, robotics, datasets, benchmarking, efficiency, and more. With the rapid\ndevelopment of techniques and regular breakthroughs in LLM research, it has\nbecome considerably challenging to perceive the bigger picture of the advances\nin this direction. Considering the rapidly emerging plethora of literature on\nLLMs, it is imperative that the research community is able to benefit from a\nconcise yet comprehensive overview of the recent developments in this field.\nThis article provides an overview of the existing literature on a broad range\nof LLM-related concepts. Our self-contained comprehensive overview of LLMs\ndiscusses relevant background concepts along with covering the advanced topics\nat the frontier of research in LLMs. This review article is intended to not\nonly provide a systematic survey but also a quick comprehensive reference for\nthe researchers and practitioners to draw insights from extensive informative\nsummaries of the existing works to advance the LLM research.\n","authors":["Humza Naveed","Asad Ullah Khan","Shi Qiu","Muhammad Saqib","Saeed Anwar","Muhammad Usman","Naveed Akhtar","Nick Barnes","Ajmal Mian"],"pdf_url":"https://arxiv.org/pdf/2307.06435v6.pdf","comment":"Work in-progress"},{"id":"http://arxiv.org/abs/2311.14169v1","updated":"2023-11-23T19:20:59Z","published":"2023-11-23T19:20:59Z","title":"Evaluating GPT-4's Vision Capabilities on Brazilian University Admission\n Exams","summary":" Recent advancements in language models have showcased human-comparable\nperformance in academic entrance exams. However, existing studies often\noverlook questions that require the integration of visual comprehension, thus\ncompromising the full spectrum and complexity inherent in real-world scenarios.\nTo address this gap, we present a comprehensive framework to evaluate language\nmodels on entrance exams, which incorporates both textual and visual elements.\nWe evaluate the two most recent editions of Exame Nacional do Ensino M\\'edio\n(ENEM), the main standardized entrance examination adopted by Brazilian\nuniversities. 
Our study not only reaffirms the capabilities of GPT-4 as the\nstate of the art for handling complex multidisciplinary questions, but also\npioneers in offering a realistic assessment of multimodal language models on\nPortuguese examinations. One of the highlights is that text captions\ntranscribing visual content outperform the direct use of images, suggesting\nthat the vision model has room for improvement. Yet, despite improvements\nafforded by images or captions, mathematical questions remain a challenge for\nthese state-of-the-art models. The code and data used on experiments are\navailable at https://github.com/piresramon/gpt-4-enem.\n","authors":["Ramon Pires","Thales Sales Almeida","Hugo Abonizio","Rodrigo Nogueira"],"pdf_url":"https://arxiv.org/pdf/2311.14169v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.17003"},{"id":"http://arxiv.org/abs/2311.14126v1","updated":"2023-11-23T17:47:14Z","published":"2023-11-23T17:47:14Z","title":"Towards Auditing Large Language Models: Improving Text-based Stereotype\n Detection","summary":" Large Language Models (LLM) have made significant advances in the recent past\nbecoming more mainstream in Artificial Intelligence (AI) enabled human-facing\napplications. However, LLMs often generate stereotypical output inherited from\nhistorical data, amplifying societal biases and raising ethical concerns. This\nwork introduces i) the Multi-Grain Stereotype Dataset, which includes 52,751\ninstances of gender, race, profession and religion stereotypic text and ii) a\nnovel stereotype classifier for English text. We design several experiments to\nrigorously test the proposed model trained on the novel dataset. Our\nexperiments show that training the model in a multi-class setting can\noutperform the one-vs-all binary counterpart. Consistent feature importance\nsignals from different eXplainable AI tools demonstrate that the new model\nexploits relevant text features. We utilise the newly created model to assess\nthe stereotypic behaviour of the popular GPT family of models and observe the\nreduction of bias over time. In summary, our work establishes a robust and\npractical framework for auditing and evaluating the stereotypic bias in LLM.\n","authors":["Wu Zekun","Sahan Bulathwela","Adriano Soares Koshiyama"],"pdf_url":"https://arxiv.org/pdf/2311.14126v1.pdf","comment":"2023 NeurIPS SoLaR Workshop Accepted"},{"id":"http://arxiv.org/abs/2311.14115v1","updated":"2023-11-23T17:20:36Z","published":"2023-11-23T17:20:36Z","title":"A density estimation perspective on learning from pairwise human\n preferences","summary":" Learning from human feedback (LHF) -- and in particular learning from\npairwise preferences -- has recently become a crucial ingredient in training\nlarge language models (LLMs), and has been the subject of much research. Most\nrecent works frame it as a reinforcement learning problem, where a reward\nfunction is learned from pairwise preference data and the LLM is treated as a\npolicy which is adapted to maximize the rewards, often under additional\nregularization constraints. We propose an alternative interpretation which\ncenters on the generative process for pairwise preferences and treats LHF as a\ndensity estimation problem. 
We provide theoretical and empirical results\nshowing that for a family of generative processes defined via preference\nbehavior distribution equations, training a reward function on pairwise\npreferences effectively models an annotator's implicit preference distribution.\nFinally, we discuss and present findings on \"annotator misspecification\" --\nfailure cases where wrong modeling assumptions are made about annotator\nbehavior, resulting in poorly-adapted models -- suggesting that approaches that\nlearn from pairwise human preferences could have trouble learning from a\npopulation of annotators with diverse viewpoints.\n","authors":["Vincent Dumoulin","Daniel D. Johnson","Pablo Samuel Castro","Hugo Larochelle","Yann Dauphin"],"pdf_url":"https://arxiv.org/pdf/2311.14115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14096v1","updated":"2023-11-23T16:45:56Z","published":"2023-11-23T16:45:56Z","title":"Auditing and Mitigating Cultural Bias in LLMs","summary":" Culture fundamentally shapes people's reasoning, behavior, and communication.\nGenerative artificial intelligence (AI) technologies may cause a shift towards\na dominant culture. As people increasingly use AI to expedite and even automate\nvarious professional and personal tasks, cultural values embedded in AI models\nmay bias authentic expression. We audit large language models for cultural\nbias, comparing their responses to nationally representative survey data, and\nevaluate country-specific prompting as a mitigation strategy. We find that\nGPT-4, 3.5 and 3 exhibit cultural values resembling English-speaking and\nProtestant European countries. Our mitigation strategy reduces cultural bias in\nrecent models but not for all countries/territories. To avoid cultural bias in\ngenerative AI, especially in high-stakes contexts, we suggest using culture\nmatching and ongoing cultural audits.\n","authors":["Yan Tao","Olga Viberg","Ryan S. Baker","Rene F. Kizilcec"],"pdf_url":"https://arxiv.org/pdf/2311.14096v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14087v1","updated":"2023-11-23T16:26:24Z","published":"2023-11-23T16:26:24Z","title":"Question Answering in Natural Language: the Special Case of Temporal\n Expressions","summary":" Although general question answering has been well explored in recent years,\ntemporal question answering is a task which has not received as much focus. Our\nwork aims to leverage a popular approach used for general question answering,\nanswer extraction, in order to find answers to temporal questions within a\nparagraph. To train our model, we propose a new dataset, inspired by SQuAD,\nspecifically tailored to provide rich temporal information. We chose to adapt\nthe corpus WikiWars, which contains several documents on history's greatest\nconflicts. 
Our evaluation shows that a deep learning model trained to perform\npattern matching, often used in general question answering, can be adapted to\ntemporal question answering, if we accept to ask questions whose answers must\nbe directly present within a text.\n","authors":["Armand Stricker"],"pdf_url":"https://arxiv.org/pdf/2311.14087v1.pdf","comment":"Accepted at Student Research Workshop associated with RANLP-2021"},{"id":"http://arxiv.org/abs/2311.14076v1","updated":"2023-11-23T16:08:39Z","published":"2023-11-23T16:08:39Z","title":"Searching for Snippets of Open-Domain Dialogue in Task-Oriented Dialogue\n Datasets","summary":" Most existing dialogue corpora and models have been designed to fit into 2\npredominant categories : task-oriented dialogues portray functional goals, such\nas making a restaurant reservation or booking a plane ticket, while\nchit-chat/open-domain dialogues focus on holding a socially engaging talk with\na user. However, humans tend to seamlessly switch between modes and even use\nchitchat to enhance task-oriented conversations. To bridge this gap, new\ndatasets have recently been created, blending both communication modes into\nconversation examples. The approaches used tend to rely on adding chit-chat\nsnippets to pre-existing, human-generated task-oriented datasets. Given the\ntendencies observed in humans, we wonder however if the latter do not\n\\textit{already} hold chit-chat sequences. By using topic modeling and\nsearching for topics which are most similar to a set of keywords related to\nsocial talk, we explore the training sets of Schema-Guided Dialogues and\nMultiWOZ. Our study shows that sequences related to social talk are indeed\nnaturally present, motivating further research on ways chitchat is combined\ninto task-oriented dialogues.\n","authors":["Armand Stricker","Patrick Paroubek"],"pdf_url":"https://arxiv.org/pdf/2311.14076v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14067v1","updated":"2023-11-23T15:50:42Z","published":"2023-11-23T15:50:42Z","title":"Enhancing Task-Oriented Dialogues with Chitchat: a Comparative Study\n Based on Lexical Diversity and Divergence","summary":" As a recent development, task-oriented dialogues (TODs) have been enriched\nwith chitchat in an effort to make dialogues more diverse and engaging. This\nenhancement is particularly valuable as TODs are often confined to narrow\ndomains, making the mitigation of repetitive and predictable responses a\nsignificant challenge. This paper presents a comparative analysis of three\nchitchat enhancements, aiming to identify the most effective approach in terms\nof diversity. Additionally, we quantify the divergence between the added\nchitchat, the original task-oriented language, and chitchat typically found in\nchitchat datasets, highlighting the top 20 divergent keywords for each\ncomparison. 
Our findings drive a discussion on future enhancements for\naugmenting TODs, emphasizing the importance of grounding dialogues beyond the\ntask to achieve more diverse and natural exchanges.\n","authors":["Armand Stricker","Patrick Paroubek"],"pdf_url":"https://arxiv.org/pdf/2311.14067v1.pdf","comment":"Accepted at ASRU 2023"},{"id":"http://arxiv.org/abs/2305.13455v3","updated":"2023-11-23T15:47:52Z","published":"2023-05-22T19:56:10Z","title":"Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as\n Conversational Agents","summary":" Recent work has proposed a methodology for the systematic evaluation of\n\"Situated Language Understanding Agents\"-agents that operate in rich linguistic\nand non-linguistic contexts-through testing them in carefully constructed\ninteractive settings. Other recent work has argued that Large Language Models\n(LLMs), if suitably set up, can be understood as (simulators of) such agents. A\nconnection suggests itself, which this paper explores: Can LLMs be evaluated\nmeaningfully by exposing them to constrained game-like settings that are built\nto challenge specific capabilities? As a proof of concept, this paper\ninvestigates five interaction settings, showing that current chat-optimised\nLLMs are, to an extent, capable to follow game-play instructions. Both this\ncapability and the quality of the game play, measured by how well the\nobjectives of the different games are met, follows the development cycle, with\nnewer models performing better. The metrics even for the comparatively simple\nexample games are far from being saturated, suggesting that the proposed\ninstrument will remain to have diagnostic value. Our general framework for\nimplementing and evaluating games with LLMs is available at\nhttps://github.com/clembench .\n","authors":["Kranti Chalamalasetti","Jana Götze","Sherzod Hakimov","Brielen Madureira","Philipp Sadler","David Schlangen"],"pdf_url":"https://arxiv.org/pdf/2305.13455v3.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.14063v1","updated":"2023-11-23T15:42:00Z","published":"2023-11-23T15:42:00Z","title":"Do VSR Models Generalize Beyond LRS3?","summary":" The Lip Reading Sentences-3 (LRS3) benchmark has primarily been the focus of\nintense research in visual speech recognition (VSR) during the last few years.\nAs a result, there is an increased risk of overfitting to its excessively used\ntest set, which is only one hour duration. To alleviate this issue, we build a\nnew VSR test set named WildVSR, by closely following the LRS3 dataset creation\nprocesses. We then evaluate and analyse the extent to which the current VSR\nmodels generalize to the new test data. We evaluate a broad range of publicly\navailable VSR models and find significant drops in performance on our test set,\ncompared to their corresponding LRS3 results. Our results suggest that the\nincrease in word error rates is caused by the models inability to generalize to\nslightly harder and in the wild lip sequences than those found in the LRS3 test\nset. 
Our new test benchmark is made public in order to enable future research\ntowards more robust VSR models.\n","authors":["Yasser Abdelaziz Dahou Djilali","Sanath Narayan","Eustache Le Bihan","Haithem Boussaid","Ebtessam Almazrouei","Merouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2311.14063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13987v1","updated":"2023-11-23T13:13:48Z","published":"2023-11-23T13:13:48Z","title":"Jam-ALT: A Formatting-Aware Lyrics Transcription Benchmark","summary":" Current automatic lyrics transcription (ALT) benchmarks focus exclusively on\nword content and ignore the finer nuances of written lyrics including\nformatting and punctuation, which leads to a potential misalignment with the\ncreative products of musicians and songwriters as well as listeners'\nexperiences. For example, line breaks are important in conveying information\nabout rhythm, emotional emphasis, rhyme, and high-level structure. To address\nthis issue, we introduce Jam-ALT, a new lyrics transcription benchmark based on\nthe JamendoLyrics dataset. Our contribution is twofold. Firstly, a complete\nrevision of the transcripts, geared specifically towards ALT evaluation by\nfollowing a newly created annotation guide that unifies the music industry's\nguidelines, covering aspects such as punctuation, line breaks, spelling,\nbackground vocals, and non-word sounds. Secondly, a suite of evaluation metrics\ndesigned, unlike the traditional word error rate, to capture such phenomena. We\nhope that the proposed benchmark contributes to the ALT task, enabling more\nprecise and reliable assessments of transcription systems and enhancing the\nuser experience in lyrics applications such as subtitle renderings for live\ncaptioning or karaoke.\n","authors":["Ondřej Cífka","Constantinos Dimitriou","Cheng-i Wang","Hendrik Schreiber","Luke Miner","Fabian-Robert Stöter"],"pdf_url":"https://arxiv.org/pdf/2311.13987v1.pdf","comment":"6 pages (3 pages main content); website:\n https://audioshake.github.io/jam-alt/; data:\n https://huggingface.co/datasets/audioshake/jam-alt; code:\n https://github.com/audioshake/alt-eval/"},{"id":"http://arxiv.org/abs/2311.13982v1","updated":"2023-11-23T12:52:37Z","published":"2023-11-23T12:52:37Z","title":"Probabilistic Tree-of-thought Reasoning for Answering\n Knowledge-intensive Complex Questions","summary":" Large language models (LLMs) are capable of answering knowledge-intensive\ncomplex questions with chain-of-thought (CoT) reasoning. However, they tend to\ngenerate factually incorrect reasoning steps when the required knowledge is not\navailable or up-to-date in models' parameters. Recent works turn to retrieving\nexternal knowledge to augment CoT reasoning. Despite being promising, these\nchain-based methods suffer from: 1) Negative retrieval. Unnecessary or\nincorrect retrieval may mislead the reasoning; 2) Limited sight. Lacking the\nability to look backward or forward, a local error in one step will propagate\nalong the chain.\n In this paper, we propose a novel approach: Probabilistic Tree-of-thought\nReasoning (ProbTree). First, LLMs translate a complex question into a query\ntree, in which each non-root node denotes a sub-question of its parent node.\nThen, probabilistic reasoning is conducted over the tree, by solving questions\nfrom leaf to root considering the confidence of both question decomposing and\nanswering. 
During reasoning, for leaf nodes, LLMs choose a more confident\nanswer from Closed-book QA that employs parametric knowledge and Open-book QA\nthat employs retrieved external knowledge, thus eliminating the negative\nretrieval problem. For non-leaf nodes, with the hierarchical structure, LLMs\nhave broader sights and are able to globally reason with the information from\nchild nodes, thus recovering from local errors. The experiments on three\nComplex QA datasets under the open-domain setting show that our approach\noutperforms SOTA methods significantly, demonstrating the effect of\nprobabilistic tree-of-thought reasoning.\n","authors":["Shulin Cao","Jiajie Zhang","Jiaxin Shi","Xin Lv","Zijun Yao","Qi Tian","Juanzi Li","Lei Hou"],"pdf_url":"https://arxiv.org/pdf/2311.13982v1.pdf","comment":"Accepted by EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.11462v2","updated":"2023-11-23T12:52:05Z","published":"2023-11-19T23:59:22Z","title":"LLM aided semi-supervision for Extractive Dialog Summarization","summary":" Generating high-quality summaries for chat dialogs often requires large\nlabeled datasets. We propose a method to efficiently use unlabeled data for\nextractive summarization of customer-agent dialogs. In our method, we frame\nsummarization as a question-answering problem and use state-of-the-art large\nlanguage models (LLMs) to generate pseudo-labels for a dialog. We then use\nthese pseudo-labels to fine-tune a chat summarization model, effectively\ntransferring knowledge from the large LLM into a smaller specialized model. We\ndemonstrate our method on the \\tweetsumm dataset, and show that using 10% of\nthe original labelled data set we can achieve 65.9/57.0/61.0 ROUGE-1/-2/-L,\nwhereas the current state-of-the-art trained on the entire training data set\nobtains 65.16/55.81/64.37 ROUGE-1/-2/-L. In other words, in the worst case\n(i.e., ROUGE-L) we still effectively retain 94.7% of the performance while\nusing only 10% of the data.\n","authors":["Nishant Mishra","Gaurav Sahu","Iacer Calixto","Ameen Abu-Hanna","Issam H. Laradji"],"pdf_url":"https://arxiv.org/pdf/2311.11462v2.pdf","comment":"to be published in EMNLP Findings"},{"id":"http://arxiv.org/abs/2311.10642v2","updated":"2023-11-23T12:47:26Z","published":"2023-11-17T16:58:52Z","title":"Rethinking Attention: Exploring Shallow Feed-Forward Neural Networks as\n an Alternative to Attention Layers in Transformers","summary":" This work presents an analysis of the effectiveness of using standard shallow\nfeed-forward networks to mimic the behavior of the attention mechanism in the\noriginal Transformer model, a state-of-the-art architecture for\nsequence-to-sequence tasks. We substitute key elements of the attention\nmechanism in the Transformer with simple feed-forward networks, trained using\nthe original components via knowledge distillation. Our experiments, conducted\non the IWSLT2017 dataset, reveal the capacity of these \"attentionless\nTransformers\" to rival the performance of the original architecture. 
Through\nrigorous ablation studies, and experimenting with various replacement network\ntypes and sizes, we offer insights that support the viability of our approach.\nThis not only sheds light on the adaptability of shallow feed-forward networks\nin emulating attention mechanisms but also underscores their potential to\nstreamline complex architectures for sequence-to-sequence tasks.\n","authors":["Vukasin Bozic","Danilo Dordevic","Daniele Coppola","Joseph Thommes","Sidak Pal Singh"],"pdf_url":"https://arxiv.org/pdf/2311.10642v2.pdf","comment":"Accepted at AAAI24(https://aaai.org/aaai-conference/)"},{"id":"http://arxiv.org/abs/2311.13957v1","updated":"2023-11-23T12:15:56Z","published":"2023-11-23T12:15:56Z","title":"Efficient Trigger Word Insertion","summary":" With the boom in the natural language processing (NLP) field these years,\nbackdoor attacks pose immense threats against deep neural network models.\nHowever, previous works hardly consider the effect of the poisoning rate. In\nthis paper, our main objective is to reduce the number of poisoned samples\nwhile still achieving a satisfactory Attack Success Rate (ASR) in text backdoor\nattacks. To accomplish this, we propose an efficient trigger word insertion\nstrategy in terms of trigger word optimization and poisoned sample selection.\nExtensive experiments on different datasets and models demonstrate that our\nproposed method can significantly improve attack effectiveness in text\nclassification tasks. Remarkably, our approach achieves an ASR of over 90% with\nonly 10 poisoned samples in the dirty-label setting and requires merely 1.5% of\nthe training data in the clean-label setting.\n","authors":["Yueqi Zeng","Ziqiang Li","Pengfei Xia","Lei Liu","Bin Li"],"pdf_url":"https://arxiv.org/pdf/2311.13957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10751v2","updated":"2023-11-23T12:14:08Z","published":"2023-11-02T14:32:16Z","title":"ProAgent: From Robotic Process Automation to Agentic Process Automation","summary":" From ancient water wheels to robotic process automation (RPA), automation\ntechnology has evolved throughout history to liberate human beings from arduous\ntasks. Yet, RPA struggles with tasks needing human-like intelligence,\nespecially in elaborate design of workflow construction and dynamic\ndecision-making in workflow execution. As Large Language Models (LLMs) have\nemerged human-like intelligence, this paper introduces Agentic Process\nAutomation (APA), a groundbreaking automation paradigm using LLM-based agents\nfor advanced automation by offloading the human labor to agents associated with\nconstruction and execution. We then instantiate ProAgent, an LLM-based agent\ndesigned to craft workflows from human instructions and make intricate\ndecisions by coordinating specialized agents. Empirical experiments are\nconducted to detail its construction and execution procedure of workflow,\nshowcasing the feasibility of APA, unveiling the possibility of a new paradigm\nof automation driven by agents. 
Our code is public at\nhttps://github.com/OpenBMB/ProAgent.\n","authors":["Yining Ye","Xin Cong","Shizuo Tian","Jiannan Cao","Hao Wang","Yujia Qin","Yaxi Lu","Heyang Yu","Huadong Wang","Yankai Lin","Zhiyuan Liu","Maosong Sun"],"pdf_url":"https://arxiv.org/pdf/2311.10751v2.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2311.05286v2","updated":"2023-11-23T12:12:22Z","published":"2023-11-09T11:29:44Z","title":"Causal Inference from Text: Unveiling Interactions between Variables","summary":" Adjusting for latent covariates is crucial for estimating causal effects from\nobservational textual data. Most existing methods only account for confounding\ncovariates that affect both treatment and outcome, potentially leading to\nbiased causal effects. This bias arises from insufficient consideration of\nnon-confounding covariates, which are relevant only to either the treatment or\nthe outcome. In this work, we aim to mitigate the bias by unveiling\ninteractions between different variables to disentangle the non-confounding\ncovariates when estimating causal effects from text. The disentangling process\nensures covariates only contribute to their respective objectives, enabling\nindependence between variables. Additionally, we impose a constraint to balance\nrepresentations from the treatment group and control group to alleviate\nselection bias. We conduct experiments on two different treatment factors under\nvarious scenarios, and the proposed model significantly outperforms recent\nstrong baselines. Furthermore, our thorough analysis on earnings call\ntranscripts demonstrates that our model can effectively disentangle the\nvariables, and further investigations into real-world scenarios provide\nguidance for investors to make informed decisions.\n","authors":["Yuxiang Zhou","Yulan He"],"pdf_url":"https://arxiv.org/pdf/2311.05286v2.pdf","comment":"EMNLP 2023 Findings (mark typo corrected)"},{"id":"http://arxiv.org/abs/2311.13951v1","updated":"2023-11-23T12:04:25Z","published":"2023-11-23T12:04:25Z","title":"MLLM-Bench, Evaluating Multi-modal LLMs using GPT-4V","summary":" In the pursuit of Artificial General Intelligence (AGI), the integration of\nvision in language models has marked a significant milestone. The advent of\nvision-language models (MLLMs) like GPT-4V have expanded AI applications,\naligning with the multi-modal capabilities of the human brain. However,\nevaluating the efficacy of MLLMs poses a substantial challenge due to the\nsubjective nature of tasks that lack definitive answers. Existing automatic\nevaluation methodologies on multi-modal large language models rely on objective\nqueries that have standard answers, inadequately addressing the nuances of\ncreative and associative multi-modal tasks. To address this, we introduce\nMLLM-Bench, an innovative benchmark inspired by Vicuna, spanning a diverse\narray of scenarios, including Perception, Understanding, Applying, Analyzing,\nEvaluating, and Creation along with the ethical consideration. MLLM-Bench is\ndesigned to reflect user experience more accurately and provide a more holistic\nassessment of model performance. Comparative evaluations indicate a significant\nperformance gap between existing open-source models and GPT-4V. We posit that\nMLLM-Bench will catalyze progress in the open-source community towards\ndeveloping user-centric vision-language models that meet a broad spectrum of\nreal-world applications. 
See online leaderboard in\n\\url{https://mllm-bench.llmzoo.com}.\n","authors":["Wentao Ge","Shunian Chen","Guiming Chen","Junying Chen","Zhihong Chen","Shuo Yan","Chenghao Zhu","Ziyue Lin","Wenya Xie","Xidong Wang","Anningzhe Gao","Zhiyi Zhang","Jianquan Li","Xiang Wan","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13937v1","updated":"2023-11-23T11:40:28Z","published":"2023-11-23T11:40:28Z","title":"Exploring Methods for Cross-lingual Text Style Transfer: The Case of\n Text Detoxification","summary":" Text detoxification is the task of transferring the style of text from toxic\nto neutral. While here are approaches yielding promising results in monolingual\nsetup, e.g., (Dale et al., 2021; Hallinan et al., 2022), cross-lingual transfer\nfor this task remains a challenging open problem (Moskovskiy et al., 2022). In\nthis work, we present a large-scale study of strategies for cross-lingual text\ndetoxification -- given a parallel detoxification corpus for one language; the\ngoal is to transfer detoxification ability to another language for which we do\nnot have such a corpus. Moreover, we are the first to explore a new task where\ntext translation and detoxification are performed simultaneously, providing\nseveral strong baselines for this task. Finally, we introduce new automatic\ndetoxification evaluation metrics with higher correlations with human judgments\nthan previous benchmarks. We assess the most promising approaches also with\nmanual markup, determining the answer for the best strategy to transfer the\nknowledge of text detoxification between languages.\n","authors":["Daryna Dementieva","Daniil Moskovskiy","David Dale","Alexander Panchenko"],"pdf_url":"https://arxiv.org/pdf/2311.13937v1.pdf","comment":"AACL 2023, main conference, long paper"},{"id":"http://arxiv.org/abs/2311.13921v1","updated":"2023-11-23T11:14:13Z","published":"2023-11-23T11:14:13Z","title":"Some Like It Small: Czech Semantic Embedding Models for Industry\n Applications","summary":" This article focuses on the development and evaluation of Small-sized Czech\nsentence embedding models. Small models are important components for real-time\nindustry applications in resource-constrained environments. Given the limited\navailability of labeled Czech data, alternative approaches, including\npre-training, knowledge distillation, and unsupervised contrastive fine-tuning,\nare investigated. Comprehensive intrinsic and extrinsic analyses are conducted,\nshowcasing the competitive performance of our models compared to significantly\nlarger counterparts, with approximately 8 times smaller size and 5 times faster\nspeed than conventional Base-sized models. To promote cooperation and\nreproducibility, both the models and the evaluation pipeline are made publicly\naccessible. Ultimately, this article presents practical applications of the\ndeveloped sentence embedding models in Seznam.cz, the Czech search engine.\nThese models have effectively replaced previous counterparts, enhancing the\noverall search experience for instance, in organic search, featured snippets,\nand image search. This transition has yielded improved performance.\n","authors":["Jiří Bednář","Jakub Náplava","Petra Barančíková","Ondřej Lisický"],"pdf_url":"https://arxiv.org/pdf/2311.13921v1.pdf","comment":"Accepted at the Thirty-Sixth Annual Conference on Innovative\n Applications of Artificial Intelligence (IAAI-24). IAAI Innovative\n Application Award. 
9 pages"},{"id":"http://arxiv.org/abs/2311.06622v2","updated":"2023-11-23T10:57:10Z","published":"2023-11-11T17:39:24Z","title":"TrainerAgent: Customizable and Efficient Model Training through\n LLM-Powered Multi-Agent System","summary":" Training AI models has always been challenging, especially when there is a\nneed for custom models to provide personalized services. Algorithm engineers\noften face a lengthy process to iteratively develop models tailored to specific\nbusiness requirements, making it even more difficult for non-experts. The quest\nfor high-quality and efficient model development, along with the emergence of\nLarge Language Model (LLM) Agents, has become a key focus in the industry.\nLeveraging the powerful analytical, planning, and decision-making capabilities\nof LLM, we propose a TrainerAgent system comprising a multi-agent framework\nincluding Task, Data, Model and Server agents. These agents analyze\nuser-defined tasks, input data, and requirements (e.g., accuracy, speed),\noptimizing them comprehensively from both data and model perspectives to obtain\nsatisfactory models, and finally deploy these models as online service.\nExperimental evaluations on classical discriminative and generative tasks in\ncomputer vision and natural language processing domains demonstrate that our\nsystem consistently produces models that meet the desired criteria.\nFurthermore, the system exhibits the ability to critically identify and reject\nunattainable tasks, such as fantastical scenarios or unethical requests,\nensuring robustness and safety. This research presents a significant\nadvancement in achieving desired models with increased efficiency and quality\nas compared to traditional model development, facilitated by the integration of\nLLM-powered analysis, decision-making, and execution capabilities, as well as\nthe collaboration among four agents. We anticipate that our work will\ncontribute to the advancement of research on TrainerAgent in both academic and\nindustry communities, potentially establishing it as a new paradigm for model\ndevelopment in the field of AI.\n","authors":["Haoyuan Li","Hao Jiang","Tianke Zhang","Zhelun Yu","Aoxiong Yin","Hao Cheng","Siming Fu","Yuhao Zhang","Wanggui He"],"pdf_url":"https://arxiv.org/pdf/2311.06622v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13910v1","updated":"2023-11-23T10:56:14Z","published":"2023-11-23T10:56:14Z","title":"Dialogue Quality and Emotion Annotations for Customer Support\n Conversations","summary":" Task-oriented conversational datasets often lack topic variability and\nlinguistic diversity. However, with the advent of Large Language Models (LLMs)\npretrained on extensive, multilingual and diverse text data, these limitations\nseem overcome. Nevertheless, their generalisability to different languages and\ndomains in dialogue applications remains uncertain without benchmarking\ndatasets. This paper presents a holistic annotation approach for emotion and\nconversational quality in the context of bilingual customer support\nconversations. By performing annotations that take into consideration the\ncomplete instances that compose a conversation, one can form a broader\nperspective of the dialogue as a whole. Furthermore, it provides a unique and\nvaluable resource for the development of text classification models. 
To this\nend, we present benchmarks for Emotion Recognition and Dialogue Quality\nEstimation and show that further research is needed to leverage these models in\na production setting.\n","authors":["John Mendonça","Patrícia Pereira","Miguel Menezes","Vera Cabarrão","Ana C. Farinha","Helena Moniz","João Paulo Carvalho","Alon Lavie","Isabel Trancoso"],"pdf_url":"https://arxiv.org/pdf/2311.13910v1.pdf","comment":"Accepted at GEM (EMNLP Workshop)"},{"id":"http://arxiv.org/abs/2311.13892v1","updated":"2023-11-23T10:23:51Z","published":"2023-11-23T10:23:51Z","title":"General Phrase Debiaser: Debiasing Masked Language Models at a\n Multi-Token Level","summary":" The social biases and unwelcome stereotypes revealed by pretrained language\nmodels are becoming obstacles to their application. Compared to numerous\ndebiasing methods targeting word level, there has been relatively less\nattention on biases present at phrase level, limiting the performance of\ndebiasing in discipline domains. In this paper, we propose an automatic\nmulti-token debiasing pipeline called \\textbf{General Phrase Debiaser}, which\nis capable of mitigating phrase-level biases in masked language models.\nSpecifically, our method consists of a \\textit{phrase filter stage} that\ngenerates stereotypical phrases from Wikipedia pages as well as a \\textit{model\ndebias stage} that can debias models at the multi-token level to tackle bias\nchallenges on phrases. The latter searches for prompts that trigger model's\nbias, and then uses them for debiasing. State-of-the-art results on standard\ndatasets and metrics show that our approach can significantly reduce gender\nbiases on both career and multiple disciplines, across models with varying\nparameter sizes.\n","authors":["Bingkang Shi","Xiaodan Zhang","Dehan Kong","Yulei Wu","Zongzhen Liu","Honglei Lyu","Longtao Huang"],"pdf_url":"https://arxiv.org/pdf/2311.13892v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06025v2","updated":"2023-11-23T10:19:47Z","published":"2023-11-10T12:25:32Z","title":"ChiMed-GPT: A Chinese Medical Large Language Model with Full Training\n Regime and Better Alignment to Human Preferences","summary":" Recently, the increasing demand for superior medical services has highlighted\nthe discrepancies in the medical infrastructure. With big data, especially\ntexts, forming the foundation of medical services, there is an exigent need for\neffective natural language processing (NLP) solutions tailored to the\nhealthcare domain. Conventional approaches leveraging pre-trained models\npresent promising results in this domain and current large language models\n(LLMs) offer advanced foundation for medical text processing. However, most\nmedical LLMs are trained only with supervised fine-tuning (SFT), even though it\nefficiently empowers LLMs to understand and respond to medical instructions but\nis ineffective in learning domain knowledge and aligning with human preference.\nAnother engineering barrier that prevents current medical LLM from better text\nprocessing ability is their restricted context length (e.g., 2,048 tokens),\nmaking it hard for the LLMs to process long context, which is frequently\nrequired in the medical domain. In this work, we propose ChiMed-GPT, a new\nbenchmark LLM designed explicitly for Chinese medical domain, with enlarged\ncontext length to 4,096 tokens and undergoes a comprehensive training regime\nwith pre-training, SFT, and RLHF. 
Evaluations on real-world tasks including\ninformation extraction, question answering, and dialogue generation demonstrate\nChiMed-GPT's superior performance over general domain LLMs. Furthermore, we\nanalyze possible biases through prompting ChiMed-GPT to perform attitude scales\nregarding discrimination of patients, so as to contribute to further\nresponsible development of LLMs in the medical domain. The code and model are\nreleased at https://github.com/synlp/ChiMed-GPT.\n","authors":["Yuanhe Tian","Ruyi Gan","Yan Song","Jiaxing Zhang","Yongdong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.06025v2.pdf","comment":"17 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.13878v1","updated":"2023-11-23T09:58:39Z","published":"2023-11-23T09:58:39Z","title":"Minimizing Factual Inconsistency and Hallucination in Large Language\n Models","summary":" Large Language Models (LLMs) are widely used in critical fields such as\nhealthcare, education, and finance due to their remarkable proficiency in\nvarious language-related tasks. However, LLMs are prone to generating factually\nincorrect responses or \"hallucinations,\" which can lead to a loss of\ncredibility and trust among users. To address this issue, we propose a\nmulti-stage framework that generates the rationale first, verifies and refines\nincorrect ones, and uses them as supporting references to generate the answer.\nThe generated rationale enhances the transparency of the answer and our\nframework provides insights into how the model arrived at this answer, by using\nthis rationale and the references to the context. In this paper, we demonstrate\nits effectiveness in improving the quality of responses to drug-related\ninquiries in the life sciences industry. Our framework improves traditional\nRetrieval Augmented Generation (RAG) by enabling OpenAI GPT-3.5-turbo to be\n14-25% more faithful and 16-22% more accurate on two datasets. Furthermore,\nfine-tuning samples based on our framework improves the accuracy of smaller\nopen-access LLMs by 33-42% and competes with RAG on commercial models.\n","authors":["Muneeswaran I","Shreya Saxena","Siva Prasad","M V Sai Prakash","Advaith Shankar","Varun V","Vishal Vaddina","Saisubramaniam Gopalakrishnan"],"pdf_url":"https://arxiv.org/pdf/2311.13878v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13857v1","updated":"2023-11-23T08:56:41Z","published":"2023-11-23T08:56:41Z","title":"Challenges of Large Language Models for Mental Health Counseling","summary":" The global mental health crisis is looming with a rapid increase in mental\ndisorders, limited resources, and the social stigma of seeking treatment. As\nthe field of artificial intelligence (AI) has witnessed significant\nadvancements in recent years, large language models (LLMs) capable of\nunderstanding and generating human-like text may be used in supporting or\nproviding psychological counseling. However, the application of LLMs in the\nmental health domain raises concerns regarding the accuracy, effectiveness, and\nreliability of the information provided. This paper investigates the major\nchallenges associated with the development of LLMs for psychological\ncounseling, including model hallucination, interpretability, bias, privacy, and\nclinical effectiveness. We explore potential solutions to these challenges that\nare practical and applicable to the current paradigm of AI. 
From our experience\nin developing and deploying LLMs for mental health, AI holds a great promise\nfor improving mental health care, if we can carefully navigate and overcome\npitfalls of LLMs.\n","authors":["Neo Christopher Chung","George Dyer","Lennart Brocki"],"pdf_url":"https://arxiv.org/pdf/2311.13857v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13848v1","updated":"2023-11-23T08:34:37Z","published":"2023-11-23T08:34:37Z","title":"Grammatical Error Correction via Mixed-Grained Weighted Training","summary":" The task of Grammatical Error Correction (GEC) aims to automatically correct\ngrammatical errors in natural texts. Almost all previous works treat annotated\ntraining data equally, but inherent discrepancies in data are neglected. In\nthis paper, the inherent discrepancies are manifested in two aspects, namely,\naccuracy of data annotation and diversity of potential annotations. To this\nend, we propose MainGEC, which designs token-level and sentence-level training\nweights based on inherent discrepancies in accuracy and potential diversity of\ndata annotation, respectively, and then conducts mixed-grained weighted\ntraining to improve the training effect for GEC. Empirical evaluation shows\nthat whether in the Seq2Seq or Seq2Edit manner, MainGEC achieves consistent and\nsignificant performance improvements on two benchmark datasets, demonstrating\nthe effectiveness and superiority of the mixed-grained weighted training.\nFurther ablation experiments verify the effectiveness of designed weights of\nboth granularities in MainGEC.\n","authors":["Jiahao Li","Quan Wang","Chiwei Zhu","Zhendong Mao","Yongdong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.13848v1.pdf","comment":"EMNLP2023 Findings"},{"id":"http://arxiv.org/abs/2311.13833v1","updated":"2023-11-23T07:33:38Z","published":"2023-11-23T07:33:38Z","title":"Lego: Learning to Disentangle and Invert Concepts Beyond Object\n Appearance in Text-to-Image Diffusion Models","summary":" Diffusion models have revolutionized generative content creation and\ntext-to-image (T2I) diffusion models in particular have increased the creative\nfreedom of users by allowing scene synthesis using natural language. T2I models\nexcel at synthesizing concepts such as nouns, appearances, and styles. To\nenable customized content creation based on a few example images of a concept,\nmethods such as Textual Inversion and DreamBooth invert the desired concept and\nenable synthesizing it in new scenes. However, inverting more general concepts\nthat go beyond object appearance and style (adjectives and verbs) through\nnatural language, remains a challenge. Two key characteristics of these\nconcepts contribute to the limitations of current inversion methods. 1)\nAdjectives and verbs are entangled with nouns (subject) and can hinder\nappearance-based inversion methods, where the subject appearance leaks into the\nconcept embedding and 2) describing such concepts often extends beyond single\nword embeddings (being frozen in ice, walking on a tightrope, etc.) that\ncurrent methods do not handle.\n In this study, we introduce Lego, a textual inversion method designed to\ninvert subject entangled concepts from a few example images. Lego disentangles\nconcepts from their associated subjects using a simple yet effective Subject\nSeparation step and employs a Context Loss that guides the inversion of\nsingle/multi-embedding concepts. 
In a thorough user study, Lego-generated\nconcepts were preferred over 70% of the time when compared to the baseline.\nAdditionally, visual question answering using a large language model suggested\nLego-generated concepts are better aligned with the text description of the\nconcept.\n","authors":["Saman Motamed","Danda Pani Paudel","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2311.13833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08899v2","updated":"2023-11-23T06:08:59Z","published":"2023-10-13T07:03:39Z","title":"Exploration with Principles for Diverse AI Supervision","summary":" Training large transformers using next-token prediction has given rise to\ngroundbreaking advancements in AI. While this generative AI approach has\nproduced impressive results, it heavily leans on human supervision. Even\nstate-of-the-art AI models like ChatGPT depend on fine-tuning through human\ndemonstrations, demanding extensive human input and domain expertise. This\nstrong reliance on human oversight poses a significant hurdle to the\nadvancement of AI innovation. To address this limitation, we propose a novel\nparadigm termed Exploratory AI (EAI) aimed at autonomously generating\nhigh-quality training data. Drawing inspiration from unsupervised reinforcement\nlearning (RL) pretraining, EAI achieves exploration within the natural language\nspace. We accomplish this by harnessing large language models to assess the\nnovelty of generated content. Our approach employs two key components: an actor\nthat generates novel content following exploration principles and a critic that\nevaluates the generated content, offering critiques to guide the actor.\nEmpirical evaluations demonstrate that EAI significantly boosts model\nperformance on complex reasoning tasks, addressing the limitations of\nhuman-intensive supervision.\n","authors":["Hao Liu","Matei Zaharia","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2310.08899v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13806v1","updated":"2023-11-23T04:42:27Z","published":"2023-11-23T04:42:27Z","title":"AdaTyper: Adaptive Semantic Column Type Detection","summary":" Understanding the semantics of relational tables is instrumental for\nautomation in data exploration and preparation systems. A key source for\nunderstanding a table is the semantics of its columns. With the rise of deep\nlearning, learned table representations are now available, which can be applied\nfor semantic type detection and achieve good performance on benchmarks.\nNevertheless, we observe a gap between this performance and its applicability\nin practice. In this paper, we propose AdaTyper to address one of the most\ncritical deployment challenges: adaptation. AdaTyper uses weak-supervision to\nadapt a hybrid type predictor towards new semantic types and shifted data\ndistributions at inference time, using minimal human feedback. The hybrid type\npredictor of AdaTyper combines rule-based methods and a light machine learning\nmodel for semantic column type detection. We evaluate the adaptation\nperformance of AdaTyper on real-world database tables hand-annotated with\nsemantic column types through crowdsourcing and find that the f1-score improves\nfor new and existing types. 
AdaTyper approaches an average precision of 0.6\nafter only seeing 5 examples, significantly outperforming existing adaptation\nmethods based on human-provided regular expressions or dictionaries.\n","authors":["Madelon Hulsebos","Paul Groth","Çağatay Demiralp"],"pdf_url":"https://arxiv.org/pdf/2311.13806v1.pdf","comment":"Submitted to VLDB'24"},{"id":"http://arxiv.org/abs/2311.13784v1","updated":"2023-11-23T03:03:54Z","published":"2023-11-23T03:03:54Z","title":"DaG LLM ver 1.0: Pioneering Instruction-Tuned Language Modeling for\n Korean NLP","summary":" This paper presents the DaG LLM (David and Goliath Large Language Model), a\nlanguage model specialized for Korean and fine-tuned through Instruction Tuning\nacross 41 tasks within 13 distinct categories.\n","authors":["Dongjun Jang","Sangah Lee","Sungjoo Byun","Jinwoong Kim","Jean Seo","Minseok Kim","Soyeon Kim","Chaeyoung Oh","Jaeyoon Kim","Hyemi Jo","Hyopil Shin"],"pdf_url":"https://arxiv.org/pdf/2311.13784v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00648v3","updated":"2023-11-23T02:33:36Z","published":"2023-10-01T12:07:44Z","title":"Fewer is More: Trojan Attacks on Parameter-Efficient Fine-Tuning","summary":" Parameter-efficient fine-tuning (PEFT) enables efficient adaptation of\npre-trained language models (PLMs) to specific tasks. By tuning only a minimal\nset of (extra) parameters, PEFT achieves performance comparable to full\nfine-tuning. However, despite its prevalent use, the security implications of\nPEFT remain largely unexplored. In this paper, we conduct a pilot study\nrevealing that PEFT exhibits unique vulnerability to trojan attacks.\nSpecifically, we present PETA, a novel attack that accounts for downstream\nadaptation through bilevel optimization: the upper-level objective embeds the\nbackdoor into a PLM while the lower-level objective simulates PEFT to retain\nthe PLM's task-specific performance. With extensive evaluation across a variety\nof downstream tasks and trigger designs, we demonstrate PETA's effectiveness in\nterms of both attack success rate and unaffected clean accuracy, even after the\nvictim user performs PEFT over the backdoored PLM using untainted data.\nMoreover, we empirically provide possible explanations for PETA's efficacy: the\nbilevel optimization inherently 'orthogonalizes' the backdoor and PEFT modules,\nthereby retaining the backdoor throughout PEFT. Based on this insight, we\nexplore a simple defense that omits PEFT in selected layers of the backdoored\nPLM and unfreezes a subset of these layers' parameters, which is shown to\neffectively neutralize PETA.\n","authors":["Lauren Hong","Ting Wang"],"pdf_url":"https://arxiv.org/pdf/2310.00648v3.pdf","comment":"20 pages, 6 figures"},{"id":"http://arxiv.org/abs/2309.17169v2","updated":"2023-11-23T02:06:20Z","published":"2023-09-29T12:06:55Z","title":"An evaluation of GPT models for phenotype concept recognition","summary":" Objective: Clinical deep phenotyping and phenotype annotation play a critical\nrole in both the diagnosis of patients with rare disorders as well as in\nbuilding computationally-tractable knowledge in the rare disorders field. These\nprocesses rely on using ontology concepts, often from the Human Phenotype\nOntology, in conjunction with a phenotype concept recognition task (supported\nusually by machine learning methods) to curate patient profiles or existing\nscientific literature. 
With the significant shift in the use of large language\nmodels (LLMs) for most NLP tasks, we examine the performance of the latest\nGenerative Pre-trained Transformer (GPT) models underpinning ChatGPT as a\nfoundation for the tasks of clinical phenotyping and phenotype annotation.\nMaterials and Methods: The experimental setup of the study included seven\nprompts of various levels of specificity, two GPT models (gpt-3.5-turbo and\ngpt-4.0) and two established gold standard corpora for phenotype recognition,\none consisting of publication abstracts and the other clinical observations.\nResults: Our results show that, with an appropriate setup, these models can\nachieve state of the art performance. The best run, using few-shot learning,\nachieved 0.58 macro F1 score on publication abstracts and 0.75 macro F1 score\non clinical observations, the former being comparable with the state of the\nart, while the latter surpassing the current best in class tool. Conclusion:\nWhile the results are promising, the non-deterministic nature of the outcomes,\nthe high cost and the lack of concordance between different runs using the same\nprompt and input make the use of these LLMs challenging for this particular\ntask.\n","authors":["Tudor Groza","Harry Caufield","Dylan Gration","Gareth Baynam","Melissa A Haendel","Peter N Robinson","Christopher J Mungall","Justin T Reese"],"pdf_url":"https://arxiv.org/pdf/2309.17169v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13755v1","updated":"2023-11-23T01:06:08Z","published":"2023-11-23T01:06:08Z","title":"Transformer-based Named Entity Recognition in Construction Supply Chain\n Risk Management in Australia","summary":" The construction industry in Australia is characterized by its intricate\nsupply chains and vulnerability to myriad risks. As such, effective supply\nchain risk management (SCRM) becomes imperative. This paper employs different\ntransformer models, and train for Named Entity Recognition (NER) in the context\nof Australian construction SCRM. Utilizing NER, transformer models identify and\nclassify specific risk-associated entities in news articles, offering a\ndetailed insight into supply chain vulnerabilities. By analysing news articles\nthrough different transformer models, we can extract relevant entities and\ninsights related to specific risk taxonomies local (milieu) to the Australian\nconstruction landscape. This research emphasises the potential of NLP-driven\nsolutions, like transformer models, in revolutionising SCRM for construction in\ngeo-media specific contexts.\n","authors":["Milad Baghalzadeh Shishehgarkhaneh","Robert C. Moehler","Yihai Fang","Amer A. Hijazi","Hamed Aboutorab"],"pdf_url":"https://arxiv.org/pdf/2311.13755v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be acceptable"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.14227v1","updated":"2023-11-23T23:40:01Z","published":"2023-11-23T23:40:01Z","title":"Robust and Interpretable COVID-19 Diagnosis on Chest X-ray Images using\n Adversarial Training","summary":" The novel 2019 Coronavirus disease (COVID-19) global pandemic is a defining\nhealth crisis. Recent efforts have been increasingly directed towards achieving\nquick and accurate detection of COVID-19 across symptomatic patients to\nmitigate the intensity and spread of the disease. 
Artificial intelligence (AI)\nalgorithms applied to chest X-ray (CXR) images have emerged as promising\ndiagnostic tools, and previous work has demonstrated impressive classification\nperformances. However, such methods have faced criticisms from physicians due\nto their black-box reasoning process and unpredictable nature. In contrast to\nprofessional radiologist diagnosis, AI systems often lack generalizability,\nexplainability, and robustness in the clinical decision making process. In our\nwork, we address these issues by first proposing an extensive baseline study,\ntraining and evaluating 21 convolutional neural network (CNN) models on a\ndiverse set of 33,000+ CXR images to classify between healthy, COVID-19, and\nnon-COVID-19 pneumonia CXRs. Our resulting models achieved a 3-way\nclassification accuracy, recall, and precision of up to 97.03\\%, 97.97\\%, and\n99.95\\%, respectively. Next, we investigate the effectiveness of adversarial\ntraining on model robustness and explainability via Gradient-weighted Class\nActivation Mapping (Grad-CAM) heatmaps. We find that adversarially trained\nmodels not only significantly outperform their standard counterparts on\nclassifying perturbed images, but also yield saliency maps that 1) better\nspecify clinically relevant features, 2) are robust against extraneous\nartifacts, and 3) agree considerably more with expert radiologist findings.\n","authors":["Karina Yang","Alexis Bennett","Dominique Duncan"],"pdf_url":"https://arxiv.org/pdf/2311.14227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14218v1","updated":"2023-11-23T22:27:31Z","published":"2023-11-23T22:27:31Z","title":"A New Benchmark and Model for Challenging Image Manipulation Detection","summary":" The ability to detect manipulation in multimedia data is vital in digital\nforensics. Existing Image Manipulation Detection (IMD) methods are mainly based\non detecting anomalous features arisen from image editing or double compression\nartifacts. All existing IMD techniques encounter challenges when it comes to\ndetecting small tampered regions from a large image. Moreover,\ncompression-based IMD approaches face difficulties in cases of double\ncompression of identical quality factors. To investigate the State-of-The-Art\n(SoTA) IMD methods in those challenging conditions, we introduce a new\nChallenging Image Manipulation Detection (CIMD) benchmark dataset, which\nconsists of two subsets, for evaluating editing-based and compression-based IMD\nmethods, respectively. The dataset images were manually taken and tampered with\nhigh-quality annotations. In addition, we propose a new two-branch network\nmodel based on HRNet that can better detect both the image-editing and\ncompression artifacts in those challenging conditions. Extensive experiments on\nthe CIMD benchmark show that our model significantly outperforms SoTA IMD\nmethods on CIMD.\n","authors":["Zhenfei Zhang","Mingyang Li","Ming-Ching Chang"],"pdf_url":"https://arxiv.org/pdf/2311.14218v1.pdf","comment":"8 pages, 6 figures, 3 tabels"},{"id":"http://arxiv.org/abs/2209.05954v4","updated":"2023-11-23T22:11:49Z","published":"2022-09-09T23:18:31Z","title":"Automatically Score Tissue Images Like a Pathologist by Transfer\n Learning","summary":" Cancer is the second leading cause of death in the world. Diagnosing cancer\nearly on can save many lives. Pathologists have to look at tissue microarray\n(TMA) images manually to identify tumors, which can be time-consuming,\ninconsistent and subjective. 
Existing automatic algorithms either have not\nachieved the accuracy level of a pathologist or require substantial human\ninvolvements. A major challenge is that TMA images with different shapes,\nsizes, and locations can have the same score. Learning staining patterns in TMA\nimages requires a huge number of images, which are severely limited due to\nprivacy and regulation concerns in medical organizations. TMA images from\ndifferent cancer types may share certain common characteristics, but combining\nthem directly harms the accuracy due to heterogeneity in their staining\npatterns. Transfer learning is an emerging learning paradigm that allows\nborrowing strength from similar problems. However, existing approaches\ntypically require a large sample from similar learning problems, while TMA\nimages of different cancer types are often available in small sample size and\nfurther existing algorithms are limited to transfer learning from one similar\nproblem. We propose a new transfer learning algorithm that could learn from\nmultiple related problems, where each problem has a small sample and can have a\nsubstantially different distribution from the original one. The proposed\nalgorithm has made it possible to break the critical accuracy barrier (the 75%\naccuracy level of pathologists), with a reported accuracy of 75.9% on breast\ncancer TMA images from the Stanford Tissue Microarray Database. It is supported\nby recent developments in transfer learning theory and empirical evidence in\nclustering technology. This will allow pathologists to confidently adopt\nautomatic algorithms in recognizing tumors consistently with a higher accuracy\nin real time.\n","authors":["Iris Yan"],"pdf_url":"https://arxiv.org/pdf/2209.05954v4.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.14208v1","updated":"2023-11-23T21:23:52Z","published":"2023-11-23T21:23:52Z","title":"ECRF: Entropy-Constrained Neural Radiance Fields Compression with\n Frequency Domain Optimization","summary":" Explicit feature-grid based NeRF models have shown promising results in terms\nof rendering quality and significant speed-up in training. However, these\nmethods often require a significant amount of data to represent a single scene\nor object. In this work, we present a compression model that aims to minimize\nthe entropy in the frequency domain in order to effectively reduce the data\nsize. First, we propose using the discrete cosine transform (DCT) on the\ntensorial radiance fields to compress the feature-grid. This feature-grid is\ntransformed into coefficients, which are then quantized and entropy encoded,\nfollowing a similar approach to the traditional video coding pipeline.\nFurthermore, to achieve a higher level of sparsity, we propose using an entropy\nparameterization technique for the frequency domain, specifically for DCT\ncoefficients of the feature-grid. Since the transformed coefficients are\noptimized during the training phase, the proposed model does not require any\nfine-tuning or additional information. Our model only requires a lightweight\ncompression pipeline for encoding and decoding, making it easier to apply\nvolumetric radiance field methods for real-world applications. Experimental\nresults demonstrate that our proposed frequency domain entropy model can\nachieve superior compression performance across various datasets. 
The source\ncode will be made publicly available.\n","authors":["Soonbin Lee","Fangwen Shu","Yago Sanchez","Thomas Schierl","Cornelius Hellge"],"pdf_url":"https://arxiv.org/pdf/2311.14208v1.pdf","comment":"10 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.14199v1","updated":"2023-11-23T20:52:44Z","published":"2023-11-23T20:52:44Z","title":"A Systematic Review of Deep Learning-based Research on Radiology Report\n Generation","summary":" Radiology report generation (RRG) aims to automatically generate free-text\ndescriptions from clinical radiographs, e.g., chest X-Ray images. RRG plays an\nessential role in promoting clinical automation and presents significant help\nto provide practical assistance for inexperienced doctors and alleviate\nradiologists' workloads. Therefore, consider these meaningful potentials,\nresearch on RRG is experiencing explosive growth in the past half-decade,\nespecially with the rapid development of deep learning approaches. Existing\nstudies perform RRG from the perspective of enhancing different modalities,\nprovide insights on optimizing the report generation process with elaborated\nfeatures from both visual and textual information, and further facilitate RRG\nwith the cross-modal interactions among them. In this paper, we present a\ncomprehensive review of deep learning-based RRG from various perspectives.\nSpecifically, we firstly cover pivotal RRG approaches based on the\ntask-specific features of radiographs, reports, and the cross-modal relations\nbetween them, and then illustrate the benchmark datasets conventionally used\nfor this task with evaluation metrics, subsequently analyze the performance of\ndifferent approaches and finally offer our summary on the challenges and the\ntrends in future directions. Overall, the goal of this paper is to serve as a\ntool for understanding existing literature and inspiring potential valuable\nresearch in the field of RRG.\n","authors":["Chang Liu","Yuanhe Tian","Yan Song"],"pdf_url":"https://arxiv.org/pdf/2311.14199v1.pdf","comment":"26 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.05784v2","updated":"2023-11-23T20:45:53Z","published":"2023-11-09T23:25:29Z","title":"Are \"Hierarchical\" Visual Representations Hierarchical?","summary":" Learned visual representations often capture large amounts of semantic\ninformation for accurate downstream applications. Human understanding of the\nworld is fundamentally grounded in hierarchy. To mimic this and further improve\nrepresentation capabilities, the community has explored \"hierarchical\" visual\nrepresentations that aim at modeling the underlying hierarchy of the visual\nworld. In this work, we set out to investigate if hierarchical visual\nrepresentations truly capture the human perceived hierarchy better than\nstandard learned representations. To this end, we create HierNet, a suite of 12\ndatasets spanning 3 kinds of hierarchy from the BREEDs subset of ImageNet.\nAfter extensive evaluation of Hyperbolic and Matryoshka Representations across\ntraining setups, we conclude that they do not capture hierarchy any better than\nthe standard representations but can assist in other aspects like search\nefficiency and interpretability. 
Our benchmark and the datasets are\nopen-sourced at https://github.com/ethanlshen/HierNet.\n","authors":["Ethan Shen","Ali Farhadi","Aditya Kusupati"],"pdf_url":"https://arxiv.org/pdf/2311.05784v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14197v1","updated":"2023-11-23T20:41:46Z","published":"2023-11-23T20:41:46Z","title":"Enhancing mTBI Diagnosis with Residual Triplet Convolutional Neural\n Network Using 3D CT","summary":" Mild Traumatic Brain Injury (mTBI) is a common and challenging condition to\ndiagnose accurately. Timely and precise diagnosis is essential for effective\ntreatment and improved patient outcomes. Traditional diagnostic methods for\nmTBI often have limitations in terms of accuracy and sensitivity. In this\nstudy, we introduce an innovative approach to enhance mTBI diagnosis using 3D\nComputed Tomography (CT) images and a metric learning technique trained with\ntriplet loss. To address these challenges, we propose a Residual Triplet\nConvolutional Neural Network (RTCNN) model to distinguish between mTBI cases\nand healthy ones by embedding 3D CT scans into a feature space. The triplet\nloss function maximizes the margin between similar and dissimilar image pairs,\noptimizing feature representations. This facilitates better context placement\nof individual cases, aids informed decision-making, and has the potential to\nimprove patient outcomes. Our RTCNN model shows promising performance in mTBI\ndiagnosis, achieving an average accuracy of 94.3%, a sensitivity of 94.1%, and\na specificity of 95.2%, as confirmed through a five-fold cross-validation.\nImportantly, when compared to the conventional Residual Convolutional Neural\nNetwork (RCNN) model, the RTCNN exhibits a significant improvement, showcasing\na remarkable 22.5% increase in specificity, a notable 16.2% boost in accuracy,\nand an 11.3% enhancement in sensitivity. Moreover, RTCNN requires lower memory\nresources, making it not only highly effective but also resource-efficient in\nminimizing false positives while maximizing its diagnostic accuracy in\ndistinguishing normal CT scans from mTBI cases. The quantitative performance\nmetrics provided and utilization of occlusion sensitivity maps to visually\nexplain the model's decision-making process further enhance the\ninterpretability and transparency of our approach.\n","authors":["Hanem Ellethy","Shekhar S. Chandra","Viktor Vegh"],"pdf_url":"https://arxiv.org/pdf/2311.14197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14189v1","updated":"2023-11-23T20:14:50Z","published":"2023-11-23T20:14:50Z","title":"HACD: Hand-Aware Conditional Diffusion for Monocular Hand-Held Object\n Reconstruction","summary":" Reconstructing hand-held objects from a single RGB image without known 3D\nobject templates, category prior, or depth information is a vital yet\nchallenging problem in computer vision. In contrast to prior works that utilize\ndeterministic modeling paradigms, which make it hard to account for the\nuncertainties introduced by hand- and self-occlusion, we employ a probabilistic\npoint cloud denoising diffusion model to tackle the above challenge. In this\nwork, we present Hand-Aware Conditional Diffusion for monocular hand-held\nobject reconstruction (HACD), modeling the hand-object interaction in two\naspects. First, we introduce hand-aware conditioning to model hand-object\ninteraction from both semantic and geometric perspectives. 
Specifically, a\nunified hand-object semantic embedding compensates for the 2D local feature\ndeficiency induced by hand occlusion, and a hand articulation embedding further\nencodes the relationship between object vertices and hand joints. Second, we\npropose a hand-constrained centroid fixing scheme, which utilizes hand vertices\npriors to restrict the centroid deviation of partially denoised point cloud\nduring diffusion and reverse process. Removing the centroid bias interference\nallows the diffusion models to focus on the reconstruction of shape, thus\nenhancing the stability and precision of local feature projection. Experiments\non the synthetic ObMan dataset and two real-world datasets, HO3D and MOW,\ndemonstrate our approach surpasses all existing methods by a large margin.\n","authors":["Bowen Fu","Yan Di","Chenyangguang Zhang","Gu Wang","Ziqin Huang","Zhiying Leng","Fabian Manhardt","Xiangyang Ji","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2311.14189v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10754v2","updated":"2023-11-23T20:08:49Z","published":"2023-11-06T18:18:42Z","title":"A Recent Survey of the Advancements in Deep Learning Techniques for\n Monkeypox Disease Detection","summary":" Monkeypox (MPox) is a zoonotic infectious disease induced by the MPox Virus,\npart of the poxviridae orthopoxvirus group initially discovered in Africa and\ngained global attention in mid-2022 with cases reported outside endemic areas.\nSymptoms include headaches, chills, fever, smallpox, measles, and\nchickenpox-like skin manifestations and the WHO officially announced MPox as a\nglobal public health pandemic, in July 2022.Traditionally, PCR testing of skin\nlesions is considered a benchmark for the primary diagnosis by WHO, with\nsymptom management as the primary treatment and antiviral drugs like\ntecovirimat for severe cases. However, manual analysis within hospitals poses a\nsubstantial challenge including the substantial burden on healthcare\nprofessionals, limited facilities, availability and fatigue among doctors, and\nhuman error during public health emergencies. Therefore, this survey paper\nprovides an extensive and efficient analysis of deep learning (DL) methods for\nthe automatic detection of MPox in skin lesion images. These DL techniques are\nbroadly grouped into categories, including deep CNN, Deep CNNs ensemble, deep\nhybrid learning, the newly developed, and Vision transformer for diagnosing\nMPox. Moreover, this study offers a systematic exploration of the evolutionary\nprogression of DL techniques and identifies, and addresses limitations in\nprevious methods while highlighting the valuable contributions and innovation.\nAdditionally, the paper addresses benchmark datasets and their collection from\nvarious authentic sources, pre-processing techniques, and evaluation metrics.\nThe survey also briefly delves into emerging concepts, identifies research\ngaps, limitations, and applications, and outlines challenges in the diagnosis\nprocess. 
This survey furnishes valuable insights into the prospective areas of\nDL innovative ideas and is anticipated to serve as a path for researchers.\n","authors":["Saddam Hussain Khan","Rashid Iqbal","Saeeda Naz"],"pdf_url":"https://arxiv.org/pdf/2311.10754v2.pdf","comment":"53 pages, 16 figures, 7 tables"},{"id":"http://arxiv.org/abs/2311.14177v1","updated":"2023-11-23T19:49:59Z","published":"2023-11-23T19:49:59Z","title":"TCuPGAN: A novel framework developed for optimizing human-machine\n interactions in citizen science","summary":" In the era of big data in scientific research, there is a necessity to\nleverage techniques which reduce human effort in labeling and categorizing\nlarge datasets by involving sophisticated machine tools. To combat this\nproblem, we present a novel, general purpose model for 3D segmentation that\nleverages patch-wise adversariality and Long Short-Term Memory to encode\nsequential information. Using this model alongside citizen science projects\nwhich use 3D datasets (image cubes) on the Zooniverse platforms, we propose an\niterative human-machine optimization framework where only a fraction of the 2D\nslices from these cubes are seen by the volunteers. We leverage the patch-wise\ndiscriminator in our model to provide an estimate of which slices within these\nimage cubes have poorly generalized feature representations, and\ncorrespondingly poor machine performance. These images with corresponding\nmachine proposals would be presented to volunteers on Zooniverse for\ncorrection, leading to a drastic reduction in the volunteer effort on citizen\nscience projects. We trained our model on ~2300 liver tissue 3D electron\nmicrographs. Lipid droplets were segmented within these images through human\nannotation via the `Etch A Cell - Fat Checker' citizen science project, hosted\non the Zooniverse platform. In this work, we demonstrate this framework and the\nselection methodology which resulted in a measured reduction in volunteer\neffort by more than 60%. We envision this type of joint human-machine\npartnership will be of great use on future Zooniverse projects.\n","authors":["Ramanakumar Sankar","Kameswara Mantha","Lucy Fortson","Helen Spiers","Thomas Pengo","Douglas Mashek","Myat Mo","Mark Sanders","Trace Christensen","Jeffrey Salisbury","Laura Trouille"],"pdf_url":"https://arxiv.org/pdf/2311.14177v1.pdf","comment":"5 pages, 1 figure, accepted for publication at HLDM '23 (ECML PKDD\n 2023 workshop)"},{"id":"http://arxiv.org/abs/2311.14175v1","updated":"2023-11-23T19:44:50Z","published":"2023-11-23T19:44:50Z","title":"Appearance-based gaze estimation enhanced with synthetic images using\n deep neural networks","summary":" Human eye gaze estimation is an important cognitive ingredient for successful\nhuman-robot interaction, enabling the robot to read and predict human behavior.\nWe approach this problem using artificial neural networks and build a modular\nsystem estimating gaze from separately cropped eyes, taking advantage of\nexisting well-functioning components for face detection (RetinaFace) and head\npose estimation (6DRepNet). Our proposed method does not require any special\nhardware or infrared filters but uses a standard notebook-builtin RGB camera,\nas often approached with appearance-based methods. Using the MetaHuman tool, we\nalso generated a large synthetic dataset of more than 57,000 human faces and\nmade it publicly available. 
The inclusion of this dataset (with eye gaze and\nhead pose information) on top of the standard Columbia Gaze dataset into\ntraining the model led to better accuracy with a mean average error below two\ndegrees in eye pitch and yaw directions, which compares favourably to related\nmethods. We also verified the feasibility of our model by its preliminary\ntesting in real-world setting using the builtin 4K camera in NICO semi-humanoid\nrobot's eye.\n","authors":["Dmytro Herashchenko","Igor Farkaš"],"pdf_url":"https://arxiv.org/pdf/2311.14175v1.pdf","comment":"6 pages, 10 figures, accepted to 2023 IEEE Symposium Series on\n Computational Intelligence"},{"id":"http://arxiv.org/abs/2311.13559v2","updated":"2023-11-23T19:10:01Z","published":"2023-11-22T18:09:42Z","title":"Transfer Learning-based Real-time Handgun Detection","summary":" Traditional surveillance systems rely on human attention, limiting their\neffectiveness. This study employs convolutional neural networks and transfer\nlearning to develop a real-time computer vision system for automatic handgun\ndetection. Comprehensive analysis of online handgun detection methods is\nconducted, emphasizing reducing false positives and learning time. Transfer\nlearning is demonstrated as an effective approach. Despite technical\nchallenges, the proposed system achieves a precision rate of 84.74%,\ndemonstrating promising performance comparable to related works, enabling\nfaster learning and accurate automatic handgun detection for enhanced security.\nThis research advances security measures by reducing human monitoring\ndependence, showcasing the potential of transfer learning-based approaches for\nefficient and reliable handgun detection.\n","authors":["Youssef Elmir","Sid Ahmed Laouar","Larbi Hamdaoui"],"pdf_url":"https://arxiv.org/pdf/2311.13559v2.pdf","comment":"16 pages, 9 figures, and 3 tables. Accepted at The Iraqi Journal of\n Science, issued by College of Science at University of Baghdad"},{"id":"http://arxiv.org/abs/2311.13199v2","updated":"2023-11-23T18:59:55Z","published":"2023-11-22T07:06:38Z","title":"DRIFu: Differentiable Rendering and Implicit Function-based Single-View\n 3D Reconstruction","summary":" The Differentiable Rendering and Implicit Function-based model (DRIFu) draws\nits roots from the Pixel-aligned Implicit Function (PIFU), a pioneering 3D\ndigitization technique initially designed for clothed human bodies. PIFU excels\nin capturing nuanced body shape variations within a low-dimensional space and\nhas been extensively trained on human 3D scans. However, the application of\nPIFU to live animals poses significant challenges, primarily due to the\ninherent difficulty in obtaining the cooperation of animals for 3D scanning. In\nresponse to this challenge, we introduce the DRIFu model, specifically tailored\nfor animal digitization. To train DRIFu, we employ a curated set of synthetic\n3D animal models, encompassing diverse shapes, sizes, and even accounting for\nvariations such as baby birds. Our innovative alignment tools play a pivotal\nrole in mapping these diverse synthetic animal models onto a unified template,\nfacilitating precise predictions of animal shape and texture. Crucially, our\ntemplate alignment strategy establishes a shared shape space, allowing for the\nseamless sampling of new animal shapes, posing them realistically, animating\nthem, and aligning them with real-world data. This groundbreaking approach\nrevolutionizes our capacity to comprehensively understand and represent avian\nforms. 
For further details and access to the project, the project website can\nbe found at https://github.com/kuangzijian/drifu-for-animals\n","authors":["Zijian Kuang","Lihang Ying","Shi Jin","Li Cheng"],"pdf_url":"https://arxiv.org/pdf/2311.13199v2.pdf","comment":"arXiv admin note: text overlap with arXiv:1905.05172 by other authors"},{"id":"http://arxiv.org/abs/2311.14155v1","updated":"2023-11-23T18:55:03Z","published":"2023-11-23T18:55:03Z","title":"GigaPose: Fast and Robust Novel Object Pose Estimation via One\n Correspondence","summary":" We present GigaPose, a fast, robust, and accurate method for CAD-based novel\nobject pose estimation in RGB images. GigaPose first leverages discriminative\ntemplates, rendered images of the CAD models, to recover the out-of-plane\nrotation and then uses patch correspondences to estimate the four remaining\nparameters. Our approach samples templates in only a two-degrees-of-freedom\nspace instead of the usual three and matches the input image to the templates\nusing fast nearest neighbor search in feature space, resulting in a speedup\nfactor of 38x compared to the state of the art. Moreover, GigaPose is\nsignificantly more robust to segmentation errors. Our extensive evaluation on\nthe seven core datasets of the BOP challenge demonstrates that it achieves\nstate-of-the-art accuracy and can be seamlessly integrated with a refinement\nmethod. Additionally, we show the potential of GigaPose with 3D models\npredicted by recent work on 3D reconstruction from a single image, relaxing the\nneed for CAD models and making 6D object pose estimation much more convenient.\nOur source code and trained models are publicly available at\nhttps://github.com/nv-nguyen/gigaPose\n","authors":["Van Nguyen Nguyen","Thibault Groueix","Mathieu Salzmann","Vincent Lepetit"],"pdf_url":"https://arxiv.org/pdf/2311.14155v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14148v1","updated":"2023-11-23T18:37:26Z","published":"2023-11-23T18:37:26Z","title":"Automated 3D Tumor Segmentation using Temporal Cubic PatchGAN (TCuP-GAN)","summary":" Development of robust general purpose 3D segmentation frameworks using the\nlatest deep learning techniques is one of the active topics in various\nbio-medical domains. In this work, we introduce Temporal Cubic PatchGAN\n(TCuP-GAN), a volume-to-volume translational model that marries the concepts of\na generative feature learning framework with Convolutional Long Short-Term\nMemory Networks (LSTMs), for the task of 3D segmentation. We demonstrate the\ncapabilities of our TCuP-GAN on the data from four segmentation challenges\n(Adult Glioma, Meningioma, Pediatric Tumors, and Sub-Saharan Africa subset)\nfeatured within the 2023 Brain Tumor Segmentation (BraTS) Challenge and\nquantify its performance using LesionWise Dice similarity and $95\\%$ Hausdorff\nDistance metrics. We demonstrate the successful learning of our framework to\npredict robust multi-class segmentation masks across all the challenges. 
This\nbenchmarking work serves as a stepping stone for future efforts towards\napplying TCuP-GAN on other multi-class tasks such as multi-organelle\nsegmentation in electron microscopy imaging.\n","authors":["Kameswara Bharadwaj Mantha","Ramanakumar Sankar","Lucy Fortson"],"pdf_url":"https://arxiv.org/pdf/2311.14148v1.pdf","comment":"Submitted as a short paper to the proceedings of the 2023 Brain Tumor\n Segmentation (BraTS) Challenge"},{"id":"http://arxiv.org/abs/2311.14146v1","updated":"2023-11-23T18:35:26Z","published":"2023-11-23T18:35:26Z","title":"Class Balanced Dynamic Acquisition for Domain Adaptive Semantic\n Segmentation using Active Learning","summary":" Domain adaptive active learning is leading the charge in label-efficient\ntraining of neural networks. For semantic segmentation, state-of-the-art models\njointly use two criteria of uncertainty and diversity to select training\nlabels, combined with a pixel-wise acquisition strategy. However, we show that\nsuch methods currently suffer from a class imbalance issue which degrades their\nperformance for larger active learning budgets. We then introduce Class\nBalanced Dynamic Acquisition (CBDA), a novel active learning method that\nmitigates this issue, especially in high-budget regimes. The more balanced\nlabels increase minority class performance, which in turn allows the model to\noutperform the previous baseline by 0.6, 1.7, and 2.4 mIoU for budgets of 5%,\n10%, and 20%, respectively. Additionally, the focus on minority classes leads\nto improvements of the minimum class performance of 0.5, 2.9, and 4.6 IoU\nrespectively. The top-performing model even exceeds the fully supervised\nbaseline, showing that a more balanced label than the entire ground truth can\nbe beneficial.\n","authors":["Marc Schachtsiek","Simone Rossi","Thomas Hannagan"],"pdf_url":"https://arxiv.org/pdf/2311.14146v1.pdf","comment":"NeurIPS 2023 Workshop on Adaptive Experimental Design and Active\n Learning in the Real World"},{"id":"http://arxiv.org/abs/2311.12024v2","updated":"2023-11-23T17:59:42Z","published":"2023-11-20T18:57:55Z","title":"PF-LRM: Pose-Free Large Reconstruction Model for Joint Pose and Shape\n Prediction","summary":" We propose a Pose-Free Large Reconstruction Model (PF-LRM) for reconstructing\na 3D object from a few unposed images even with little visual overlap, while\nsimultaneously estimating the relative camera poses in ~1.3 seconds on a single\nA100 GPU. PF-LRM is a highly scalable method utilizing the self-attention\nblocks to exchange information between 3D object tokens and 2D image tokens; we\npredict a coarse point cloud for each view, and then use a differentiable\nPerspective-n-Point (PnP) solver to obtain camera poses. When trained on a huge\namount of multi-view posed data of ~1M objects, PF-LRM shows strong\ncross-dataset generalization ability, and outperforms baseline methods by a\nlarge margin in terms of pose prediction accuracy and 3D reconstruction quality\non various unseen evaluation datasets. We also demonstrate our model's\napplicability in downstream text/image-to-3D task with fast feed-forward\ninference. 
Our project website is at: https://totoro97.github.io/pf-lrm .\n","authors":["Peng Wang","Hao Tan","Sai Bi","Yinghao Xu","Fujun Luan","Kalyan Sunkavalli","Wenping Wang","Zexiang Xu","Kai Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.12024v2.pdf","comment":"Project website: https://totoro97.github.io/pf-lrm ; add more\n experiments"},{"id":"http://arxiv.org/abs/2305.01569v2","updated":"2023-11-23T17:07:58Z","published":"2023-05-02T16:18:11Z","title":"Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image\n Generation","summary":" The ability to collect a large dataset of human preferences from\ntext-to-image users is usually limited to companies, making such datasets\ninaccessible to the public. To address this issue, we create a web app that\nenables text-to-image users to generate images and specify their preferences.\nUsing this web app we build Pick-a-Pic, a large, open dataset of text-to-image\nprompts and real users' preferences over generated images. We leverage this\ndataset to train a CLIP-based scoring function, PickScore, which exhibits\nsuperhuman performance on the task of predicting human preferences. Then, we\ntest PickScore's ability to perform model evaluation and observe that it\ncorrelates better with human rankings than other automatic evaluation metrics.\nTherefore, we recommend using PickScore for evaluating future text-to-image\ngeneration models, and using Pick-a-Pic prompts as a more relevant dataset than\nMS-COCO. Finally, we demonstrate how PickScore can enhance existing\ntext-to-image models via ranking.\n","authors":["Yuval Kirstain","Adam Polyak","Uriel Singer","Shahbuland Matiana","Joe Penna","Omer Levy"],"pdf_url":"https://arxiv.org/pdf/2305.01569v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.02205v3","updated":"2023-11-23T17:04:26Z","published":"2022-03-04T09:31:20Z","title":"Evaluating Object (mis)Detection from a Safety and Reliability\n Perspective: Discussion and Measures","summary":" We argue that object detectors in the safety critical domain should\nprioritize detection of objects that are most likely to interfere with the\nactions of the autonomous actor. Especially, this applies to objects that can\nimpact the actor's safety and reliability. To quantify the impact of object\n(mis)detection on safety and reliability in the context of autonomous driving,\nwe propose new object detection measures that reward the correct identification\nof objects that are most dangerous and most likely to affect driving decisions.\nTo achieve this, we build an object criticality model to reward the detection\nof the objects based on proximity, orientation, and relative velocity with\nrespect to the subject vehicle. 
Then, we apply our model on the recent\nautonomous driving dataset nuScenes, and we compare nine object detectors.\nResults show that, in several settings, object detectors that perform best\naccording to the nuScenes ranking are not the preferable ones when the focus is\nshifted to safety and reliability.\n","authors":["Andrea Ceccarelli","Leonardo Montecchi"],"pdf_url":"https://arxiv.org/pdf/2203.02205v3.pdf","comment":"journal version, open access"},{"id":"http://arxiv.org/abs/2212.05911v2","updated":"2023-11-23T16:49:58Z","published":"2022-12-07T15:10:40Z","title":"Adaptive Self-Training for Object Detection","summary":" Deep learning has emerged as an effective solution for solving the task of\nobject detection in images but at the cost of requiring large labeled datasets.\nTo mitigate this cost, semi-supervised object detection methods, which consist\nin leveraging abundant unlabeled data, have been proposed and have already\nshown impressive results. However, most of these methods require linking a\npseudo-label to a ground-truth object by thresholding. In previous works, this\nthreshold value is usually determined empirically, which is time consuming, and\nonly done for a single data distribution. When the domain, and thus the data\ndistribution, changes, a new and costly parameter search is necessary. In this\nwork, we introduce our method Adaptive Self-Training for Object Detection\n(ASTOD), which is a simple yet effective teacher-student method. ASTOD\ndetermines without cost a threshold value based directly on the ground value of\nthe score histogram. To improve the quality of the teacher predictions, we also\npropose a novel pseudo-labeling procedure. We use different views of the\nunlabeled images during the pseudo-labeling step to reduce the number of missed\npredictions and thus obtain better candidate labels. Our teacher and our\nstudent are trained separately, and our method can be used in an iterative\nfashion by replacing the teacher by the student. On the MS-COCO dataset, our\nmethod consistently performs favorably against state-of-the-art methods that do\nnot require a threshold parameter, and shows competitive results with methods\nthat require a parameter sweep search. Additional experiments with respect to a\nsupervised baseline on the DIOR dataset containing satellite images lead to\nsimilar conclusions, and prove that it is possible to adapt the score threshold\nautomatically in self-training, regardless of the data distribution. The code\nis available at https://github.com/rvandeghen/ASTOD\n","authors":["Renaud Vandeghen","Gilles Louppe","Marc Van Droogenbroeck"],"pdf_url":"https://arxiv.org/pdf/2212.05911v2.pdf","comment":"10 pages, 4 figures, 5 tables, 1 page of supplementary material"},{"id":"http://arxiv.org/abs/2311.14097v1","updated":"2023-11-23T16:49:06Z","published":"2023-11-23T16:49:06Z","title":"ACT: Adversarial Consistency Models","summary":" Though diffusion models excel in image generation, their step-by-step\ndenoising leads to slow generation speeds. Consistency training addresses this\nissue with single-step sampling but often produces lower-quality generations\nand requires high training costs. In this paper, we show that optimizing\nconsistency training loss minimizes the Wasserstein distance between target and\ngenerated distributions. As timestep increases, the upper bound accumulates\nprevious consistency training losses. Therefore, larger batch sizes are needed\nto reduce both current and accumulated losses. 
We propose Adversarial\nConsistency Training (ACT), which directly minimizes the Jensen-Shannon (JS)\ndivergence between distributions at each timestep using a discriminator.\nTheoretically, ACT enhances generation quality, and convergence. By\nincorporating a discriminator into the consistency training framework, our\nmethod achieves improved FID scores on CIFAR10 and ImageNet 64$\\times$64,\nretains zero-shot image inpainting capabilities, and uses less than $1/6$ of\nthe original batch size and fewer than $1/2$ of the model parameters and\ntraining steps compared to the baseline method, this leads to a substantial\nreduction in resource consumption.\n","authors":["Fei Kong","Jinhao Duan","Lichao Sun","Hao Cheng","Renjing Xu","Hengtao Shen","Xiaofeng Zhu","Xiaoshuang Shi","Kaidi Xu"],"pdf_url":"https://arxiv.org/pdf/2311.14097v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14095v1","updated":"2023-11-23T16:41:30Z","published":"2023-11-23T16:41:30Z","title":"Video Anomaly Detection using GAN","summary":" Accounting for the increased concern for public safety, automatic abnormal\nevent detection and recognition in a surveillance scene is crucial. It is a\ncurrent open study subject because of its intricacy and utility. The\nidentification of aberrant events automatically, it's a difficult undertaking\nbecause everyone's idea of abnormality is different. A typical occurrence in\none circumstance could be seen as aberrant in another. Automatic anomaly\nidentification becomes particularly challenging in the surveillance footage\nwith a large crowd due to congestion and high occlusion. With the use of\nmachine learning techniques, this thesis study aims to offer the solution for\nthis use case so that human resources won't be required to keep an eye out for\nany unusual activity in the surveillance system records. We have developed a\nnovel generative adversarial network (GAN) based anomaly detection model. This\nmodel is trained such that it learns together about constructing a high\ndimensional picture space and determining the latent space from the video's\ncontext. The generator uses a residual Autoencoder architecture made up of a\nmulti-stage channel attention-based decoder and a two-stream, deep\nconvolutional encoder that can realise both spatial and temporal data. We have\nalso offered a technique for refining the GAN model that reduces training time\nwhile also generalising the model by utilising transfer learning between\ndatasets. Using a variety of assessment measures, we compare our model to the\ncurrent state-of-the-art techniques on four benchmark datasets. The empirical\nfindings indicate that, in comparison to existing techniques, our network\nperforms favourably on all datasets.\n","authors":["Anikeit Sethi","Krishanu Saini","Sai Mounika Mididoddi"],"pdf_url":"https://arxiv.org/pdf/2311.14095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12651v2","updated":"2023-11-23T16:38:00Z","published":"2023-11-21T14:53:02Z","title":"Mobile-Seed: Joint Semantic Segmentation and Boundary Detection for\n Mobile Robots","summary":" Precise and rapid delineation of sharp boundaries and robust semantics is\nessential for numerous downstream robotic tasks, such as robot grasping and\nmanipulation, real-time semantic mapping, and online sensor calibration\nperformed on edge computing units. 
Although boundary detection and semantic\nsegmentation are complementary tasks, most studies focus on lightweight models\nfor semantic segmentation but overlook the critical role of boundary detection.\nIn this work, we introduce Mobile-Seed, a lightweight, dual-task framework\ntailored for simultaneous semantic segmentation and boundary detection. Our\nframework features a two-stream encoder, an active fusion decoder (AFD) and a\ndual-task regularization approach. The encoder is divided into two pathways:\none captures category-aware semantic information, while the other discerns\nboundaries from multi-scale features. The AFD module dynamically adapts the\nfusion of semantic and boundary information by learning channel-wise\nrelationships, allowing for precise weight assignment of each channel.\nFurthermore, we introduce a regularization loss to mitigate the conflicts in\ndual-task learning and deep diversity supervision. Compared to existing\nmethods, the proposed Mobile-Seed offers a lightweight framework to\nsimultaneously improve semantic segmentation performance and accurately locate\nobject boundaries. Experiments on the Cityscapes dataset have shown that\nMobile-Seed achieves notable improvement over the state-of-the-art (SOTA)\nbaseline by 2.2 percentage points (pp) in mIoU and 4.2 pp in mF-score, while\nmaintaining an online inference speed of 23.9 frames-per-second (FPS) with\n1024x2048 resolution input on an RTX 2080 Ti GPU. Additional experiments on\nCamVid and PASCAL Context datasets confirm our method's generalizability. Code\nand additional results are publicly available at\nhttps://whu-usi3dv.github.io/Mobile-Seed/.\n","authors":["Youqi Liao","Shuhao Kang","Jianping Li","Yang Liu","Yun Liu","Zhen Dong","Bisheng Yang","Xieyuanli Chen"],"pdf_url":"https://arxiv.org/pdf/2311.12651v2.pdf","comment":"8 pages, IEEE conference/letter underreview. Code and additional\n results are available at: https://github.com/WHU-USI3DV/Mobile-Seed"},{"id":"http://arxiv.org/abs/2305.19621v2","updated":"2023-11-23T16:37:25Z","published":"2023-05-31T07:41:10Z","title":"XTransCT: Ultra-Fast Volumetric CT Reconstruction using Two Orthogonal\n X-Ray Projections for Image-guided Radiation Therapy via a Transformer\n Network","summary":" Computed tomography (CT) scans offer a detailed, three-dimensional\nrepresentation of patients' internal organs. However, conventional CT\nreconstruction techniques necessitate acquiring hundreds or thousands of x-ray\nprojections through a complete rotational scan of the body, making navigation\nor positioning during surgery infeasible. In image-guided radiation therapy, a\nmethod that reconstructs ultra-sparse X-ray projections into CT images, we can\nexploit the substantially reduced radiation dose and minimize equipment burden\nfor localization and navigation. In this study, we introduce a novel\nTransformer architecture, termed XTransCT, devised to facilitate real-time\nreconstruction of CT images from two-dimensional X-ray images. We assess our\napproach regarding image quality and structural reliability using a dataset of\nfifty patients, supplied by a hospital, as well as the larger public dataset\nLIDC-IDRI, which encompasses thousands of patients. Additionally, we validated\nour algorithm's generalizability on the LNDb dataset. Our findings indicate\nthat our algorithm surpasses other methods in image quality, structural\nprecision, and generalizability. 
Moreover, in comparison to previous 3D\nconvolution-based approaches, we note a substantial speed increase of\napproximately 300 %, achieving 44 ms per 3D image reconstruction.\n","authors":["Chulong Zhang","Lin Liu","Jingjing Dai","Xuan Liu","Wenfeng He","Yinping Chan","Yaoqin Xie","Feng Chi","Xiaokun Liang"],"pdf_url":"https://arxiv.org/pdf/2305.19621v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14090v1","updated":"2023-11-23T16:36:03Z","published":"2023-11-23T16:36:03Z","title":"Class Uncertainty: A Measure to Mitigate Class Imbalance","summary":" Class-wise characteristics of training examples affect the performance of\ndeep classifiers. A well-studied example is when the number of training\nexamples of classes follows a long-tailed distribution, a situation that is\nlikely to yield sub-optimal performance for under-represented classes. This\nclass imbalance problem is conventionally addressed by approaches relying on\nthe class-wise cardinality of training examples, such as data resampling. In\nthis paper, we demonstrate that considering solely the cardinality of classes\ndoes not cover all issues causing class imbalance. To measure class imbalance,\nwe propose \"Class Uncertainty\" as the average predictive uncertainty of the\ntraining examples, and we show that this novel measure captures the differences\nacross classes better than cardinality. We also curate SVCI-20 as a novel\ndataset in which the classes have equal number of training examples but they\ndiffer in terms of their hardness; thereby causing a type of class imbalance\nwhich cannot be addressed by the approaches relying on cardinality. We\nincorporate our \"Class Uncertainty\" measure into a diverse set of ten class\nimbalance mitigation methods to demonstrate its effectiveness on long-tailed\ndatasets as well as on our SVCI-20. Code and datasets will be made available.\n","authors":["Z. S. Baltaci","K. Oksuz","S. Kuzucu","K. Tezoren","B. K. Konar","A. Ozkan","E. Akbas","S. Kalkan"],"pdf_url":"https://arxiv.org/pdf/2311.14090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14086v1","updated":"2023-11-23T16:24:26Z","published":"2023-11-23T16:24:26Z","title":"Brain MRI Screening Tool with Federated Learning","summary":" In clinical practice, we often see significant delays between MRI scans and\nthe diagnosis made by radiologists, even for severe cases. In some cases, this\nmay be caused by the lack of additional information and clues, so even the\nsevere cases need to wait in the queue for diagnosis. This can be avoided if\nthere is an automatic software tool, which would supplement additional\ninformation, alerting radiologists that the particular patient may be a severe\ncase.\n We are presenting an automatic brain MRI Screening Tool and we are\ndemonstrating its capabilities for detecting tumor-like pathologies. It is the\nfirst version on the path toward a robust multi-pathology screening solution.\nThe tool supports Federated Learning, so multiple institutions may contribute\nto the model without disclosing their private data.\n","authors":["Roman Stoklasa","Ioannis Stathopoulos","Efstratios Karavasilis","Efstathios Efstathopoulos","Marek Dostál","Miloš Keřkovský","Michal Kozubek","Luigi Serio"],"pdf_url":"https://arxiv.org/pdf/2311.14086v1.pdf","comment":"5 pages, 2 figures. 
Submitted to ISBI 2024 conference"},{"id":"http://arxiv.org/abs/2311.14084v1","updated":"2023-11-23T16:22:58Z","published":"2023-11-23T16:22:58Z","title":"AI-Generated Images Introduce Invisible Relevance Bias to Text-Image\n Retrieval","summary":" With the advancement of generation models, AI-generated content (AIGC) is\nbecoming more realistic, flooding the Internet. A recent study suggests that\nthis phenomenon has elevated the issue of source bias in text retrieval for web\nsearches. Specifically, neural retrieval models tend to rank generated texts\nhigher than human-written texts. In this paper, we extend the study of this\nbias to cross-modal retrieval. Firstly, we successfully construct a suitable\nbenchmark to explore the existence of the bias. Subsequent extensive\nexperiments on this benchmark reveal that AI-generated images introduce an\ninvisible relevance bias to text-image retrieval models. Specifically, our\nexperiments show that text-image retrieval models tend to rank the AI-generated\nimages higher than the real images, even though the AI-generated images do not\nexhibit more visually relevant features to the query than real images. This\ninvisible relevance bias is prevalent across retrieval models with varying\ntraining data and architectures. Furthermore, our subsequent exploration\nreveals that the inclusion of AI-generated images in the training data of the\nretrieval models exacerbates the invisible relevance bias. The above phenomenon\ntriggers a vicious cycle, which makes the invisible relevance bias become more\nand more serious. To elucidate the potential causes of invisible relevance and\naddress the aforementioned issues, we introduce an effective training method\naimed at alleviating the invisible relevance bias. Subsequently, we apply our\nproposed debiasing method to retroactively identify the causes of invisible\nrelevance, revealing that the AI-generated images induce the image encoder to\nembed additional information into their representation. This information\nexhibits a certain consistency across generated images with different semantics\nand can make the retriever estimate a higher relevance score.\n","authors":["Shicheng Xu","Danyang Hou","Liang Pang","Jingcheng Deng","Jun Xu","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2311.14084v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2311.14081v1","updated":"2023-11-23T16:19:59Z","published":"2023-11-23T16:19:59Z","title":"You Only Explain Once","summary":" In this paper, we propose a new black-box explainability algorithm and tool,\nYO-ReX, for efficient explanation of the outputs of object detectors. The new\nalgorithm computes explanations for all objects detected in the image\nsimultaneously. Hence, compared to the baseline, the new algorithm reduces the\nnumber of queries by a factor of 10X for the case of ten detected objects. The\nspeedup increases further with the number of objects. Our experimental\nresults demonstrate that YO-ReX can explain the outputs of YOLO with a\nnegligible overhead over the running time of YOLO. We also demonstrate similar\nresults for explaining SSD and Faster R-CNN. The speedup is achieved by\navoiding backtracking by combining aggressive pruning with a causal analysis.\n","authors":["David A. 
Kelly","Hana Chockler","Daniel Kroening","Nathan Blake","Aditi Ramaswamy","Melane Navaratnarajah","Aaditya Shivakumar"],"pdf_url":"https://arxiv.org/pdf/2311.14081v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14073v1","updated":"2023-11-23T16:04:41Z","published":"2023-11-23T16:04:41Z","title":"Learning Saliency From Fixations","summary":" We present a novel approach for saliency prediction in images, leveraging\nparallel decoding in transformers to learn saliency solely from fixation maps.\nModels typically rely on continuous saliency maps, to overcome the difficulty\nof optimizing for the discrete fixation map. We attempt to replicate the\nexperimental setup that generates saliency datasets. Our approach treats\nsaliency prediction as a direct set prediction problem, via a global loss that\nenforces unique fixations prediction through bipartite matching and a\ntransformer encoder-decoder architecture. By utilizing a fixed set of learned\nfixation queries, the cross-attention reasons over the image features to\ndirectly output the fixation points, distinguishing it from other modern\nsaliency predictors. Our approach, named Saliency TRansformer (SalTR), achieves\nmetric scores on par with state-of-the-art approaches on the Salicon and MIT300\nbenchmarks.\n","authors":["Yasser Abdelaziz Dahou Djilali","Kevin McGuiness","Noel O'Connor"],"pdf_url":"https://arxiv.org/pdf/2311.14073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10192v2","updated":"2023-11-23T15:44:52Z","published":"2023-08-20T07:45:03Z","title":"EDDense-Net: Fully Dense Encoder Decoder Network for Joint Segmentation\n of Optic Cup and Disc","summary":" Glaucoma is an eye disease that causes damage to the optic nerve, which can\nlead to visual loss and permanent blindness. Early glaucoma detection is\ntherefore critical in order to avoid permanent blindness. The estimation of the\ncup-to-disc ratio (CDR) during an examination of the optical disc (OD) is used\nfor the diagnosis of glaucoma. In this paper, we present the EDDense-Net\nsegmentation network for the joint segmentation of OC and OD. The encoder and\ndecoder in this network are made up of dense blocks with a grouped\nconvolutional layer in each block, allowing the network to acquire and convey\nspatial information from the image while simultaneously reducing the network's\ncomplexity. To reduce spatial information loss, the optimal number of filters\nin all convolution layers were utilised. In semantic segmentation, dice pixel\nclassification is employed in the decoder to alleviate the problem of class\nimbalance. The proposed network was evaluated on two publicly available\ndatasets where it outperformed existing state-of-the-art methods in terms of\naccuracy and efficiency. For the diagnosis and analysis of glaucoma, this\nmethod can be used as a second opinion system to assist medical\nophthalmologists.\n","authors":["Mehwish Mehmood","Khuram Naveed","Khursheed Aurangzeb","Haroon Ahmed Khan","Musaed Alhussein","Syed Saud Naqvi"],"pdf_url":"https://arxiv.org/pdf/2308.10192v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14064v1","updated":"2023-11-23T15:42:42Z","published":"2023-11-23T15:42:42Z","title":"HGCLIP: Exploring Vision-Language Models with Graph Representations for\n Hierarchical Understanding","summary":" Object categories are typically organized into a multi-granularity taxonomic\nhierarchy. 
When classifying categories at different hierarchy levels,\ntraditional uni-modal approaches focus primarily on image features, revealing\nlimitations in complex scenarios. Recent studies integrating Vision-Language\nModels (VLMs) with class hierarchies have shown promise, yet they fall short of\nfully exploiting the hierarchical relationships. These efforts are constrained\nby their inability to perform effectively across varied granularity of\ncategories. To tackle this issue, we propose a novel framework (HGCLIP) that\neffectively combines CLIP with a deeper exploitation of the Hierarchical class\nstructure via Graph representation learning. We explore constructing the class\nhierarchy into a graph, with its nodes representing the textual or image\nfeatures of each category. After passing through a graph encoder, the textual\nfeatures incorporate hierarchical structure information, while the image\nfeatures emphasize class-aware features derived from prototypes through the\nattention mechanism. Our approach demonstrates significant improvements on both\ngeneric and fine-grained visual recognition benchmarks. Our codes are fully\navailable at https://github.com/richard-peng-xia/HGCLIP.\n","authors":["Peng Xia","Xingtong Yu","Ming Hu","Lie Ju","Zhiyong Wang","Peibo Duan","Zongyuan Ge"],"pdf_url":"https://arxiv.org/pdf/2311.14064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14063v1","updated":"2023-11-23T15:42:00Z","published":"2023-11-23T15:42:00Z","title":"Do VSR Models Generalize Beyond LRS3?","summary":" The Lip Reading Sentences-3 (LRS3) benchmark has primarily been the focus of\nintense research in visual speech recognition (VSR) during the last few years.\nAs a result, there is an increased risk of overfitting to its excessively used\ntest set, which is only one hour duration. To alleviate this issue, we build a\nnew VSR test set named WildVSR, by closely following the LRS3 dataset creation\nprocesses. We then evaluate and analyse the extent to which the current VSR\nmodels generalize to the new test data. We evaluate a broad range of publicly\navailable VSR models and find significant drops in performance on our test set,\ncompared to their corresponding LRS3 results. Our results suggest that the\nincrease in word error rates is caused by the models inability to generalize to\nslightly harder and in the wild lip sequences than those found in the LRS3 test\nset. Our new test benchmark is made public in order to enable future research\ntowards more robust VSR models.\n","authors":["Yasser Abdelaziz Dahou Djilali","Sanath Narayan","Eustache Le Bihan","Haithem Boussaid","Ebtessam Almazrouei","Merouane Debbah"],"pdf_url":"https://arxiv.org/pdf/2311.14063v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14062v1","updated":"2023-11-23T15:38:13Z","published":"2023-11-23T15:38:13Z","title":"Hardware Resilience Properties of Text-Guided Image Classifiers","summary":" This paper presents a novel method to enhance the reliability of image\nclassification models during deployment in the face of transient hardware\nerrors. By utilizing enriched text embeddings derived from GPT-3 with question\nprompts per class and CLIP pretrained text encoder, we investigate their impact\nas an initialization for the classification layer. Our approach achieves a\nremarkable $5.5\\times$ average increase in hardware reliability (and up to 14x)\nacross various architectures in the most critical layer, with minimal accuracy\ndrop (0.3% on average) compared to baseline PyTorch models. 
Furthermore, our\nmethod seamlessly integrates with any image classification backbone, showcases\nresults across various network architectures, decreases parameter and FLOPs\noverhead, and follows a consistent training recipe. This research offers a\npractical and efficient solution to bolster the robustness of image\nclassification models against hardware failures, with potential implications\nfor future studies in this domain. Our code and models are released at\nhttps://github.com/TalalWasim/TextGuidedResilience.\n","authors":["Syed Talal Wasim","Kabila Haile Saboka","Abdulrahman Mahmoud","Salman Khan","David Brooks","Gu-Yeon Wei"],"pdf_url":"https://arxiv.org/pdf/2311.14062v1.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2305.12476v3","updated":"2023-11-23T15:22:39Z","published":"2023-05-21T14:40:48Z","title":"Zero-shot Visual Relation Detection via Composite Visual Cues from Large\n Language Models","summary":" Pretrained vision-language models, such as CLIP, have demonstrated strong\ngeneralization capabilities, making them promising tools in the realm of\nzero-shot visual recognition. Visual relation detection (VRD) is a typical task\nthat identifies relationship (or interaction) types between object pairs within\nan image. However, naively utilizing CLIP with prevalent class-based prompts\nfor zero-shot VRD has several weaknesses, e.g., it struggles to distinguish\nbetween different fine-grained relation types and it neglects essential spatial\ninformation of two objects. To this end, we propose a novel method for\nzero-shot VRD: RECODE, which solves RElation detection via COmposite\nDEscription prompts. Specifically, RECODE first decomposes each predicate\ncategory into subject, object, and spatial components. Then, it leverages large\nlanguage models (LLMs) to generate description-based prompts (or visual cues)\nfor each component. Different visual cues enhance the discriminability of\nsimilar relation categories from different perspectives, which significantly\nboosts performance in VRD. To dynamically fuse different cues, we further\nintroduce a chain-of-thought method that prompts LLMs to generate reasonable\nweights for different visual cues. Extensive experiments on four VRD benchmarks\nhave demonstrated the effectiveness and interpretability of RECODE.\n","authors":["Lin Li","Jun Xiao","Guikun Chen","Jian Shao","Yueting Zhuang","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2305.12476v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14049v1","updated":"2023-11-23T15:05:12Z","published":"2023-11-23T15:05:12Z","title":"Assessment of Deep Learning Segmentation for Real-Time Free-Breathing\n Cardiac Magnetic Resonance Imaging","summary":" In recent years, a variety of deep learning networks for cardiac MRI (CMR)\nsegmentation have been developed and analyzed. However, nearly all of them are\nfocused on cine CMR under breathold. In this work, accuracy of deep learning\nmethods is assessed for volumetric analysis (via segmentation) of the left\nventricle in real-time free-breathing CMR at rest and under exercise stress.\nData from healthy volunteers (n=15) for cine and real-time free-breathing CMR\nwere analyzed retrospectively. Segmentations of a commercial software (comDL)\nand a freely available neural network (nnU-Net), were compared to a reference\ncreated via the manual correction of comDL segmentation. 
Segmentation of left\nventricular endocardium (LV), left ventricular myocardium (MYO), and right\nventricle (RV) is evaluated for both end-systolic and end-diastolic phases and\nanalyzed with Dice's coefficient (DC). The volumetric analysis includes LV\nend-diastolic volume (EDV), LV end-systolic volume (ESV), and LV ejection\nfraction (EF). For cine CMR, nnU-Net and comDL achieve a DC above 0.95 for LV\nand 0.9 for MYO, and RV. For real-time CMR, the accuracy of nnU-Net exceeds\nthat of comDL overall. For real-time CMR at rest, nnU-Net achieves a DC of 0.94\nfor LV, 0.89 for MYO, and 0.90 for RV; mean absolute differences between\nnnU-Net and reference are 2.9mL for EDV, 3.5mL for ESV and 2.6% for EF. For\nreal-time CMR under exercise stress, nnU-Net achieves a DC of 0.92 for LV, 0.85\nfor MYO, and 0.83 for RV; mean absolute differences between nnU-Net and\nreference are 11.4mL for EDV, 2.9mL for ESV and 3.6% for EF. Deep learning\nmethods designed or trained for cine CMR segmentation can perform well on\nreal-time CMR. For real-time free-breathing CMR at rest, the performance of\ndeep learning methods is comparable to inter-observer variability in cine CMR\nand is usable for fully automatic segmentation.\n","authors":["Martin Schilling","Christina Unterberg-Buchwald","Joachim Lotz","Martin Uecker"],"pdf_url":"https://arxiv.org/pdf/2311.14049v1.pdf","comment":"*These authors contributed equally to this work"},{"id":"http://arxiv.org/abs/2311.14029v1","updated":"2023-11-23T14:33:53Z","published":"2023-11-23T14:33:53Z","title":"Understanding the Vulnerability of CLIP to Image Compression","summary":" CLIP is a widely used foundational vision-language model that is used for\nzero-shot image recognition and other image-text alignment tasks. We\ndemonstrate that CLIP is vulnerable to change in image quality under\ncompression. This surprising result is further analysed using an attribution\nmethod-Integrated Gradients. Using this attribution method, we are able to\nbetter understand both quantitatively and qualitatively exactly the nature in\nwhich the compression affects the zero-shot recognition accuracy of this model.\nWe evaluate this extensively on CIFAR-10 and STL-10. Our work provides the\nbasis to understand this vulnerability of CLIP and can help us develop more\neffective methods to improve the robustness of CLIP and other vision-language\nmodels.\n","authors":["Cangxiong Chen","Vinay P. Namboodiri","Julian Padget"],"pdf_url":"https://arxiv.org/pdf/2311.14029v1.pdf","comment":"R0-FoMo: Workshop on Robustness of Few-shot and Zero-shot Learning in\n Foundation Models at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.14028v1","updated":"2023-11-23T14:33:03Z","published":"2023-11-23T14:33:03Z","title":"Continual Learning of Diffusion Models with Generative Distillation","summary":" Diffusion models are powerful generative models that achieve state-of-the-art\nperformance in tasks such as image synthesis. However, training them demands\nsubstantial amounts of data and computational resources. Continual learning\nwould allow for incrementally learning new tasks and accumulating knowledge,\nthus reusing already trained models would be possible. One potentially suitable\napproach is generative replay, where a copy of a generative model trained on\nprevious tasks produces synthetic data that are interleaved with data from the\ncurrent task. However, standard generative replay applied to diffusion models\nresults in a catastrophic loss in denoising capabilities. 
In this paper, we\npropose generative distillation, an approach that distils the entire reverse\nprocess of a diffusion model. We demonstrate that our approach significantly\nimproves the continual learning performance of generative replay with only a\nmoderate increase in the computational costs.\n","authors":["Sergi Masip","Pau Rodriguez","Tinne Tuytelaars","Gido M. van de Ven"],"pdf_url":"https://arxiv.org/pdf/2311.14028v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14024v1","updated":"2023-11-23T14:28:28Z","published":"2023-11-23T14:28:28Z","title":"Creating and Benchmarking a Synthetic Dataset for Cloud Optical\n Thickness Estimation","summary":" Cloud formations often obscure optical satellite-based monitoring of the\nEarth's surface, thus limiting Earth observation (EO) activities such as land\ncover mapping, ocean color analysis, and cropland monitoring. The integration\nof machine learning (ML) methods within the remote sensing domain has\nsignificantly improved performance on a wide range of EO tasks, including cloud\ndetection and filtering, but there is still much room for improvement. A key\nbottleneck is that ML methods typically depend on large amounts of annotated\ndata for training, which is often difficult to come by in EO contexts. This is\nespecially true for the task of cloud optical thickness (COT) estimation. A\nreliable estimation of COT enables more fine-grained and application-dependent\ncontrol compared to using pre-specified cloud categories, as is commonly done\nin practice. To alleviate the COT data scarcity problem, in this work we\npropose a novel synthetic dataset for COT estimation, where top-of-atmosphere\nradiances have been simulated for 12 of the spectral bands of the\nMulti-Spectral Instrument (MSI) sensor onboard Sentinel-2 platforms. These data\npoints have been simulated under consideration of different cloud types, COTs,\nand ground surface and atmospheric profiles. Extensive experimentation of\ntraining several ML models to predict COT from the measured reflectivity of the\nspectral bands demonstrates the usefulness of our proposed dataset.\nGeneralization to real data is also demonstrated on two satellite image\ndatasets -- one that is publicly available, and one which we have collected and\nannotated. The synthetic data, the newly collected real dataset, code and\nmodels have been made publicly available at\nhttps://github.com/aleksispi/ml-cloud-opt-thick.\n","authors":["Aleksis Pirinen","Nosheen Abid","Nuria Agues Paszkowsky","Thomas Ohlson Timoudas","Ronald Scheirer","Chiara Ceccobello","György Kovács","Anders Persson"],"pdf_url":"https://arxiv.org/pdf/2311.14024v1.pdf","comment":"Code, data and models available at\n https://github.com/aleksispi/ml-cloud-opt-thick"},{"id":"http://arxiv.org/abs/2311.14012v1","updated":"2023-11-23T14:07:35Z","published":"2023-11-23T14:07:35Z","title":"Shadow: A Novel Loss Function for Efficient Training in Siamese Networks","summary":" Despite significant recent advances in similarity detection tasks, existing\napproaches pose substantial challenges under memory constraints. One of the\nprimary reasons for this is the use of computationally expensive metric\nlearning loss functions such as Triplet Loss in Siamese networks. In this\npaper, we present a novel loss function called Shadow Loss that compresses the\ndimensions of an embedding space during loss calculation without loss of\nperformance. 
The distance between the projections of the embeddings is learned\nfrom inputs on a compact projection space where distances directly correspond\nto a measure of class similarity. Projecting on a lower-dimension projection\nspace, our loss function converges faster, and the resulting classified image\nclusters have higher inter-class and smaller intra-class distances. Shadow Loss\nnot only reduces embedding dimensions favoring memory constraint devices but\nalso consistently performs better than the state-of-the-art Triplet Margin Loss\nby an accuracy of 5\\%-10\\% across diverse datasets. The proposed loss function\nis also model agnostic, upholding its performance across several tested models.\nIts effectiveness and robustness across balanced, imbalanced, medical, and\nnon-medical image datasets suggests that it is not specific to a particular\nmodel or dataset but demonstrates superior performance consistently while using\nless memory and computation.\n","authors":["Alif Elham Khan","Mohammad Junayed Hasan","Humayra Anjum","Nabeel Mohammed"],"pdf_url":"https://arxiv.org/pdf/2311.14012v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14006v1","updated":"2023-11-23T13:43:14Z","published":"2023-11-23T13:43:14Z","title":"High-resolution Population Maps Derived from Sentinel-1 and Sentinel-2","summary":" Detailed population maps play an important role in diverse fields ranging\nfrom humanitarian action to urban planning. Generating such maps in a timely\nand scalable manner presents a challenge, especially in data-scarce regions. To\naddress it we have developed POPCORN, a population mapping method whose only\ninputs are free, globally available satellite images from Sentinel-1 and\nSentinel-2; and a small number of aggregate population counts over coarse\ncensus districts for calibration. Despite the minimal data requirements our\napproach surpasses the mapping accuracy of existing schemes, including several\nthat rely on building footprints derived from high-resolution imagery. E.g., we\nwere able to produce population maps for Rwanda with 100m GSD based on less\nthan 400 regional census counts. In Kigali, those maps reach an $R^2$ score of\n66% w.r.t. a ground truth reference map, with an average error of only $\\pm$10\ninhabitants/ha. Conveniently, POPCORN retrieves explicit maps of built-up areas\nand of local building occupancy rates, making the mapping process interpretable\nand offering additional insights, for instance about the distribution of\nbuilt-up, but unpopulated areas, e.g., industrial warehouses. Moreover, we find\nthat, once trained, the model can be applied repeatedly to track population\nchanges; and that it can be transferred to geographically similar regions,\ne.g., from Uganda to Rwanda). With our work we aim to democratize access to\nup-to-date and high-resolution population maps, recognizing that some regions\nfaced with particularly strong population dynamics may lack the resources for\ncostly micro-census campaigns.\n","authors":["Nando Metzger","Rodrigo Caye Daudt","Devis Tuia","Konrad Schindler"],"pdf_url":"https://arxiv.org/pdf/2311.14006v1.pdf","comment":"17 pages, 10 tables, 7 Figures"},{"id":"http://arxiv.org/abs/2311.13997v1","updated":"2023-11-23T13:32:06Z","published":"2023-11-23T13:32:06Z","title":"GRJointNET: Synergistic Completion and Part Segmentation on 3D\n Incomplete Point Clouds","summary":" Segmentation of three-dimensional (3D) point clouds is an important task for\nautonomous systems. 
However, the success of segmentation algorithms depends greatly\non the quality of the underlying point clouds (resolution, completeness etc.).\nIn particular, incomplete point clouds might reduce a downstream model's\nperformance. GRNet is proposed as a novel and recent deep learning solution to\ncomplete point clouds, but it is not capable of part segmentation. On the other\nhand, our proposed solution, GRJointNet, is an architecture that can perform\njoint completion and segmentation on point clouds as a successor of GRNet.\nFeatures extracted for the two tasks are also utilized by each other to\nincrease the overall performance. We evaluated our proposed network on the\nShapeNet-Part dataset and compared its performance to GRNet. Our results\ndemonstrate GRJointNet can outperform GRNet on point completion. It should also\nbe noted that GRNet is not capable of segmentation while GRJointNet is. This\nstudy, therefore, holds promise to enhance the practicality and utility of point\nclouds in 3D vision for autonomous systems.\n","authors":["Yigit Gurses","Melisa Taspinar","Mahmut Yurt","Sedat Ozer"],"pdf_url":"https://arxiv.org/pdf/2311.13997v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13993v1","updated":"2023-11-23T13:20:42Z","published":"2023-11-23T13:20:42Z","title":"EIGEN: Expert-Informed Joint Learning Aggregation for High-Fidelity\n Information Extraction from Document Images","summary":" Information Extraction (IE) from document images is challenging due to the\nhigh variability of layout formats. Deep models such as LayoutLM and BROS have\nbeen proposed to address this problem and have shown promising results.\nHowever, they still require a large amount of field-level annotations for\ntraining these models. Other approaches using rule-based methods have also been\nproposed based on the understanding of the layout and semantics of a form such\nas geometric position, or type of the fields, etc. In this work, we propose a\nnovel approach, EIGEN (Expert-Informed Joint Learning aGgrEatioN), which\ncombines rule-based methods with deep learning models using data programming\napproaches to circumvent the requirement of annotation of large amounts of\ntraining data. Specifically, EIGEN consolidates weak labels induced from\nmultiple heuristics through generative models and uses them along with a small\nnumber of annotated labels to jointly train a deep model. In our framework, we\npropose the use of labeling functions that include incorporating contextual\ninformation thus capturing the visual and language context of a word for\naccurate categorization. We empirically show that our EIGEN framework can\nsignificantly improve the performance of state-of-the-art deep models with the\navailability of very few labeled data instances. The source code is available\nat\nhttps://github.com/ayushayush591/EIGEN-High-Fidelity-Extraction-Document-Images.\n","authors":["Abhishek Singh","Venkatapathy Subramanian","Ayush Maheshwari","Pradeep Narayan","Devi Prasad Shetty","Ganesh Ramakrishnan"],"pdf_url":"https://arxiv.org/pdf/2311.13993v1.pdf","comment":"In Proceedings of ML for Health Conference, 2023 (co-located with\n Neurips)"},{"id":"http://arxiv.org/abs/2311.13986v1","updated":"2023-11-23T13:07:21Z","published":"2023-11-23T13:07:21Z","title":"FViT-Grasp: Grasping Objects With Using Fast Vision Transformers","summary":" This study addresses the challenge of manipulation, a prominent issue in\nrobotics. 
We have devised a novel methodology for swiftly and precisely\nidentifying the optimal grasp point for a robot to manipulate an object. Our\napproach leverages a Fast Vision Transformer (FViT), a type of neural network\ndesigned for processing visual data and predicting the most suitable grasp\nlocation. Demonstrating state-of-the-art performance in terms of speed while\nmaintaining a high level of accuracy, our method holds promise for potential\ndeployment in real-time robotic grasping applications. We believe that this\nstudy provides a baseline for future research in vision-based robotic grasp\napplications. Its high speed and accuracy bring researchers closer to real-life\napplications.\n","authors":["Arda Sarp Yenicesu","Berk Cicek","Ozgur S. Oguz"],"pdf_url":"https://arxiv.org/pdf/2311.13986v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.11952v2","updated":"2023-11-23T12:54:39Z","published":"2022-05-24T10:32:32Z","title":"3D helical CT Reconstruction with a Memory Efficient Learned Primal-Dual\n Architecture","summary":" Deep learning based computed tomography (CT) reconstruction has demonstrated\noutstanding performance on simulated 2D low-dose CT data. This applies in\nparticular to domain adapted neural networks, which incorporate a handcrafted\nphysics model for CT imaging. Empirical evidence shows that employing such\narchitectures reduces the demand for training data and improves upon\ngeneralisation. However, their training requires large computational resources\nthat quickly become prohibitive in 3D helical CT, which is the most common\nacquisition geometry used for medical imaging. Furthermore, clinical data also\ncomes with other challenges not accounted for in simulations, like errors in\nflux measurement, resolution mismatch and, most importantly, the absence of the\nreal ground truth. The necessity to have a computationally feasible training\ncombined with the need to address these issues has made it difficult to\nevaluate deep learning based reconstruction on clinical 3D helical CT. This\npaper modifies a domain adapted neural network architecture, the Learned\nPrimal-Dual (LPD), so that it can be trained and applied to reconstruction in\nthis setting. We achieve this by splitting the helical trajectory into sections\nand applying the unrolled LPD iterations to those sections sequentially. To the\nbest of our knowledge, this work is the first to apply an unrolled deep\nlearning architecture for reconstruction on full-sized clinical data, like\nthose in the Low dose CT image and projection data set (LDCT). Moreover,\ntraining and testing is done on a single GPU card with 24GB of memory.\n","authors":["Jevgenija Rudzusika","Buda Bajić","Thomas Koehler","Ozan Öktem"],"pdf_url":"https://arxiv.org/pdf/2205.11952v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13976v1","updated":"2023-11-23T12:42:52Z","published":"2023-11-23T12:42:52Z","title":"Low Latency Instance Segmentation by Continuous Clustering for Rotating\n LiDAR Sensors","summary":" Low-latency instance segmentation of LiDAR point clouds is crucial in\nreal-world applications because it serves as an initial and frequently-used\nbuilding block in a robot's perception pipeline, where every task adds further\ndelay. Particularly in dynamic environments, this total delay can result in\nsignificant positional offsets of dynamic objects, as seen in highway\nscenarios. To address this issue, we employ continuous clustering of obstacle\npoints in order to obtain an instance-segmented point cloud. 
Unlike most\nexisting approaches, which use a full revolution of the LiDAR sensor, we\nprocess the data stream in a continuous and seamless fashion. More\nspecifically, each column of a range image is processed as soon as it is\navailable. Obstacle points are clustered to existing instances in real-time and\nit is checked at a high frequency which instances are completed and are ready\nto be published. An additional advantage is that no problematic discontinuities\nbetween the points of the start and the end of a scan are observed. In this\nwork we describe the two-layered data structure and the corresponding algorithm\nfor continuous clustering, which is able to cluster the incoming data in real\ntime. We explain the importance of a large perceptive field of view.\nFurthermore, we describe and evaluate important architectural design choices,\nwhich could be relevant to design an architecture for deep learning based\nlow-latency instance segmentation. We are publishing the source code at\nhttps://github.com/UniBwTAS/continuous_clustering.\n","authors":["Andreas Reich","Hans-Joachim Wuensche"],"pdf_url":"https://arxiv.org/pdf/2311.13976v1.pdf","comment":"Accompanying Video: https://www.youtube.com/watch?v=DZKuAQBngNE"},{"id":"http://arxiv.org/abs/2207.11209v4","updated":"2023-11-23T12:40:56Z","published":"2022-07-22T17:19:00Z","title":"Divide and Conquer: 3D Point Cloud Instance Segmentation With Point-Wise\n Binarization","summary":" Instance segmentation on point clouds is crucially important for 3D scene\nunderstanding. Most SOTAs adopt distance clustering, which is typically\neffective but does not perform well in segmenting adjacent objects with the\nsame semantic label (especially when they share neighboring points). Due to the\nuneven distribution of offset points, these existing methods can hardly cluster\nall instance points. To this end, we design a novel divide-and-conquer strategy\nnamed PBNet that binarizes each point and clusters them separately to segment\ninstances. Our binary clustering divides offset instance points into two\ncategories: high and low density points (HPs vs. LPs). Adjacent objects can be\nclearly separated by removing LPs, and then be completed and refined by\nassigning LPs via a neighbor voting method. To suppress potential\nover-segmentation, we propose to construct local scenes with the weight mask\nfor each instance. As a plug-in, the proposed binary clustering can replace\ntraditional distance clustering and lead to consistent performance gains on\nmany mainstream baselines. A series of experiments on ScanNetV2 and S3DIS\ndatasets indicate the superiority of our model. In particular, PBNet ranks\nfirst on the ScanNetV2 official benchmark challenge, achieving the highest mAP.\nCode will be available publicly at https://github.com/weiguangzhao/PBNet.\n","authors":["Weiguang Zhao","Yuyao Yan","Chaolong Yang","Jianan Ye","Xi Yang","Kaizhu Huang"],"pdf_url":"https://arxiv.org/pdf/2207.11209v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13964v1","updated":"2023-11-23T12:26:08Z","published":"2023-11-23T12:26:08Z","title":"Deep Interactive Segmentation of Medical Images: A Systematic Review and\n Taxonomy","summary":" Interactive segmentation is a crucial research area in medical image analysis\naiming to boost the efficiency of costly annotations by incorporating human\nfeedback. 
This feedback takes the form of clicks, scribbles, or masks and\nallows for iterative refinement of the model output so as to efficiently guide\nthe system towards the desired behavior. In recent years, deep learning-based\napproaches have propelled results to a new level causing a rapid growth in the\nfield with 121 methods proposed in the medical imaging domain alone. In this\nreview, we provide a structured overview of this emerging field featuring a\ncomprehensive taxonomy, a systematic review of existing methods, and an\nin-depth analysis of current practices. Based on these contributions, we\ndiscuss the challenges and opportunities in the field. For instance, we find\nthat there is a severe lack of comparison across methods which needs to be\ntackled by standardized baselines and benchmarks.\n","authors":["Zdravko Marinov","Paul F. Jäger","Jan Egger","Jens Kleesiek","Rainer Stiefelhagen"],"pdf_url":"https://arxiv.org/pdf/2311.13964v1.pdf","comment":"26 pages, 8 figures, 10 tables; Zdravko Marinov and Paul F. J\\\"ager\n and co-first authors; This work has been submitted to the IEEE for possible\n publication. Copyright may be transferred without notice, after which this\n version may no longer be accessible"},{"id":"http://arxiv.org/abs/2311.13963v1","updated":"2023-11-23T12:24:02Z","published":"2023-11-23T12:24:02Z","title":"Investigating the use of publicly available natural videos to learn\n Dynamic MR image reconstruction","summary":" Purpose: To develop and assess a deep learning (DL) pipeline to learn dynamic\nMR image reconstruction from publicly available natural videos (Inter4K).\n Materials and Methods: Learning was performed for a range of DL architectures\n(VarNet, 3D UNet, FastDVDNet) and corresponding sampling patterns (Cartesian,\nradial, spiral) either from true multi-coil cardiac MR data (N=692) or from\npseudo-MR data simulated from Inter4K natural videos (N=692). Real-time\nundersampled dynamic MR images were reconstructed using DL networks trained\nwith cardiac data and natural videos, and compressed sensing (CS). Differences\nwere assessed in simulations (N=104 datasets) in terms of MSE, PSNR, and SSIM\nand prospectively for cardiac (short axis, four chambers, N=20) and speech\n(N=10) data in terms of subjective image quality ranking, SNR and Edge\nsharpness. Friedman Chi Square tests with post-hoc Nemenyi analysis were\nperformed to assess statistical significance.\n Results: For all simulation metrics, DL networks trained with cardiac data\noutperformed DL networks trained with natural videos, which outperformed CS\n(p<0.05). However, in prospective experiments DL reconstructions using both\ntraining datasets were ranked similarly (and higher than CS) and presented no\nstatistical differences in SNR and Edge Sharpness for most conditions.\nAdditionally, high SSIM was measured between the DL methods with cardiac data\nand natural videos (SSIM>0.85).\n Conclusion: The developed pipeline enabled learning dynamic MR reconstruction\nfrom natural videos preserving DL reconstruction advantages such as high\nquality fast and ultra-fast reconstructions while overcoming some limitations\n(data scarcity or sharing). 
The natural video dataset, code and pre-trained\nnetworks are made readily available on github.\n Key Words: real-time; dynamic MRI; deep learning; image reconstruction;\nmachine learning;\n","authors":["Olivier Jaubert","Michele Pascale","Javier Montalt-Tordera","Julius Akesson","Ruta Virsinskaite","Daniel Knight","Simon Arridge","Jennifer Steeden","Vivek Muthurangu"],"pdf_url":"https://arxiv.org/pdf/2311.13963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13959v1","updated":"2023-11-23T12:17:45Z","published":"2023-11-23T12:17:45Z","title":"RankFeat\\&RankWeight: Rank-1 Feature/Weight Removal for\n Out-of-distribution Detection","summary":" The task of out-of-distribution (OOD) detection is crucial for deploying\nmachine learning models in real-world settings. In this paper, we observe that\nthe singular value distributions of the in-distribution (ID) and OOD features\nare quite different: the OOD feature matrix tends to have a larger dominant\nsingular value than the ID feature, and the class predictions of OOD samples\nare largely determined by it. This observation motivates us to propose\n\\texttt{RankFeat}, a simple yet effective \\emph{post hoc} approach for OOD\ndetection by removing the rank-1 matrix composed of the largest singular value\nand the associated singular vectors from the high-level feature.\n\\texttt{RankFeat} achieves \\emph{state-of-the-art} performance and reduces the\naverage false positive rate (FPR95) by 17.90\\% compared with the previous best\nmethod. The success of \\texttt{RankFeat} motivates us to investigate whether a\nsimilar phenomenon would exist in the parameter matrices of neural networks. We\nthus propose \\texttt{RankWeight} which removes the rank-1 weight from the\nparameter matrices of a single deep layer. Our \\texttt{RankWeight}is also\n\\emph{post hoc} and only requires computing the rank-1 matrix once. As a\nstandalone approach, \\texttt{RankWeight} has very competitive performance\nagainst other methods across various backbones. Moreover, \\texttt{RankWeight}\nenjoys flexible compatibility with a wide range of OOD detection methods. The\ncombination of \\texttt{RankWeight} and \\texttt{RankFeat} refreshes the new\n\\emph{state-of-the-art} performance, achieving the FPR95 as low as 16.13\\% on\nthe ImageNet-1k benchmark. Extensive ablation studies and comprehensive\ntheoretical analyses are presented to support the empirical results.\n","authors":["Yue Song","Nicu Sebe","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13959v1.pdf","comment":"submitted to T-PAMI"},{"id":"http://arxiv.org/abs/2311.13958v1","updated":"2023-11-23T12:16:33Z","published":"2023-11-23T12:16:33Z","title":"High-Order Tensor Recovery with A Tensor $U_1$ Norm","summary":" Recently, numerous tensor SVD (t-SVD)-based tensor recovery methods have\nemerged, showing promise in processing visual data. However, these methods\noften suffer from performance degradation when confronted with high-order\ntensor data exhibiting non-smooth changes, commonly observed in real-world\nscenarios but ignored by the traditional t-SVD-based methods. Our objective in\nthis study is to provide an effective tensor recovery technique for handling\nnon-smooth changes in tensor data and efficiently explore the correlations of\nhigh-order tensor data across its various dimensions without introducing\nnumerous variables and weights. To this end, we introduce a new tensor\ndecomposition and a new tensor norm called the Tensor $U_1$ norm. 
We utilize\nthese novel techniques in solving the problem of high-order tensor completion\nproblem and provide theoretical guarantees for the exact recovery of the\nresulting tensor completion models. An optimization algorithm is proposed to\nsolve the resulting tensor completion model iteratively by combining the\nproximal algorithm with the Alternating Direction Method of Multipliers.\nTheoretical analysis showed the convergence of the algorithm to the\nKarush-Kuhn-Tucker (KKT) point of the optimization problem. Numerical\nexperiments demonstrated the effectiveness of the proposed method in high-order\ntensor completion, especially for tensor data with non-smooth changes.\n","authors":["Jingjing Zheng","Wenzhe Wang","Xiaoqin Zhang","Yankai Cao","Xianta Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.13958v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13954v1","updated":"2023-11-23T12:09:49Z","published":"2023-11-23T12:09:49Z","title":"Electric Network Frequency Optical Sensing Devices","summary":" Electric Network Frequency (ENF) acts as a fingerprint in multimedia\nforensics applications. In indoor environments, ENF variations affect the\nintensity of light sources connected to power mains. Accordingly, the light\nintensity variations captured by sensing devices can be exploited to estimate\nthe ENF. A first optical sensing device based on a photodiode is developed for\ncapturing ENF variations in indoor lighting environments. In addition, a device\nthat captures the ENF directly from power mains is implemented. This device\nserves as a ground truth ENF collector. Video recordings captured by a camera\nare also employed to estimate the ENF. The camera serves as a second optical\nsensor. The factors affecting the ENF estimation are thoroughly studied. The\nmaximum correlation coefficient between the ENF estimated by the two optical\nsensors and that estimated directly from power mains is used to measure the\nestimation accuracy. The paper's major contribution is in the disclosure of\nextensive experimental evidence on ENF estimation in scenes ranging from static\nones capturing a white wall to non-static ones, including human activity.\n","authors":["Christos Moysiadis","Georgios Karantaidis","Constantine Kotropoulos"],"pdf_url":"https://arxiv.org/pdf/2311.13954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13231v2","updated":"2023-11-23T11:56:46Z","published":"2023-11-22T08:42:46Z","title":"Using Human Feedback to Fine-tune Diffusion Models without Any Reward\n Model","summary":" Using reinforcement learning with human feedback (RLHF) has shown significant\npromise in fine-tuning diffusion models. Previous methods start by training a\nreward model that aligns with human preferences, then leverage RL techniques to\nfine-tune the underlying models. However, crafting an efficient reward model\ndemands extensive datasets, optimal architecture, and manual hyperparameter\ntuning, making the process both time and cost-intensive. The direct preference\noptimization (DPO) method, effective in fine-tuning large language models,\neliminates the necessity for a reward model. However, the extensive GPU memory\nrequirement of the diffusion model's denoising process hinders the direct\napplication of the DPO method. To address this issue, we introduce the Direct\nPreference for Denoising Diffusion Policy Optimization (D3PO) method to\ndirectly fine-tune diffusion models. 
The theoretical analysis demonstrates that\nalthough D3PO omits training a reward model, it effectively functions as the\noptimal reward model trained using human feedback data to guide the learning\nprocess. This approach requires no training of a reward model, proving to be\nmore direct, cost-effective, and minimizing computational overhead. In\nexperiments, our method uses the relative scale of objectives as a proxy for\nhuman preference, delivering comparable results to methods using ground-truth\nrewards. Moreover, D3PO demonstrates the ability to reduce image distortion\nrates and generate safer images, overcoming challenges lacking robust reward\nmodels. Our code is publicly available in\nhttps://github.com/yk7333/D3PO/tree/main.\n","authors":["Kai Yang","Jian Tao","Jiafei Lyu","Chunjiang Ge","Jiaxin Chen","Qimai Li","Weihan Shen","Xiaolong Zhu","Xiu Li"],"pdf_url":"https://arxiv.org/pdf/2311.13231v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17389v3","updated":"2023-11-23T11:51:00Z","published":"2023-09-29T16:50:38Z","title":"Prompt-based test-time real image dehazing: a novel pipeline","summary":" Existing methods attempt to improve models' generalization ability on\nreal-world hazy images by exploring well-designed training schemes (e.g.,\nCycleGAN, prior loss). However, most of them need very complicated training\nprocedures to achieve satisfactory results. In this work, we present a totally\nnovel testing pipeline called Prompt-based Test-Time Dehazing (PTTD) to help\ngenerate visually pleasing results of real-captured hazy images during the\ninference phase. We experimentally find that given a dehazing model trained on\nsynthetic data, by fine-tuning the statistics (i.e., mean and standard\ndeviation) of encoding features, PTTD is able to narrow the domain gap,\nboosting the performance of real image dehazing. Accordingly, we first apply a\nprompt generation module (PGM) to generate a visual prompt, which is the source\nof appropriate statistical perturbations for mean and standard deviation. And\nthen, we employ the feature adaptation module (FAM) into the existing dehazing\nmodels for adjusting the original statistics with the guidance of the generated\nprompt. Note that, PTTD is model-agnostic and can be equipped with various\nstate-of-the-art dehazing models trained on synthetic hazy-clean pairs.\nExtensive experimental results demonstrate that our PTTD is flexible meanwhile\nachieves superior performance against state-of-the-art dehazing methods in\nreal-world scenarios. The source code of our PTTD will be made available at\nhttps://github.com/cecret3350/PTTD-Dehazing.\n","authors":["Zixuan Chen","Zewei He","Ziqian Lu","Xuecheng Sun","Zhe-Ming Lu"],"pdf_url":"https://arxiv.org/pdf/2309.17389v3.pdf","comment":"update github link (https://github.com/cecret3350/PTTD-Dehazing)"},{"id":"http://arxiv.org/abs/2304.05390v2","updated":"2023-11-23T11:45:02Z","published":"2023-04-11T17:59:13Z","title":"HRS-Bench: Holistic, Reliable and Scalable Benchmark for Text-to-Image\n Models","summary":" In recent years, Text-to-Image (T2I) models have been extensively studied,\nespecially with the emergence of diffusion models that achieve state-of-the-art\nresults on T2I synthesis tasks. However, existing benchmarks heavily rely on\nsubjective human evaluation, limiting their ability to holistically assess the\nmodel's capabilities. Furthermore, there is a significant gap between efforts\nin developing new T2I architectures and those in evaluation. 
To address this,\nwe introduce HRS-Bench, a concrete evaluation benchmark for T2I models that is\nHolistic, Reliable, and Scalable. Unlike existing benchmarks that focus on\nlimited aspects, HRS-Bench measures 13 skills that can be categorized into five\nmajor categories: accuracy, robustness, generalization, fairness, and bias. In\naddition, HRS-Bench covers 50 scenarios, including fashion, animals,\ntransportation, food, and clothes. We evaluate nine recent large-scale T2I\nmodels using metrics that cover a wide range of skills. A human evaluation,\nwhich aligned with 95% of our evaluations on average, was conducted to probe the\neffectiveness of HRS-Bench. Our experiments demonstrate that existing models\noften struggle to generate images with the desired count of objects, visual\ntext, or grounded emotions. We hope that our benchmark helps ease future\ntext-to-image generation research. The code and data are available at\nhttps://eslambakr.github.io/hrsbench.github.io\n","authors":["Eslam Mohamed Bakr","Pengzhan Sun","Xiaoqian Shen","Faizan Farooq Khan","Li Erran Li","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2304.05390v2.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2311.13384v2","updated":"2023-11-23T11:40:16Z","published":"2023-11-22T13:27:34Z","title":"LucidDreamer: Domain-free Generation of 3D Gaussian Splatting Scenes","summary":" With the widespread usage of VR devices and contents, the demand for 3D scene\ngeneration techniques is growing. Existing 3D scene generation models,\nhowever, limit the target scene to a specific domain, primarily due to their\ntraining strategies using 3D scan datasets that are far from the real world. To\naddress this limitation, we propose LucidDreamer, a domain-free scene\ngeneration pipeline that fully leverages the power of an existing large-scale\ndiffusion-based generative model. Our LucidDreamer has two alternate steps:\nDreaming and Alignment. First, to generate multi-view consistent images from\ninputs, we set the point cloud as a geometrical guideline for each image\ngeneration. Specifically, we project a portion of the point cloud to the desired\nview and provide the projection as guidance for inpainting using the\ngenerative model. The inpainted images are lifted to 3D space with estimated\ndepth maps, composing new points. Second, to aggregate the new points into\nthe 3D scene, we propose an aligning algorithm which harmoniously integrates\nthe portions of newly generated 3D scenes. The finally obtained 3D scene serves\nas the initial points for optimizing Gaussian splats. LucidDreamer produces\nGaussian splats that are highly detailed compared to previous 3D scene\ngeneration methods, with no constraint on the domain of the target scene. Project\npage: https://luciddreamer-cvlab.github.io/\n","authors":["Jaeyoung Chung","Suyoung Lee","Hyeongjin Nam","Jaerin Lee","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2311.13384v2.pdf","comment":"Project page: https://luciddreamer-cvlab.github.io/"},{"id":"http://arxiv.org/abs/2311.13934v1","updated":"2023-11-23T11:34:48Z","published":"2023-11-23T11:34:48Z","title":"Robustness-Reinforced Knowledge Distillation with Correlation Distance\n and Network Pruning","summary":" The improvement in the performance of efficient and lightweight models (i.e.,\nthe student model) is achieved through knowledge distillation (KD), which\ninvolves transferring knowledge from more complex models (i.e., the teacher\nmodel). 
However, most existing KD techniques rely on Kullback-Leibler (KL)\ndivergence, which has certain limitations. First, if the teacher distribution\nhas high entropy, the KL divergence's mode-averaging nature hinders the\ntransfer of sufficient target information. Second, when the teacher\ndistribution has low entropy, the KL divergence tends to excessively focus on\nspecific modes, which fails to convey an abundant amount of valuable knowledge\nto the student. Consequently, when dealing with datasets that contain numerous\nconfounding or challenging samples, student models may struggle to acquire\nsufficient knowledge, resulting in subpar performance. Furthermore, in previous\nKD approaches, we observed that data augmentation, a technique aimed at\nenhancing a model's generalization, can have an adverse impact. Therefore, we\npropose a Robustness-Reinforced Knowledge Distillation (R2KD) that leverages\ncorrelation distance and network pruning. This approach enables KD to\neffectively incorporate data augmentation for performance improvement.\nExtensive experiments on various datasets, including CIFAR-100, FGVR,\nTinyImagenet, and ImageNet, demonstrate our method's superiority over current\nstate-of-the-art methods.\n","authors":["Seonghak Kim","Gyeongdo Ham","Yucheol Cho","Daeshik Kim"],"pdf_url":"https://arxiv.org/pdf/2311.13934v1.pdf","comment":"11 pages, 7 figures. This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2311.13930v1","updated":"2023-11-23T11:30:54Z","published":"2023-11-23T11:30:54Z","title":"Periodically Exchange Teacher-Student for Source-Free Object Detection","summary":" Source-free object detection (SFOD) aims to adapt the source detector to\nunlabeled target domain data in the absence of source domain data. Most SFOD\nmethods follow the same self-training paradigm using mean-teacher (MT)\nframework where the student model is guided by only one single teacher model.\nHowever, such paradigm can easily fall into a training instability problem that\nwhen the teacher model collapses uncontrollably due to the domain shift, the\nstudent model also suffers drastic performance degradation. To address this\nissue, we propose the Periodically Exchange Teacher-Student (PETS) method, a\nsimple yet novel approach that introduces a multiple-teacher framework\nconsisting of a static teacher, a dynamic teacher, and a student model. During\nthe training phase, we periodically exchange the weights between the static\nteacher and the student model. Then, we update the dynamic teacher using the\nmoving average of the student model that has already been exchanged by the\nstatic teacher. In this way, the dynamic teacher can integrate knowledge from\npast periods, effectively reducing error accumulation and enabling a more\nstable training process within the MT-based framework. Further, we develop a\nconsensus mechanism to merge the predictions of two teacher models to provide\nhigher-quality pseudo labels for student model. 
Extensive experiments on\nmultiple SFOD benchmarks show that the proposed method achieves\nstate-of-the-art performance compared with other related methods, demonstrating\nthe effectiveness and superiority of our method on SFOD task.\n","authors":["Qipeng Liu","Luojun Lin","Zhifeng Shen","Zhifeng Yang"],"pdf_url":"https://arxiv.org/pdf/2311.13930v1.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2311.13929v1","updated":"2023-11-23T11:30:02Z","published":"2023-11-23T11:30:02Z","title":"MetaFBP: Learning to Learn High-Order Predictor for Personalized Facial\n Beauty Prediction","summary":" Predicting individual aesthetic preferences holds significant practical\napplications and academic implications for human society. However, existing\nstudies mainly focus on learning and predicting the commonality of facial\nattractiveness, with little attention given to Personalized Facial Beauty\nPrediction (PFBP). PFBP aims to develop a machine that can adapt to individual\naesthetic preferences with only a few images rated by each user. In this paper,\nwe formulate this task from a meta-learning perspective that each user\ncorresponds to a meta-task. To address such PFBP task, we draw inspiration from\nthe human aesthetic mechanism that visual aesthetics in society follows a\nGaussian distribution, which motivates us to disentangle user preferences into\na commonality and an individuality part. To this end, we propose a novel\nMetaFBP framework, in which we devise a universal feature extractor to capture\nthe aesthetic commonality and then optimize to adapt the aesthetic\nindividuality by shifting the decision boundary of the predictor via a\nmeta-learning mechanism. Unlike conventional meta-learning methods that may\nstruggle with slow adaptation or overfitting to tiny support sets, we propose a\nnovel approach that optimizes a high-order predictor for fast adaptation. In\norder to validate the performance of the proposed method, we build several PFBP\nbenchmarks by using existing facial beauty prediction datasets rated by\nnumerous users. Extensive experiments on these benchmarks demonstrate the\neffectiveness of the proposed MetaFBP method.\n","authors":["Luojun Lin","Zhifeng Shen","Jia-Li Yin","Qipeng Liu","Yuanlong Yu","Weijie Chen"],"pdf_url":"https://arxiv.org/pdf/2311.13929v1.pdf","comment":"Accepted by ACM MM 2023. Source code:\n https://github.com/MetaVisionLab/MetaFBP"},{"id":"http://arxiv.org/abs/2311.13928v1","updated":"2023-11-23T11:29:16Z","published":"2023-11-23T11:29:16Z","title":"Parameter Exchange for Robust Dynamic Domain Generalization","summary":" Agnostic domain shift is the main reason of model degradation on the unknown\ntarget domains, which brings an urgent need to develop Domain Generalization\n(DG). Recent advances at DG use dynamic networks to achieve training-free\nadaptation on the unknown target domains, termed Dynamic Domain Generalization\n(DDG), which compensates for the lack of self-adaptability in static models\nwith fixed weights. The parameters of dynamic networks can be decoupled into a\nstatic and a dynamic component, which are designed to learn domain-invariant\nand domain-specific features, respectively. Based on the existing arts, in this\nwork, we try to push the limits of DDG by disentangling the static and dynamic\ncomponents more thoroughly from an optimization perspective. Our main\nconsideration is that we can enable the static component to learn\ndomain-invariant features more comprehensively by augmenting the\ndomain-specific information. 
As a result, the more comprehensive\ndomain-invariant features learned by the static component can then force the\ndynamic component to focus more on learning adaptive domain-specific features.\nTo this end, we propose a simple yet effective Parameter Exchange (PE) method\nto perturb the combination between the static and dynamic components. We\noptimize the model using the gradients from both the perturbed and\nnon-perturbed feed-forward jointly to implicitly achieve the aforementioned\ndisentanglement. In this way, the two components can be optimized in a\nmutually-beneficial manner, which can resist the agnostic domain shifts and\nimprove the self-adaptability on the unknown target domain. Extensive\nexperiments show that PE can be easily plugged into existing dynamic networks\nto improve their generalization ability without bells and whistles.\n","authors":["Luojun Lin","Zhifeng Shen","Zhishu Sun","Yuanlong Yu","Lei Zhang","Weijie Chen"],"pdf_url":"https://arxiv.org/pdf/2311.13928v1.pdf","comment":"Accepted by ACM MM 2023. Source code:\n https://github.com/MetaVisionLab/PE"},{"id":"http://arxiv.org/abs/2311.13925v1","updated":"2023-11-23T11:21:40Z","published":"2023-11-23T11:21:40Z","title":"Predicting Recovery or Decease of COVID-19 Patients with Clinical and\n RT-PCR Using Machine Learning Classification Algorithms","summary":" The COVID-19 pandemic has disrupted the global economy and people's daily\nlives in unprecedented ways. To make appropriate decisions, it is necessary to\ndiagnose COVID-19 rapidly and accurately. Clinical decision making is\ninfluenced by data collected from patients. With the aid of artificial\nintelligence, COVID-19 has been diagnosed quickly by analyzing symptoms,\npolymerase chain reaction (PCR), computed tomography scans, chest X-rays,\nroutine laboratory blood tests and even cough sounds. Furthermore, these data\ncan be used to predict a patient's mortality, although there is a question about\nwhich data makes the most accurate predictions. Therefore, this study consists\nof two parts. Our first objective is to examine whether machine learning\nalgorithms can predict the outcome of COVID-19 cases (recovery or death), based\non the features present in the dataset. In the second part of the research, we\ninvestigated the impact of clinical and RT-PCR data on the prediction of recovery\nand decease to determine which one is more reliable. We defined four stages with\ndifferent feature sets and used six machine learning methods to build prediction\nmodels. With an accuracy of 78.7%, random forest showed promising results for\npredicting death and recovery of patients. Based on this, it appears that\nrecovery and decease of patients are predictable using machine learning. For\nthe second objective, results indicate that clinical data alone (without using\nRT-PCR), trained with the AdaBoost algorithm, is the most accurate, with an accuracy of\n82.1%. This study can provide guidance for medical professionals in the event\nof a crisis or outbreak similar to COVID-19.\n","authors":["Mohammad Dehghani","Zahra Yazdanparast"],"pdf_url":"https://arxiv.org/pdf/2311.13925v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06214v2","updated":"2023-11-23T11:04:39Z","published":"2023-10-10T00:07:25Z","title":"CoT3DRef: Chain-of-Thoughts Data-Efficient 3D Visual Grounding","summary":" 3D visual grounding is the ability to localize objects in 3D scenes\nconditioned by utterances. 
Most existing methods devote the referring head to\nlocalize the referred object directly, causing failure in complex scenarios. In\naddition, it does not illustrate how and why the network reaches the final\ndecision. In this paper, we address this question Can we design an\ninterpretable 3D visual grounding framework that has the potential to mimic the\nhuman perception system?. To this end, we formulate the 3D visual grounding\nproblem as a sequence-to-sequence task by first predicting a chain of anchors\nand then the final target. Interpretability not only improves the overall\nperformance but also helps us identify failure cases. Following the chain of\nthoughts approach enables us to decompose the referring task into interpretable\nintermediate steps, boosting the performance and making our framework extremely\ndata-efficient. Moreover, our proposed framework can be easily integrated into\nany existing architecture. We validate our approach through comprehensive\nexperiments on the Nr3D, Sr3D, and Scanrefer benchmarks and show consistent\nperformance gains compared to existing methods without requiring manually\nannotated data. Furthermore, our proposed framework, dubbed CoT3DRef, is\nsignificantly data-efficient, whereas on the Sr3D dataset, when trained only on\n10% of the data, we match the SOTA performance that trained on the entire data.\n","authors":["Eslam Mohamed Bakr","Mohamed Ayman","Mahmoud Ahmed","Habib Slim","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2310.06214v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13912v1","updated":"2023-11-23T11:01:35Z","published":"2023-11-23T11:01:35Z","title":"Expanding the deep-learning model to diagnosis LVNC: Limitations and\n trade-offs","summary":" Hyper-trabeculation or non-compaction in the left ventricle of the myocardium\n(LVNC) is a recently classified form of cardiomyopathy. Several methods have\nbeen proposed to quantify the trabeculae accurately in the left ventricle, but\nthere is no general agreement in the medical community to use a particular\napproach. In previous work, we proposed DL-LVTQ, a deep learning approach for\nleft ventricular trabecular quantification based on a U-Net CNN architecture.\nDL-LVTQ was an automatic diagnosis tool developed from a dataset of patients\nwith the same cardiomyopathy (hypertrophic cardiomyopathy).\n In this work, we have extended and adapted DL-LVTQ to cope with patients with\ndifferent cardiomyopathies. The dataset consists of up 379 patients in three\ngroups with different particularities and cardiomyopathies. Patient images were\ntaken from different scanners and hospitals. We have modified and adapted the\nU-Net convolutional neural network to account for the different particularities\nof a heterogeneous group of patients with various unclassifiable or mixed and\ninherited cardiomyopathies.\n The inclusion of new groups of patients has increased the accuracy,\nspecificity and kappa values while maintaining the sensitivity of the automatic\ndeep learning method proposed. Therefore, a better-prepared diagnosis tool is\nready for various cardiomyopathies with different characteristics.\nCardiologists have considered that 98.9% of the evaluated outputs are verified\nclinically for diagnosis. Therefore, the high precision to segment the\ndifferent cardiac structures allows us to make a robust diagnostic system\nobjective and faster, decreasing human error and time spent.\n","authors":["Gregorio Bernabé","Pilar González-Férez","José M. 
García","Guillem Casas","Josefa González-Carrillo"],"pdf_url":"https://arxiv.org/pdf/2311.13912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13895v1","updated":"2023-11-23T10:26:36Z","published":"2023-11-23T10:26:36Z","title":"Query by Activity Video in the Wild","summary":" This paper focuses on activity retrieval from a video query in an imbalanced\nscenario. In current query-by-activity-video literature, a common assumption is\nthat all activities have sufficient labelled examples when learning an\nembedding. This assumption does however practically not hold, as only a portion\nof activities have many examples, while other activities are only described by\nfew examples. In this paper, we propose a visual-semantic embedding network\nthat explicitly deals with the imbalanced scenario for activity retrieval. Our\nnetwork contains two novel modules. The visual alignment module performs a\nglobal alignment between the input video and fixed-sized visual bank\nrepresentations for all activities. The semantic module performs an alignment\nbetween the input video and fixed-sized semantic activity representations. By\nmatching videos with both visual and semantic activity representations that are\nof equal size over all activities, we no longer ignore infrequent activities\nduring retrieval. Experiments on a new imbalanced activity retrieval benchmark\nshow the effectiveness of our approach for all types of activities.\n","authors":["Tao Hu","William Thong","Pascal Mettes","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2311.13895v1.pdf","comment":"An extended version of ICIP 2023"},{"id":"http://arxiv.org/abs/2204.11291v2","updated":"2023-11-23T10:16:54Z","published":"2022-04-24T14:39:47Z","title":"Large Scale Time-Series Representation Learning via Simultaneous Low and\n High Frequency Feature Bootstrapping","summary":" Learning representation from unlabeled time series data is a challenging\nproblem. Most existing self-supervised and unsupervised approaches in the\ntime-series domain do not capture low and high-frequency features at the same\ntime. Further, some of these methods employ large scale models like\ntransformers or rely on computationally expensive techniques such as\ncontrastive learning. To tackle these problems, we propose a non-contrastive\nself-supervised learning approach efficiently captures low and high-frequency\ntime-varying features in a cost-effective manner. Our method takes raw time\nseries data as input and creates two different augmented views for two branches\nof the model, by randomly sampling the augmentations from same family.\nFollowing the terminology of BYOL, the two branches are called online and\ntarget network which allows bootstrapping of the latent representation. In\ncontrast to BYOL, where a backbone encoder is followed by multilayer perceptron\n(MLP) heads, the proposed model contains additional temporal convolutional\nnetwork (TCN) heads. As the augmented views are passed through large kernel\nconvolution blocks of the encoder, the subsequent combination of MLP and TCN\nenables an effective representation of low as well as high-frequency\ntime-varying features due to the varying receptive fields. The two modules (MLP\nand TCN) act in a complementary manner. We train an online network where each\nmodule learns to predict the outcome of the respective module of target network\nbranch. To demonstrate the robustness of our model we performed extensive\nexperiments and ablation studies on five real-world time-series datasets. 
Our\nmethod achieved state-of-art performance on all five real-world datasets.\n","authors":["Vandan Gorade","Azad Singh","Deepak Mishra"],"pdf_url":"https://arxiv.org/pdf/2204.11291v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13880v1","updated":"2023-11-23T10:05:31Z","published":"2023-11-23T10:05:31Z","title":"PointPCA+: Extending PointPCA objective quality assessment metric","summary":" A computationally-simplified and descriptor-richer Point Cloud Quality\nAssessment (PCQA) metric, namely PointPCA+, is proposed in this paper, which is\nan extension of PointPCA. PointPCA proposed a set of perceptually-relevant\ndescriptors based on PCA decomposition that were applied to both the geometry\nand texture data of point clouds for full reference PCQA. PointPCA+ employs PCA\nonly on the geometry data while enriching existing geometry and texture\ndescriptors, that are computed more efficiently. Similarly to PointPCA, a total\nquality score is obtained through a learning-based fusion of individual\npredictions from geometry and texture descriptors that capture local shape and\nappearance properties, respectively. Before feature fusion, a feature selection\nmodule is introduced to choose the most effective features from a proposed\nsuper-set. Experimental results show that PointPCA+ achieves high predictive\nperformance against subjective ground truth scores obtained from publicly\navailable datasets. The code is available at\n\\url{https://github.com/cwi-dis/pointpca_suite/}.\n","authors":["Xuemei Zhou","Evangelos Alexiou","Irene Viola","Pablo Cesar"],"pdf_url":"https://arxiv.org/pdf/2311.13880v1.pdf","comment":"ICIP 2023"},{"id":"http://arxiv.org/abs/2311.12553v2","updated":"2023-11-23T09:10:46Z","published":"2023-11-21T12:05:56Z","title":"\"HoVer-UNet\": Accelerating HoVerNet with UNet-based multi-class nuclei\n segmentation via knowledge distillation","summary":" We present \"HoVer-UNet\", an approach to distill the knowledge of the\nmulti-branch HoVerNet framework for nuclei instance segmentation and\nclassification in histopathology. We propose a compact, streamlined single UNet\nnetwork with a Mix Vision Transformer backbone, and equip it with a custom loss\nfunction to optimally encode the distilled knowledge of HoVerNet, reducing\ncomputational requirements without compromising performances. We show that our\nmodel achieved results comparable to HoVerNet on the public PanNuke and Consep\ndatasets with a three-fold reduction in inference time. We make the code of our\nmodel publicly available at https://github.com/DIAGNijmegen/HoVer-UNet.\n","authors":["Cristian Tommasino","Cristiano Russo","Antonio Maria Rinaldi","Francesco Ciompi"],"pdf_url":"https://arxiv.org/pdf/2311.12553v2.pdf","comment":"4 pages, 2 figures, submitted to ISBI 2024"},{"id":"http://arxiv.org/abs/2311.13865v1","updated":"2023-11-23T09:08:49Z","published":"2023-11-23T09:08:49Z","title":"Language-guided Few-shot Semantic Segmentation","summary":" Few-shot learning is a promising way for reducing the label cost in new\ncategories adaptation with the guidance of a small, well labeled support set.\nBut for few-shot semantic segmentation, the pixel-level annotations of support\nimages are still expensive. In this paper, we propose an innovative solution to\ntackle the challenge of few-shot semantic segmentation using only language\ninformation, i.e.image-level text labels. 
Our approach involves a\nvision-language-driven mask distillation scheme, which contains a\nvision-language pretraining (VLP) model and a mask refiner, to generate high\nquality pseudo-semantic masks from text prompts. We additionally introduce a\ndistributed prototype supervision method and complementary correlation matching\nmodule to guide the model in digging precise semantic relations among support\nand query images. The experiments on two benchmark datasets demonstrate that\nour method establishes a new baseline for language-guided few-shot semantic\nsegmentation and achieves competitive results to recent vision-guided methods.\n","authors":["Jing Wang","Yuang Liu","Qiang Zhou","Fan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13865v1.pdf","comment":"Expanded version for a pending ICASSP2024 submission"},{"id":"http://arxiv.org/abs/2311.06214v2","updated":"2023-11-23T08:55:49Z","published":"2023-11-10T18:03:44Z","title":"Instant3D: Fast Text-to-3D with Sparse-View Generation and Large\n Reconstruction Model","summary":" Text-to-3D with diffusion models has achieved remarkable progress in recent\nyears. However, existing methods either rely on score distillation-based\noptimization which suffer from slow inference, low diversity and Janus\nproblems, or are feed-forward methods that generate low-quality results due to\nthe scarcity of 3D training data. In this paper, we propose Instant3D, a novel\nmethod that generates high-quality and diverse 3D assets from text prompts in a\nfeed-forward manner. We adopt a two-stage paradigm, which first generates a\nsparse set of four structured and consistent views from text in one shot with a\nfine-tuned 2D text-to-image diffusion model, and then directly regresses the\nNeRF from the generated images with a novel transformer-based sparse-view\nreconstructor. Through extensive experiments, we demonstrate that our method\ncan generate diverse 3D assets of high visual quality within 20 seconds, which\nis two orders of magnitude faster than previous optimization-based methods that\ncan take 1 to 10 hours. Our project webpage: https://jiahao.ai/instant3d/.\n","authors":["Jiahao Li","Hao Tan","Kai Zhang","Zexiang Xu","Fujun Luan","Yinghao Xu","Yicong Hong","Kalyan Sunkavalli","Greg Shakhnarovich","Sai Bi"],"pdf_url":"https://arxiv.org/pdf/2311.06214v2.pdf","comment":"Project webpage: https://jiahao.ai/instant3d/"},{"id":"http://arxiv.org/abs/2303.01538v2","updated":"2023-11-23T08:50:37Z","published":"2023-03-02T19:05:46Z","title":"Feature Perturbation Augmentation for Reliable Evaluation of Importance\n Estimators in Neural Networks","summary":" Post-hoc explanation methods attempt to make the inner workings of deep\nneural networks more interpretable. However, since a ground truth is in general\nlacking, local post-hoc interpretability methods, which assign importance\nscores to input features, are challenging to evaluate. One of the most popular\nevaluation frameworks is to perturb features deemed important by an\ninterpretability method and to measure the change in prediction accuracy.\nIntuitively, a large decrease in prediction accuracy would indicate that the\nexplanation has correctly quantified the importance of features with respect to\nthe prediction outcome (e.g., logits). However, the change in the prediction\noutcome may stem from perturbation artifacts, since perturbed samples in the\ntest dataset are out of distribution (OOD) compared to the training dataset and\ncan therefore potentially disturb the model in an unexpected manner. 
To\novercome this challenge, we propose feature perturbation augmentation (FPA),\nwhich creates and adds perturbed images during the model training. Through\nextensive computational experiments, we demonstrate that FPA makes deep neural\nnetworks (DNNs) more robust against perturbations. Furthermore, training DNNs\nwith FPA demonstrates that the sign of importance scores may explain the model\nmore meaningfully than has previously been assumed. Overall, FPA is an\nintuitive data augmentation technique that improves the evaluation of post-hoc\ninterpretability methods.\n","authors":["Lennart Brocki","Neo Christopher Chung"],"pdf_url":"https://arxiv.org/pdf/2303.01538v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13847v1","updated":"2023-11-23T08:31:11Z","published":"2023-11-23T08:31:11Z","title":"Perceptual Image Compression with Cooperative Cross-Modal Side\n Information","summary":" The explosion of data has resulted in more and more associated text being\ntransmitted along with images. Inspired by distributed source coding, many\nworks utilize image side information to enhance image compression. However,\nexisting methods generally do not consider using text as side information to\nenhance perceptual compression of images, even though the benefits of\nmultimodal synergy have been widely demonstrated in research. This raises the\nfollowing question: How can we effectively transfer text-level semantic\ndependencies to help image compression when the text is only available to the decoder?\nIn this work, we propose a novel deep image compression method with text-guided\nside information to achieve a better rate-perception-distortion tradeoff.\nSpecifically, we employ the CLIP text encoder and an effective Semantic-Spatial\nAware block to fuse the text and image features. This is done by predicting a\nsemantic mask to guide the learned text-adaptive affine transformation at the\npixel level. Furthermore, we design a text-conditional generative adversarial\nnetwork to improve the perceptual quality of reconstructed images. Extensive\nexperiments involving four datasets and ten image quality assessment metrics\ndemonstrate that the proposed approach achieves superior results in terms of\nrate-perception trade-off and semantic distortion.\n","authors":["Shiyu Qin","Bin Chen","Yujun Huang","Baoyi An","Tao Dai","Shu-Tao Via"],"pdf_url":"https://arxiv.org/pdf/2311.13847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13846v1","updated":"2023-11-23T08:29:32Z","published":"2023-11-23T08:29:32Z","title":"Progressive Learning with Visual Prompt Tuning for Variable-Rate Image\n Compression","summary":" In this paper, we propose a progressive learning paradigm for\ntransformer-based variable-rate image compression. Our approach covers a wide\nrange of compression rates with the assistance of the Layer-adaptive Prompt\nModule (LPM). Inspired by visual prompt tuning, we use LPM to extract prompts\nfor input images and hidden features at the encoder side and decoder side,\nrespectively, which are fed as additional information into the Swin Transformer\nlayer of a pre-trained transformer-based image compression model to affect the\nallocation of attention region and the bits, which in turn changes the target\ncompression ratio of the model. 
To ensure the network is more lightweight, we\nintegrate prompt networks with fewer convolutional layers.\nExhaustive experiments show that compared to methods based on multiple models,\nwhich are optimized separately for different target rates, the proposed method\nachieves the same performance with 80% savings in parameter storage and 90%\nsavings in datasets. Meanwhile, our model outperforms all current variable\nbitrate image methods in terms of rate-distortion performance and approaches\nthe state-of-the-art fixed bitrate image compression methods trained from\nscratch.\n","authors":["Shiyu Qin","Yimin Zhou","Jinpeng Wang","Bin Chen","Baoyi An","Tao Dai","Shu-Tao Xia"],"pdf_url":"https://arxiv.org/pdf/2311.13846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13120v2","updated":"2023-11-23T08:15:18Z","published":"2023-11-22T02:46:57Z","title":"Multi-modal In-Context Learning Makes an Ego-evolving Scene Text\n Recognizer","summary":" Scene text recognition (STR) in the wild frequently encounters challenges\nwhen coping with domain variations, font diversity, shape deformations, etc. A\nstraightforward solution is performing model fine-tuning tailored to a specific\nscenario, but it is computationally intensive and requires multiple model\ncopies for various scenarios. Recent studies indicate that large language\nmodels (LLMs) can learn from a few demonstration examples in a training-free\nmanner, termed \"In-Context Learning\" (ICL). Nevertheless, applying LLMs as a\ntext recognizer is unacceptably resource-consuming. Moreover, our pilot\nexperiments on LLMs show that ICL fails in STR, mainly attributed to the\ninsufficient incorporation of contextual information from diverse samples in\nthe training stage. To this end, we introduce E$^2$STR, an STR model trained\nwith context-rich scene text sequences, where the sequences are generated via\nour proposed in-context training strategy. E$^2$STR demonstrates that a\nregular-sized model is sufficient to achieve effective ICL capabilities in STR.\nExtensive experiments show that E$^2$STR exhibits remarkable training-free\nadaptation in various scenarios and outperforms even the fine-tuned\nstate-of-the-art approaches on public benchmarks.\n","authors":["Zhen Zhao","Jingqun Tang","Chunhui Lin","Binghong Wu","Hao Liu","Zhizhong Zhang","Xin Tan","Can Huang","Yuan Xie"],"pdf_url":"https://arxiv.org/pdf/2311.13120v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13833v1","updated":"2023-11-23T07:33:38Z","published":"2023-11-23T07:33:38Z","title":"Lego: Learning to Disentangle and Invert Concepts Beyond Object\n Appearance in Text-to-Image Diffusion Models","summary":" Diffusion models have revolutionized generative content creation, and\ntext-to-image (T2I) diffusion models in particular have increased the creative\nfreedom of users by allowing scene synthesis using natural language. T2I models\nexcel at synthesizing concepts such as nouns, appearances, and styles. To\nenable customized content creation based on a few example images of a concept,\nmethods such as Textual Inversion and DreamBooth invert the desired concept and\nenable synthesizing it in new scenes. However, inverting more general concepts\nthat go beyond object appearance and style (adjectives and verbs) through\nnatural language remains a challenge. Two key characteristics of these\nconcepts contribute to the limitations of current inversion methods. 
1)\nAdjectives and verbs are entangled with nouns (subject) and can hinder\nappearance-based inversion methods, where the subject appearance leaks into the\nconcept embedding and 2) describing such concepts often extends beyond single\nword embeddings (being frozen in ice, walking on a tightrope, etc.) that\ncurrent methods do not handle.\n In this study, we introduce Lego, a textual inversion method designed to\ninvert subject entangled concepts from a few example images. Lego disentangles\nconcepts from their associated subjects using a simple yet effective Subject\nSeparation step and employs a Context Loss that guides the inversion of\nsingle/multi-embedding concepts. In a thorough user study, Lego-generated\nconcepts were preferred over 70% of the time when compared to the baseline.\nAdditionally, visual question answering using a large language model suggested\nLego-generated concepts are better aligned with the text description of the\nconcept.\n","authors":["Saman Motamed","Danda Pani Paudel","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2311.13833v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12862v2","updated":"2023-11-23T07:26:55Z","published":"2023-09-22T13:37:10Z","title":"Associative Transformer Is A Sparse Representation Learner","summary":" Emerging from the monolithic pairwise attention mechanism in conventional\nTransformer models, there is a growing interest in leveraging sparse\ninteractions that align more closely with biological principles. Approaches\nincluding the Set Transformer and the Perceiver employ cross-attention\nconsolidated with a latent space that forms an attention bottleneck with\nlimited capacity. Building upon recent neuroscience studies of Global Workspace\nTheory and associative memory, we propose the Associative Transformer (AiT).\nAiT induces low-rank explicit memory that serves as both priors to guide\nbottleneck attention in the shared workspace and attractors within associative\nmemory of a Hopfield network. Through joint end-to-end training, these priors\nnaturally develop module specialization, each contributing a distinct inductive\nbias to form attention bottlenecks. A bottleneck can foster competition among\ninputs for writing information into the memory. We show that AiT is a sparse\nrepresentation learner, learning distinct priors through the bottlenecks that\nare complexity-invariant to input quantities and dimensions. AiT demonstrates\nits superiority over methods such as the Set Transformer, Vision Transformer,\nand Coordination in various vision tasks.\n","authors":["Yuwei Sun","Hideya Ochiai","Zhirong Wu","Stephen Lin","Ryota Kanai"],"pdf_url":"https://arxiv.org/pdf/2309.12862v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2311.02082v3","updated":"2023-11-23T21:30:39Z","published":"2023-10-20T19:36:03Z","title":"Semantic Modelling of Organizational Knowledge as a Basis for Enterprise\n Data Governance 4.0 -- Application to a Unified Clinical Data Model","summary":" Individuals and organizations cope with an always-growing amount of data,\nwhich is heterogeneous in its contents and formats. An adequate data management\nprocess yielding data quality and control over its lifecycle is a prerequisite\nto getting value out of this data and minimizing inherent risks related to\nmultiple usages. Common data governance frameworks rely on people, policies,\nand processes that fall short of the overwhelming complexity of data. 
Yet,\nharnessing this complexity is necessary to achieve high-quality standards. The\nlatter will condition any downstream data usage outcome, including generative\nartificial intelligence trained on this data. In this paper, we report our\nconcrete experience establishing a simple, cost-efficient framework that\nenables metadata-driven, agile and (semi-)automated data governance (i.e. Data\nGovernance 4.0). We explain how we implement and use this framework to\nintegrate 25 years of clinical study data at an enterprise scale in a fully\nproductive environment. The framework encompasses both methodologies and\ntechnologies leveraging semantic web principles. We built a knowledge graph\ndescribing avatars of data assets in their business context, including\ngovernance principles. Multiple ontologies articulated by an enterprise upper\nontology enable key governance actions such as FAIRification, lifecycle\nmanagement, definition of roles and responsibilities, lineage across\ntransformations and provenance from source systems. This metadata model is the\nkeystone to data governance 4.0: a semi-automatised data management process\nthat considers the business context in an agile manner to adapt governance\nconstraints to each use case and dynamically tune it based on business changes.\n","authors":["Miguel AP Oliveira","Stephane Manara","Bruno Molé","Thomas Muller","Aurélien Guillouche","Lysann Hesske","Bruce Jordan","Gilles Hubert","Chinmay Kulkarni","Pralipta Jagdev","Cedric R. Berger"],"pdf_url":"https://arxiv.org/pdf/2311.02082v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14084v1","updated":"2023-11-23T16:22:58Z","published":"2023-11-23T16:22:58Z","title":"AI-Generated Images Introduce Invisible Relevance Bias to Text-Image\n Retrieval","summary":" With the advancement of generation models, AI-generated content (AIGC) is\nbecoming more realistic, flooding the Internet. A recent study suggests that\nthis phenomenon has elevated the issue of source bias in text retrieval for web\nsearches. Specifically, neural retrieval models tend to rank generated texts\nhigher than human-written texts. In this paper, we extend the study of this\nbias to cross-modal retrieval. Firstly, we successfully construct a suitable\nbenchmark to explore the existence of the bias. Subsequent extensive\nexperiments on this benchmark reveal that AI-generated images introduce an\ninvisible relevance bias to text-image retrieval models. Specifically, our\nexperiments show that text-image retrieval models tend to rank the AI-generated\nimages higher than the real images, even though the AI-generated images do not\nexhibit more visually relevant features to the query than real images. This\ninvisible relevance bias is prevalent across retrieval models with varying\ntraining data and architectures. Furthermore, our subsequent exploration\nreveals that the inclusion of AI-generated images in the training data of the\nretrieval models exacerbates the invisible relevance bias. The above phenomenon\ntriggers a vicious cycle, which makes the invisible relevance bias become more\nand more serious. To elucidate the potential causes of invisible relevance and\naddress the aforementioned issues, we introduce an effective training method\naimed at alleviating the invisible relevance bias. 
Subsequently, we apply our\nproposed debiasing method to retroactively identify the causes of invisible\nrelevance, revealing that the AI-generated images induce the image encoder to\nembed additional information into their representation. This information\nexhibits a certain consistency across generated images with different semantics\nand can make the retriever estimate a higher relevance score.\n","authors":["Shicheng Xu","Danyang Hou","Liang Pang","Jingcheng Deng","Jun Xu","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2311.14084v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2303.04689v3","updated":"2023-11-23T16:00:51Z","published":"2023-03-07T17:22:38Z","title":"A Privacy Preserving System for Movie Recommendations Using Federated\n Learning","summary":" Recommender systems have become ubiquitous in the past years. They solve the\ntyranny of choice problem faced by many users, and are utilized by many online\nbusinesses to drive engagement and sales. Besides other criticisms, like\ncreating filter bubbles within social networks, recommender systems are often\nreproved for collecting considerable amounts of personal data. However, to\npersonalize recommendations, personal information is fundamentally required. A\nrecent distributed learning scheme called federated learning has made it\npossible to learn from personal user data without its central collection.\nConsequently, we present a recommender system for movie recommendations, which\nprovides privacy and thus trustworthiness on multiple levels: First and\nforemost, it is trained using federated learning and thus, by its very nature,\nprivacy-preserving, while still enabling users to benefit from global insights.\nFurthermore, a novel federated learning scheme, called FedQ, is employed, which\nnot only addresses the problem of non-i.i.d.-ness and small local datasets, but\nalso prevents input data reconstruction attacks by aggregating client updates\nearly. Finally, to reduce the communication overhead, compression is applied,\nwhich significantly compresses the exchanged neural network parametrizations to\na fraction of their original size. We conjecture that this may also improve\ndata privacy through its lossy quantization stage.\n","authors":["David Neumann","Andreas Lutz","Karsten Müller","Wojciech Samek"],"pdf_url":"https://arxiv.org/pdf/2303.04689v3.pdf","comment":"Accepted for publication in the ACM Transactions on Recommender\n Systems (TORS) Special Issue on Trustworthy Recommender Systems"},{"id":"http://arxiv.org/abs/2311.13921v1","updated":"2023-11-23T11:14:13Z","published":"2023-11-23T11:14:13Z","title":"Some Like It Small: Czech Semantic Embedding Models for Industry\n Applications","summary":" This article focuses on the development and evaluation of Small-sized Czech\nsentence embedding models. Small models are important components for real-time\nindustry applications in resource-constrained environments. Given the limited\navailability of labeled Czech data, alternative approaches, including\npre-training, knowledge distillation, and unsupervised contrastive fine-tuning,\nare investigated. Comprehensive intrinsic and extrinsic analyses are conducted,\nshowcasing the competitive performance of our models compared to significantly\nlarger counterparts, with approximately 8 times smaller size and 5 times faster\nspeed than conventional Base-sized models. To promote cooperation and\nreproducibility, both the models and the evaluation pipeline are made publicly\naccessible. 
Ultimately, this article presents practical applications of the\ndeveloped sentence embedding models in Seznam.cz, the Czech search engine.\nThese models have effectively replaced previous counterparts, enhancing the\noverall search experience for instance, in organic search, featured snippets,\nand image search. This transition has yielded improved performance.\n","authors":["Jiří Bednář","Jakub Náplava","Petra Barančíková","Ondřej Lisický"],"pdf_url":"https://arxiv.org/pdf/2311.13921v1.pdf","comment":"Accepted at the Thirty-Sixth Annual Conference on Innovative\n Applications of Artificial Intelligence (IAAI-24). IAAI Innovative\n Application Award. 9 pages"},{"id":"http://arxiv.org/abs/2311.04760v2","updated":"2023-11-23T09:12:45Z","published":"2023-11-08T15:33:06Z","title":"Towards Open-world Cross-Domain Sequential Recommendation: A\n Model-Agnostic Contrastive Denoising Approach","summary":" Cross-domain sequential recommendation (CDSR) aims to address the data\nsparsity problems that exist in traditional sequential recommendation (SR)\nsystems.\n The existing approaches aim to design a specific cross-domain unit that can\ntransfer and propagate information across multiple domains by relying on\noverlapping users with abundant behaviors. However, in real-world recommender\nsystems, CDSR scenarios usually consist of a majority of long-tailed users with\nsparse behaviors and cold-start users who only exist in one domain. This leads\nto a drop in the performance of existing CDSR methods in the real-world\nindustry platform. Therefore, improving the consistency and effectiveness of\nmodels in open-world CDSR scenarios is crucial for constructing CDSR models\n(\\textit{1st} CH). Recently, some SR approaches have utilized auxiliary\nbehaviors to complement the information for long-tailed users. However, these\nmulti-behavior SR methods cannot deliver promising performance in CDSR, as they\noverlook the semantic gap between target and auxiliary behaviors, as well as\nuser interest deviation across domains (\\textit{2nd} CH).\n","authors":["Wujiang Xu","Xuying Ning","Wenfang Lin","Mingming Ha","Qiongxu Ma","Qianqiao Liang","Xuewen Tao","Linxun Chen","Bing Han","Minnan Luo"],"pdf_url":"https://arxiv.org/pdf/2311.04760v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.05238v3","updated":"2023-11-23T05:25:59Z","published":"2023-09-11T05:12:14Z","title":"Generating Natural Language Queries for More Effective Systematic Review\n Screening Prioritisation","summary":" Screening prioritisation in medical systematic reviews aims to rank the set\nof documents retrieved by complex Boolean queries. Prioritising the most\nimportant documents ensures that subsequent review steps can be carried out\nmore efficiently and effectively. The current state of the art uses the final\ntitle of the review as a query to rank the documents using BERT-based neural\nrankers. However, the final title is only formulated at the end of the review\nprocess, which makes this approach impractical as it relies on ex post facto\ninformation. At the time of screening, only a rough working title is available,\nwith which the BERT-based ranker performs significantly worse than with the\nfinal title. In this paper, we explore alternative sources of queries for\nprioritising screening, such as the Boolean query used to retrieve the\ndocuments to be screened and queries generated by instruction-based generative\nlarge-scale language models such as ChatGPT and Alpaca. 
Our best approach is\nnot only viable based on the information available at the time of screening,\nbut also has similar effectiveness to the final title.\n","authors":["Shuai Wang","Harrisen Scells","Martin Potthast","Bevan Koopman","Guido Zuccon"],"pdf_url":"https://arxiv.org/pdf/2309.05238v3.pdf","comment":"Preprints for Accepted paper in SIGIR-AP-2023, note that this is\n updated from ACM published paper. The working title was wrong in the\n ACM-published version due to a bug in data preprocessing; however, this does\n not have any influence on the final conclusion/observation made from the\n paper"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2311.14227v1","updated":"2023-11-23T23:40:01Z","published":"2023-11-23T23:40:01Z","title":"Robust and Interpretable COVID-19 Diagnosis on Chest X-ray Images using\n Adversarial Training","summary":" The novel 2019 Coronavirus disease (COVID-19) global pandemic is a defining\nhealth crisis. Recent efforts have been increasingly directed towards achieving\nquick and accurate detection of COVID-19 across symptomatic patients to\nmitigate the intensity and spread of the disease. Artificial intelligence (AI)\nalgorithms applied to chest X-ray (CXR) images have emerged as promising\ndiagnostic tools, and previous work has demonstrated impressive classification\nperformances. However, such methods have faced criticisms from physicians due\nto their black-box reasoning process and unpredictable nature. In contrast to\nprofessional radiologist diagnosis, AI systems often lack generalizability,\nexplainability, and robustness in the clinical decision making process. In our\nwork, we address these issues by first proposing an extensive baseline study,\ntraining and evaluating 21 convolutional neural network (CNN) models on a\ndiverse set of 33,000+ CXR images to classify between healthy, COVID-19, and\nnon-COVID-19 pneumonia CXRs. Our resulting models achieved a 3-way\nclassification accuracy, recall, and precision of up to 97.03\\%, 97.97\\%, and\n99.95\\%, respectively. Next, we investigate the effectiveness of adversarial\ntraining on model robustness and explainability via Gradient-weighted Class\nActivation Mapping (Grad-CAM) heatmaps. We find that adversarially trained\nmodels not only significantly outperform their standard counterparts on\nclassifying perturbed images, but also yield saliency maps that 1) better\nspecify clinically relevant features, 2) are robust against extraneous\nartifacts, and 3) agree considerably more with expert radiologist findings.\n","authors":["Karina Yang","Alexis Bennett","Dominique Duncan"],"pdf_url":"https://arxiv.org/pdf/2311.14227v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09043v2","updated":"2023-11-23T23:39:55Z","published":"2023-08-17T15:24:03Z","title":"Kernel-Based Tests for Likelihood-Free Hypothesis Testing","summary":" Given $n$ observations from two balanced classes, consider the task of\nlabeling an additional $m$ inputs that are known to all belong to \\emph{one} of\nthe two classes. Special cases of this problem are well-known: with complete\nknowledge of class distributions ($n=\\infty$) the problem is solved optimally\nby the likelihood-ratio test; when $m=1$ it corresponds to binary\nclassification; and when $m\\approx n$ it is equivalent to two-sample testing.\nThe intermediate settings occur in the field of likelihood-free inference,\nwhere labeled samples are obtained by running forward simulations and the\nunlabeled sample is collected experimentally. 
In recent work it was discovered\nthat there is a fundamental trade-off between $m$ and $n$: increasing the data\nsample $m$ reduces the amount $n$ of training/simulation data needed. In this\nwork we (a) introduce a generalization where unlabeled samples come from a\nmixture of the two classes -- a case often encountered in practice; (b) study\nthe minimax sample complexity for non-parametric classes of densities under\n\\textit{maximum mean discrepancy} (MMD) separation; and (c) investigate the\nempirical performance of kernels parameterized by neural networks on two tasks:\ndetection of the Higgs boson and detection of planted DDPM generated images\namidst CIFAR-10 images. For both problems we confirm the existence of the\ntheoretically predicted asymmetric $m$ vs $n$ trade-off.\n","authors":["Patrik Róbert Gerber","Tianze Jiang","Yury Polyanskiy","Rui Sun"],"pdf_url":"https://arxiv.org/pdf/2308.09043v2.pdf","comment":"36 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.14222v1","updated":"2023-11-23T23:02:10Z","published":"2023-11-23T23:02:10Z","title":"Risk Bounds of Accelerated SGD for Overparameterized Linear Regression","summary":" Accelerated stochastic gradient descent (ASGD) is a workhorse in deep\nlearning and often achieves better generalization performance than SGD.\nHowever, existing optimization theory can only explain the faster convergence\nof ASGD, but cannot explain its better generalization. In this paper, we study\nthe generalization of ASGD for overparameterized linear regression, which is\npossibly the simplest setting of learning with overparameterization. We\nestablish an instance-dependent excess risk bound for ASGD within each\neigen-subspace of the data covariance matrix. Our analysis shows that (i) ASGD\noutperforms SGD in the subspace of small eigenvalues, exhibiting a faster rate\nof exponential decay for bias error, while in the subspace of large\neigenvalues, its bias error decays slower than SGD; and (ii) the variance error\nof ASGD is always larger than that of SGD. Our result suggests that ASGD can\noutperform SGD when the difference between the initialization and the true\nweight vector is mostly confined to the subspace of small eigenvalues.\nAdditionally, when our analysis is specialized to linear regression in the\nstrongly convex setting, it yields a tighter bound for bias error than the\nbest-known result.\n","authors":["Xuheng Li","Yihe Deng","Jingfeng Wu","Dongruo Zhou","Quanquan Gu"],"pdf_url":"https://arxiv.org/pdf/2311.14222v1.pdf","comment":"85 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.14220v1","updated":"2023-11-23T22:41:30Z","published":"2023-11-23T22:41:30Z","title":"Assumption-lean and Data-adaptive Post-Prediction Inference","summary":" A primary challenge facing modern scientific research is the limited\navailability of gold-standard data which can be both costly and labor-intensive\nto obtain. With the rapid development of machine learning (ML), scientists have\nrelied on ML algorithms to predict these gold-standard outcomes with easily\nobtained covariates. However, these predicted outcomes are often used directly\nin subsequent statistical analyses, ignoring imprecision and heterogeneity\nintroduced by the prediction procedure. This will likely result in false\npositive findings and invalid scientific conclusions. In this work, we\nintroduce an assumption-lean and data-adaptive Post-Prediction Inference\n(POP-Inf) procedure that allows valid and powerful inference based on\nML-predicted outcomes. 
Its \"assumption-lean\" property guarantees reliable\nstatistical inference without assumptions on the ML-prediction, for a wide\nrange of statistical quantities. Its \"data-adaptive'\" feature guarantees an\nefficiency gain over existing post-prediction inference methods, regardless of\nthe accuracy of ML-prediction. We demonstrate the superiority and applicability\nof our method through simulations and large-scale genomic data.\n","authors":["Jiacheng Miao","Xinran Miao","Yixuan Wu","Jiwei Zhao","Qiongshi Lu"],"pdf_url":"https://arxiv.org/pdf/2311.14220v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.05954v4","updated":"2023-11-23T22:11:49Z","published":"2022-09-09T23:18:31Z","title":"Automatically Score Tissue Images Like a Pathologist by Transfer\n Learning","summary":" Cancer is the second leading cause of death in the world. Diagnosing cancer\nearly on can save many lives. Pathologists have to look at tissue microarray\n(TMA) images manually to identify tumors, which can be time-consuming,\ninconsistent and subjective. Existing automatic algorithms either have not\nachieved the accuracy level of a pathologist or require substantial human\ninvolvements. A major challenge is that TMA images with different shapes,\nsizes, and locations can have the same score. Learning staining patterns in TMA\nimages requires a huge number of images, which are severely limited due to\nprivacy and regulation concerns in medical organizations. TMA images from\ndifferent cancer types may share certain common characteristics, but combining\nthem directly harms the accuracy due to heterogeneity in their staining\npatterns. Transfer learning is an emerging learning paradigm that allows\nborrowing strength from similar problems. However, existing approaches\ntypically require a large sample from similar learning problems, while TMA\nimages of different cancer types are often available in small sample size and\nfurther existing algorithms are limited to transfer learning from one similar\nproblem. We propose a new transfer learning algorithm that could learn from\nmultiple related problems, where each problem has a small sample and can have a\nsubstantially different distribution from the original one. The proposed\nalgorithm has made it possible to break the critical accuracy barrier (the 75%\naccuracy level of pathologists), with a reported accuracy of 75.9% on breast\ncancer TMA images from the Stanford Tissue Microarray Database. It is supported\nby recent developments in transfer learning theory and empirical evidence in\nclustering technology. This will allow pathologists to confidently adopt\nautomatic algorithms in recognizing tumors consistently with a higher accuracy\nin real time.\n","authors":["Iris Yan"],"pdf_url":"https://arxiv.org/pdf/2209.05954v4.pdf","comment":"19 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.14214v1","updated":"2023-11-23T22:08:29Z","published":"2023-11-23T22:08:29Z","title":"Extending Variability-Aware Model Selection with Bias Detection in\n Machine Learning Projects","summary":" Data science projects often involve various machine learning (ML) methods\nthat depend on data, code, and models. One of the key activities in these\nprojects is the selection of a model or algorithm that is appropriate for the\ndata analysis at hand. ML model selection depends on several factors, which\ninclude data-related attributes such as sample size, functional requirements\nsuch as the prediction algorithm type, and non-functional requirements such as\nperformance and bias. 
However, the factors that influence such selection are\noften not well understood or explicitly represented. This paper describes\nongoing work on extending an adaptive variability-aware model selection method\nwith bias detection in ML projects. The method involves: (i) modeling the\nvariability of the factors that affect model selection using feature models\nbased on heuristics proposed in the literature; (ii) instantiating our\nvariability model with added features related to bias (e.g., bias-related\nmetrics); and (iii) conducting experiments that illustrate the method in a\nspecific case study based on a heart failure prediction project. The proposed\napproach aims to advance the state of the art\nby making explicit factors that influence model selection, particularly those\nrelated to bias, as well as their interactions. The provided representations\ncan transform model selection in ML projects into a non ad hoc, adaptive, and\nexplainable process.\n","authors":["Cristina Tavares","Nathalia Nascimento","Paulo Alencar","Donald Cowan"],"pdf_url":"https://arxiv.org/pdf/2311.14214v1.pdf","comment":"IEEE BigData 2023"},{"id":"http://arxiv.org/abs/2311.14212v1","updated":"2023-11-23T21:54:22Z","published":"2023-11-23T21:54:22Z","title":"Annotation Sensitivity: Training Data Collection Methods Affect Model\n Performance","summary":" When training data are collected from human annotators, the design of the\nannotation instrument, the instructions given to annotators, the\ncharacteristics of the annotators, and their interactions can impact training\ndata. This study demonstrates that design choices made when creating an\nannotation instrument also impact the models trained on the resulting\nannotations.\n We introduce the term annotation sensitivity to refer to the impact of\nannotation data collection methods on the annotations themselves and on\ndownstream model performance and predictions.\n We collect annotations of hate speech and offensive language in five\nexperimental conditions of an annotation instrument, randomly assigning\nannotators to conditions. We then fine-tune BERT models on each of the five\nresulting datasets and evaluate model performance on a holdout portion of each\ncondition. We find considerable differences between the conditions for 1) the\nshare of hate speech/offensive language annotations, 2) model performance, 3)\nmodel predictions, and 4) model learning curves.\n Our results emphasize the crucial role played by the annotation instrument,\nwhich has received little attention in the machine learning literature. We call\nfor additional research into how and why the instrument impacts the annotations\nto inform the development of best practices in instrument design.\n","authors":["Christoph Kern","Stephanie Eckman","Jacob Beck","Rob Chew","Bolei Ma","Frauke Kreuter"],"pdf_url":"https://arxiv.org/pdf/2311.14212v1.pdf","comment":"EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2306.15030v2","updated":"2023-11-23T21:53:19Z","published":"2023-06-26T19:40:10Z","title":"Equivariant flow matching","summary":" Normalizing flows are a class of deep generative models that are especially\ninteresting for modeling probability distributions in physics, where the exact\nlikelihood of flows allows reweighting to known target energy functions and\ncomputing unbiased observables. 
For instance, Boltzmann generators tackle the\nlong-standing sampling problem in statistical physics by training flows to\nproduce equilibrium samples of many-body systems such as small molecules and\nproteins. To build effective models for such systems, it is crucial to\nincorporate the symmetries of the target energy into the model, which can be\nachieved by equivariant continuous normalizing flows (CNFs). However, CNFs can\nbe computationally expensive to train and generate samples from, which has\nhampered their scalability and practical application. In this paper, we\nintroduce equivariant flow matching, a new training objective for equivariant\nCNFs that is based on the recently proposed optimal transport flow matching.\nEquivariant flow matching exploits the physical symmetries of the target energy\nfor efficient, simulation-free training of equivariant CNFs. We demonstrate the\neffectiveness of flow matching on rotation and permutation invariant\nmany-particle systems and a small molecule, alanine dipeptide, where for the\nfirst time we obtain a Boltzmann generator with significant sampling efficiency\nwithout relying on tailored internal coordinate featurization. Our results show\nthat the equivariant flow matching objective yields flows with shorter\nintegration paths, improved sampling efficiency, and higher scalability\ncompared to existing methods.\n","authors":["Leon Klein","Andreas Krämer","Frank Noé"],"pdf_url":"https://arxiv.org/pdf/2306.15030v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05784v2","updated":"2023-11-23T20:45:53Z","published":"2023-11-09T23:25:29Z","title":"Are \"Hierarchical\" Visual Representations Hierarchical?","summary":" Learned visual representations often capture large amounts of semantic\ninformation for accurate downstream applications. Human understanding of the\nworld is fundamentally grounded in hierarchy. To mimic this and further improve\nrepresentation capabilities, the community has explored \"hierarchical\" visual\nrepresentations that aim at modeling the underlying hierarchy of the visual\nworld. In this work, we set out to investigate if hierarchical visual\nrepresentations truly capture the human perceived hierarchy better than\nstandard learned representations. To this end, we create HierNet, a suite of 12\ndatasets spanning 3 kinds of hierarchy from the BREEDs subset of ImageNet.\nAfter extensive evaluation of Hyperbolic and Matryoshka Representations across\ntraining setups, we conclude that they do not capture hierarchy any better than\nthe standard representations but can assist in other aspects like search\nefficiency and interpretability. Our benchmark and the datasets are\nopen-sourced at https://github.com/ethanlshen/HierNet.\n","authors":["Ethan Shen","Ali Farhadi","Aditya Kusupati"],"pdf_url":"https://arxiv.org/pdf/2311.05784v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14197v1","updated":"2023-11-23T20:41:46Z","published":"2023-11-23T20:41:46Z","title":"Enhancing mTBI Diagnosis with Residual Triplet Convolutional Neural\n Network Using 3D CT","summary":" Mild Traumatic Brain Injury (mTBI) is a common and challenging condition to\ndiagnose accurately. Timely and precise diagnosis is essential for effective\ntreatment and improved patient outcomes. Traditional diagnostic methods for\nmTBI often have limitations in terms of accuracy and sensitivity. 
In this\nstudy, we introduce an innovative approach to enhance mTBI diagnosis using 3D\nComputed Tomography (CT) images and a metric learning technique trained with\ntriplet loss. To address these challenges, we propose a Residual Triplet\nConvolutional Neural Network (RTCNN) model to distinguish between mTBI cases\nand healthy ones by embedding 3D CT scans into a feature space. The triplet\nloss function maximizes the margin between similar and dissimilar image pairs,\noptimizing feature representations. This facilitates better context placement\nof individual cases, aids informed decision-making, and has the potential to\nimprove patient outcomes. Our RTCNN model shows promising performance in mTBI\ndiagnosis, achieving an average accuracy of 94.3%, a sensitivity of 94.1%, and\na specificity of 95.2%, as confirmed through a five-fold cross-validation.\nImportantly, when compared to the conventional Residual Convolutional Neural\nNetwork (RCNN) model, the RTCNN exhibits a significant improvement, showcasing\na remarkable 22.5% increase in specificity, a notable 16.2% boost in accuracy,\nand an 11.3% enhancement in sensitivity. Moreover, RTCNN requires lower memory\nresources, making it not only highly effective but also resource-efficient in\nminimizing false positives while maximizing its diagnostic accuracy in\ndistinguishing normal CT scans from mTBI cases. The quantitative performance\nmetrics provided and utilization of occlusion sensitivity maps to visually\nexplain the model's decision-making process further enhance the\ninterpretability and transparency of our approach.\n","authors":["Hanem Ellethy","Shekhar S. Chandra","Viktor Vegh"],"pdf_url":"https://arxiv.org/pdf/2311.14197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14195v1","updated":"2023-11-23T20:31:48Z","published":"2023-11-23T20:31:48Z","title":"Touch Analysis: An Empirical Evaluation of Machine Learning\n Classification Algorithms on Touch Data","summary":" Our research aims at classifying individuals based on their unique\ninteractions on touchscreen-based smartphones. In this research, we use\nTouch-Analytics datasets, which include 41 subjects and 30 different behavioral\nfeatures. Furthermore, we derived new features from the raw data to improve the\noverall authentication performance. Previous research has already been done on\nthe Touch-Analytics datasets with the state-of-the-art classifiers, including\nSupport Vector Machine (SVM) and k-nearest neighbor (kNN), and achieved equal\nerror rates (EERs) between 0% to 4%. Here, we propose a novel Deep Neural Net\n(DNN) architecture to classify the individuals correctly. The proposed DNN\narchitecture has three dense layers and uses many-to-many mapping techniques.\nWhen we combine the new features with the existing ones, SVM and kNN achieved\nthe classification accuracy of 94.7% and 94.6%, respectively. This research\nexplored seven other classifiers and out of them, the decision tree and our\nproposed DNN classifiers resulted in the highest accuracy of 100%. 
The others\nincluded: Logistic Regression (LR), Linear Discriminant Analysis (LDA),\nGaussian Naive Bayes (NB), Neural Network, and VGGNet with the following\naccuracy scores of 94.7%, 95.9%, 31.9%, 88.8%, and 96.1%, respectively.\n","authors":["Melodee Montgomery","Prosenjit Chatterjee","John Jenkins","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2311.14195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14182v1","updated":"2023-11-23T20:03:51Z","published":"2023-11-23T20:03:51Z","title":"Gradient-based bilevel optimization for multi-penalty Ridge regression\n through matrix differential calculus","summary":" Common regularization algorithms for linear regression, such as LASSO and\nRidge regression, rely on a regularization hyperparameter that balances the\ntradeoff between minimizing the fitting error and the norm of the learned model\ncoefficients. As this hyperparameter is scalar, it can be easily selected via\nrandom or grid search optimizing a cross-validation criterion. However, using a\nscalar hyperparameter limits the algorithm's flexibility and potential for\nbetter generalization. In this paper, we address the problem of linear\nregression with l2-regularization, where a different regularization\nhyperparameter is associated with each input variable. We optimize these\nhyperparameters using a gradient-based approach, wherein the gradient of a\ncross-validation criterion with respect to the regularization hyperparameters\nis computed analytically through matrix differential calculus. Additionally, we\nintroduce two strategies tailored for sparse model learning problems aiming at\nreducing the risk of overfitting to the validation data. Numerical examples\ndemonstrate that our multi-hyperparameter regularization approach outperforms\nLASSO, Ridge, and Elastic Net regression. Moreover, the analytical computation\nof the gradient proves to be more efficient in terms of computational time\ncompared to automatic differentiation, especially when handling a large number\nof input variables. Application to the identification of over-parameterized\nLinear Parameter-Varying models is also presented.\n","authors":["Gabriele Maroni","Loris Cannelli","Dario Piga"],"pdf_url":"https://arxiv.org/pdf/2311.14182v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14156v2","updated":"2023-11-23T19:50:21Z","published":"2023-09-25T14:08:21Z","title":"Designing and evaluating an online reinforcement learning agent for\n physical exercise recommendations in N-of-1 trials","summary":" Personalized adaptive interventions offer the opportunity to increase patient\nbenefits, however, there are challenges in their planning and implementation.\nOnce implemented, it is an important question whether personalized adaptive\ninterventions are indeed clinically more effective compared to a fixed gold\nstandard intervention. In this paper, we present an innovative N-of-1 trial\nstudy design testing whether implementing a personalized intervention by an\nonline reinforcement learning agent is feasible and effective. Throughout, we\nuse a new study on physical exercise recommendations to reduce pain in\nendometriosis for illustration. We describe the design of a contextual bandit\nrecommendation agent and evaluate the agent in simulation studies. The results\nshow that, first, implementing a personalized intervention by an online\nreinforcement learning agent is feasible. Second, such adaptive interventions\nhave the potential to improve patients' benefits even if only few observations\nare available. 
As one challenge, they add complexity to the design and\nimplementation process. In order to quantify the expected benefit, data from\nprevious interventional studies is required. We expect our approach to be\ntransferable to other interventions and clinical interventions.\n","authors":["Dominik Meier","Ipek Ensari","Stefan Konigorski"],"pdf_url":"https://arxiv.org/pdf/2309.14156v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14177v1","updated":"2023-11-23T19:49:59Z","published":"2023-11-23T19:49:59Z","title":"TCuPGAN: A novel framework developed for optimizing human-machine\n interactions in citizen science","summary":" In the era of big data in scientific research, there is a necessity to\nleverage techniques which reduce human effort in labeling and categorizing\nlarge datasets by involving sophisticated machine tools. To combat this\nproblem, we present a novel, general purpose model for 3D segmentation that\nleverages patch-wise adversariality and Long Short-Term Memory to encode\nsequential information. Using this model alongside citizen science projects\nwhich use 3D datasets (image cubes) on the Zooniverse platforms, we propose an\niterative human-machine optimization framework where only a fraction of the 2D\nslices from these cubes are seen by the volunteers. We leverage the patch-wise\ndiscriminator in our model to provide an estimate of which slices within these\nimage cubes have poorly generalized feature representations, and\ncorrespondingly poor machine performance. These images with corresponding\nmachine proposals would be presented to volunteers on Zooniverse for\ncorrection, leading to a drastic reduction in the volunteer effort on citizen\nscience projects. We trained our model on ~2300 liver tissue 3D electron\nmicrographs. Lipid droplets were segmented within these images through human\nannotation via the `Etch A Cell - Fat Checker' citizen science project, hosted\non the Zooniverse platform. In this work, we demonstrate this framework and the\nselection methodology which resulted in a measured reduction in volunteer\neffort by more than 60%. We envision this type of joint human-machine\npartnership will be of great use on future Zooniverse projects.\n","authors":["Ramanakumar Sankar","Kameswara Mantha","Lucy Fortson","Helen Spiers","Thomas Pengo","Douglas Mashek","Myat Mo","Mark Sanders","Trace Christensen","Jeffrey Salisbury","Laura Trouille"],"pdf_url":"https://arxiv.org/pdf/2311.14177v1.pdf","comment":"5 pages, 1 figure, accepted for publication at HLDM '23 (ECML PKDD\n 2023 workshop)"},{"id":"http://arxiv.org/abs/2311.00860v2","updated":"2023-11-23T19:41:41Z","published":"2023-11-01T21:28:24Z","title":"Zero Coordinate Shift: Whetted Automatic Differentiation for\n Physics-informed Operator Learning","summary":" Automatic differentiation (AD) is a critical step in physics-informed machine\nlearning, required for computing the high-order derivatives of network output\nw.r.t. coordinates of collocation points. In this paper, we present a novel and\nlightweight algorithm to conduct AD for physics-informed operator learning,\nwhich we call the trick of Zero Coordinate Shift (ZCS). Instead of making all\nsampled coordinates as leaf variables, ZCS introduces only one scalar-valued\nleaf variable for each spatial or temporal dimension, simplifying the wanted\nderivatives from \"many-roots-many-leaves\" to \"one-root-many-leaves\" whereby\nreverse-mode AD becomes directly utilisable. 
It has led to an outstanding\nperformance leap by avoiding the duplication of the computational graph along\nthe dimension of functions (physical parameters). ZCS is easy to implement with\ncurrent deep learning libraries; our own implementation is achieved by\nextending the DeepXDE package. We carry out a comprehensive benchmark analysis\nand several case studies, training physics-informed DeepONets to solve partial\ndifferential equations (PDEs) without data. The results show that ZCS has\npersistently reduced GPU memory consumption and wall time for training by an\norder of magnitude, and such reduction factor scales with the number of\nfunctions. As a low-level optimisation technique, ZCS imposes no restrictions\non data, physics (PDE) or network architecture and does not compromise training\nresults from any aspect.\n","authors":["Kuangdai Leng","Mallikarjun Shankar","Jeyan Thiyagalingam"],"pdf_url":"https://arxiv.org/pdf/2311.00860v2.pdf","comment":"19 pages; this minor revision gives clearer explanation on the reason\n of performance boost by ZCS"},{"id":"http://arxiv.org/abs/2311.14169v1","updated":"2023-11-23T19:20:59Z","published":"2023-11-23T19:20:59Z","title":"Evaluating GPT-4's Vision Capabilities on Brazilian University Admission\n Exams","summary":" Recent advancements in language models have showcased human-comparable\nperformance in academic entrance exams. However, existing studies often\noverlook questions that require the integration of visual comprehension, thus\ncompromising the full spectrum and complexity inherent in real-world scenarios.\nTo address this gap, we present a comprehensive framework to evaluate language\nmodels on entrance exams, which incorporates both textual and visual elements.\nWe evaluate the two most recent editions of Exame Nacional do Ensino M\\'edio\n(ENEM), the main standardized entrance examination adopted by Brazilian\nuniversities. Our study not only reaffirms the capabilities of GPT-4 as the\nstate of the art for handling complex multidisciplinary questions, but also\npioneers in offering a realistic assessment of multimodal language models on\nPortuguese examinations. One of the highlights is that text captions\ntranscribing visual content outperform the direct use of images, suggesting\nthat the vision model has room for improvement. Yet, despite improvements\nafforded by images or captions, mathematical questions remain a challenge for\nthese state-of-the-art models. The code and data used on experiments are\navailable at https://github.com/piresramon/gpt-4-enem.\n","authors":["Ramon Pires","Thales Sales Almeida","Hugo Abonizio","Rodrigo Nogueira"],"pdf_url":"https://arxiv.org/pdf/2311.14169v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.17003"},{"id":"http://arxiv.org/abs/2311.12716v2","updated":"2023-11-23T19:12:07Z","published":"2023-11-21T16:43:13Z","title":"minimax: Efficient Baselines for Autocurricula in JAX","summary":" Unsupervised environment design (UED) is a form of automatic curriculum\nlearning for training robust decision-making agents to zero-shot transfer into\nunseen environments. Such autocurricula have received much interest from the RL\ncommunity. However, UED experiments, based on CPU rollouts and GPU model\nupdates, have often required several weeks of training. This compute\nrequirement is a major obstacle to rapid innovation for the field. This work\nintroduces the minimax library for UED training on accelerated hardware. 
Using\nJAX to implement fully-tensorized environments and autocurriculum algorithms,\nminimax allows the entire training loop to be compiled for hardware\nacceleration. To provide a petri dish for rapid experimentation, minimax\nincludes a tensorized grid-world based on MiniGrid, in addition to reusable\nabstractions for conducting autocurricula in procedurally-generated\nenvironments. With these components, minimax provides strong UED baselines,\nincluding new parallelized variants, which achieve over 120$\\times$ speedups in\nwall time compared to previous implementations when training with equal batch\nsizes. The minimax library is available under the Apache 2.0 license at\nhttps://github.com/facebookresearch/minimax.\n","authors":["Minqi Jiang","Michael Dennis","Edward Grefenstette","Tim Rocktäschel"],"pdf_url":"https://arxiv.org/pdf/2311.12716v2.pdf","comment":"Presented at ALOE 2023"},{"id":"http://arxiv.org/abs/2311.14168v1","updated":"2023-11-23T19:08:39Z","published":"2023-11-23T19:08:39Z","title":"Fast Policy Learning for Linear Quadratic Regulator with Entropy\n Regularization","summary":" This paper proposes and analyzes two new policy learning methods: regularized\npolicy gradient (RPG) and iterative policy optimization (IPO), for a class of\ndiscounted linear-quadratic regulator (LQR) problems over an infinite time\nhorizon with entropy regularization. Assuming access to the exact policy\nevaluation, both proposed approaches are proved to converge linearly in finding\noptimal policies of the regularized LQR. Moreover, the IPO method can achieve a\nsuper-linear convergence rate once it enters a local region around the optimal\npolicy. Finally, when the optimal policy from a well-understood environment in\nan RL problem is appropriately transferred as the initial policy to an RL\nproblem with an unknown environment, the IPO method is shown to enable a\nsuper-linear convergence rate if the latter is sufficiently close to the\nformer. The performances of these proposed algorithms are supported by\nnumerical examples.\n","authors":["Xin Guo","Xinyu Li","Renyuan Xu"],"pdf_url":"https://arxiv.org/pdf/2311.14168v1.pdf","comment":"33 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.14160v1","updated":"2023-11-23T19:00:02Z","published":"2023-11-23T19:00:02Z","title":"Efficient and Robust Jet Tagging at the LHC with Knowledge Distillation","summary":" The challenging environment of real-time data processing systems at the Large\nHadron Collider (LHC) strictly limits the computational complexity of\nalgorithms that can be deployed. For deep learning models, this implies that\nonly models with low computational complexity that have weak inductive bias are\nfeasible. To address this issue, we utilize knowledge distillation to leverage\nboth the performance of large models and the reduced computational complexity\nof small ones. In this paper, we present an implementation of knowledge\ndistillation, demonstrating an overall boost in the student models' performance\nfor the task of classifying jets at the LHC. 
Furthermore, by using a teacher\nmodel with a strong inductive bias of Lorentz symmetry, we show that we can\ninduce the same inductive bias in the student model which leads to better\nrobustness against arbitrary Lorentz boost.\n","authors":["Ryan Liu","Abhijith Gandrakota","Jennifer Ngadiuba","Maria Spiropulu","Jean-Roch Vlimant"],"pdf_url":"https://arxiv.org/pdf/2311.14160v1.pdf","comment":"7 pages, 3 figures, accepted at the Machine Learning and the Physical\n Sciences Workshop, NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.14156v1","updated":"2023-11-23T18:56:51Z","published":"2023-11-23T18:56:51Z","title":"Variational Annealing on Graphs for Combinatorial Optimization","summary":" Several recent unsupervised learning methods use probabilistic approaches to\nsolve combinatorial optimization (CO) problems based on the assumption of\nstatistically independent solution variables. We demonstrate that this\nassumption imposes performance limitations in particular on difficult problem\ninstances. Our results corroborate that an autoregressive approach which\ncaptures statistical dependencies among solution variables yields superior\nperformance on many popular CO problems. We introduce subgraph tokenization in\nwhich the configuration of a set of solution variables is represented by a\nsingle token. This tokenization technique alleviates the drawback of the long\nsequential sampling procedure which is inherent to autoregressive methods\nwithout sacrificing expressivity. Importantly, we theoretically motivate an\nannealed entropy regularization and show empirically that it is essential for\nefficient and stable learning.\n","authors":["Sebastian Sanokowski","Wilhelm Berghammer","Sepp Hochreiter","Sebastian Lehner"],"pdf_url":"https://arxiv.org/pdf/2311.14156v1.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.14153v1","updated":"2023-11-23T18:54:25Z","published":"2023-11-23T18:54:25Z","title":"Tube-NeRF: Efficient Imitation Learning of Visuomotor Policies from MPC\n using Tube-Guided Data Augmentation and NeRFs","summary":" Imitation learning (IL) can train computationally-efficient sensorimotor\npolicies from a resource-intensive Model Predictive Controller (MPC), but it\noften requires many samples, leading to long training times or limited\nrobustness. To address these issues, we combine IL with a variant of robust MPC\nthat accounts for process and sensing uncertainties, and we design a data\naugmentation (DA) strategy that enables efficient learning of vision-based\npolicies. The proposed DA method, named Tube-NeRF, leverages Neural Radiance\nFields (NeRFs) to generate novel synthetic images, and uses properties of the\nrobust MPC (the tube) to select relevant views and to efficiently compute the\ncorresponding actions. We tailor our approach to the task of localization and\ntrajectory tracking on a multirotor, by learning a visuomotor policy that\ngenerates control actions using images from the onboard camera as only source\nof horizontal position. Our evaluations numerically demonstrate learning of a\nrobust visuomotor policy with an 80-fold increase in demonstration efficiency\nand a 50% reduction in training time over current IL methods. Additionally, our\npolicies successfully transfer to a real multirotor, achieving accurate\nlocalization and low tracking errors despite large disturbances, with an\nonboard inference time of only 1.5 ms.\n","authors":["Andrea Tagliabue","Jonathan P. 
How"],"pdf_url":"https://arxiv.org/pdf/2311.14153v1.pdf","comment":"Video: https://youtu.be/_W5z33ZK1m4. Evolved paper from our previous\n work: arXiv:2210.10127"},{"id":"http://arxiv.org/abs/2307.07871v2","updated":"2023-11-23T18:45:29Z","published":"2023-07-15T19:05:56Z","title":"The SocialAI School: Insights from Developmental Psychology Towards\n Artificial Socio-Cultural Agents","summary":" Developmental psychologists have long-established the importance of\nsocio-cognitive abilities in human intelligence. These abilities enable us to\nenter, participate and benefit from human culture. AI research on social\ninteractive agents mostly concerns the emergence of culture in a multi-agent\nsetting (often without a strong grounding in developmental psychology). We\nargue that AI research should be informed by psychology and study\nsocio-cognitive abilities enabling to enter a culture too. We discuss the\ntheories of Michael Tomasello and Jerome Bruner to introduce some of their\nconcepts to AI and outline key concepts and socio-cognitive abilities. We\npresent The SocialAI school - a tool including a customizable parameterized\nuite of procedurally generated environments, which simplifies conducting\nexperiments regarding those concepts. We show examples of such experiments with\nRL agents and Large Language Models. The main motivation of this work is to\nengage the AI community around the problem of social intelligence informed by\ndevelopmental psychology, and to provide a tool to simplify first steps in this\ndirection. Refer to the project website for code and additional information:\nhttps://sites.google.com/view/socialai-school.\n","authors":["Grgur Kovač","Rémy Portelas","Peter Ford Dominey","Pierre-Yves Oudeyer"],"pdf_url":"https://arxiv.org/pdf/2307.07871v2.pdf","comment":"Preprint, see v1 for a shorter version (accepted at the \"Workshop on\n Theory-of-Mind\" at ICML 2023) See project website for demo and code:\n https://sites.google.com/view/socialai-school"},{"id":"http://arxiv.org/abs/2311.14148v1","updated":"2023-11-23T18:37:26Z","published":"2023-11-23T18:37:26Z","title":"Automated 3D Tumor Segmentation using Temporal Cubic PatchGAN (TCuP-GAN)","summary":" Development of robust general purpose 3D segmentation frameworks using the\nlatest deep learning techniques is one of the active topics in various\nbio-medical domains. In this work, we introduce Temporal Cubic PatchGAN\n(TCuP-GAN), a volume-to-volume translational model that marries the concepts of\na generative feature learning framework with Convolutional Long Short-Term\nMemory Networks (LSTMs), for the task of 3D segmentation. We demonstrate the\ncapabilities of our TCuP-GAN on the data from four segmentation challenges\n(Adult Glioma, Meningioma, Pediatric Tumors, and Sub-Saharan Africa subset)\nfeatured within the 2023 Brain Tumor Segmentation (BraTS) Challenge and\nquantify its performance using LesionWise Dice similarity and $95\\%$ Hausdorff\nDistance metrics. We demonstrate the successful learning of our framework to\npredict robust multi-class segmentation masks across all the challenges. 
This\nbenchmarking work serves as a stepping stone for future efforts towards\napplying TCuP-GAN on other multi-class tasks such as multi-organelle\nsegmentation in electron microscopy imaging.\n","authors":["Kameswara Bharadwaj Mantha","Ramanakumar Sankar","Lucy Fortson"],"pdf_url":"https://arxiv.org/pdf/2311.14148v1.pdf","comment":"Submitted as a short paper to the proceedings of the 2023 Brain Tumor\n Segmentation (BraTS) Challenge"},{"id":"http://arxiv.org/abs/2311.14139v1","updated":"2023-11-23T18:13:34Z","published":"2023-11-23T18:13:34Z","title":"Machine Learning For An Explainable Cost Prediction of Medical Insurance","summary":" Predictive modeling in healthcare continues to be an active actuarial\nresearch topic as more insurance companies aim to maximize the potential of\nMachine Learning approaches to increase their productivity and efficiency. In\nthis paper, the authors deployed three regression-based ensemble ML models that\ncombine variations of decision trees through Extreme Gradient Boosting,\nGradient-boosting Machine, and Random Forest methods in predicting medical\ninsurance costs. Explainable Artificial Intelligence methods, SHapley Additive\nexPlanations and Individual Conditional Expectation plots, were deployed to\ndiscover and explain the key determinant factors that influence medical\ninsurance premium prices in the dataset. The dataset used comprised 986 records\nand is publicly available in the KAGGLE repository. The models were evaluated\nusing four performance evaluation metrics, including R-squared, Mean Absolute\nError, Root Mean Squared Error, and Mean Absolute Percentage Error. The results\nshow that all models produced impressive outcomes; however, the XGBoost model\nachieved a better overall performance although it also expended more\ncomputational resources, while the RF model recorded a lesser prediction error\nand consumed far fewer computing resources than the XGBoost model. Furthermore,\nwe compared the outcome of both XAI methods in identifying the key determinant\nfeatures that influenced the PremiumPrices for each model, and whereas both XAI\nmethods produced similar outcomes, we found that the ICE plots showed in more\ndetail the interactions between each variable than the SHAP analysis, which\nseemed to be more high-level. It is the aim of the authors that the\ncontributions of this study will help policymakers, insurers, and potential\nmedical insurance buyers in their decision-making process for selecting the\nright policies that meet their specific needs.\n","authors":["Ugochukwu Orji","Elochukwu Ukwandu"],"pdf_url":"https://arxiv.org/pdf/2311.14139v1.pdf","comment":"42 pages, 16 figures and 9 tables"},{"id":"http://arxiv.org/abs/2311.14137v1","updated":"2023-11-23T18:08:15Z","published":"2023-11-23T18:08:15Z","title":"Privacy-Preserving Algorithmic Recourse","summary":" When individuals are subject to adverse outcomes from machine learning\nmodels, providing a recourse path to help achieve a positive outcome is\ndesirable. Recent work has shown that counterfactual explanations - which can\nbe used as a means of single-step recourse - are vulnerable to privacy issues,\nputting an individual's privacy at risk. Providing a sequential multi-step path\nfor recourse can amplify this risk. Furthermore, simply adding noise to\nrecourse paths found from existing methods can impact the realism and\nactionability of the path for an end-user. 
In this work, we address privacy\nissues when generating realistic recourse paths based on instance-based\ncounterfactual explanations, and provide PrivRecourse: an end-to-end privacy\npreserving pipeline that can provide realistic recourse paths. PrivRecourse\nuses differentially private (DP) clustering to represent non-overlapping\nsubsets of the private dataset. These DP cluster centers are then used to\ngenerate recourse paths by forming a graph with cluster centers as the nodes,\nso that we can generate realistic - feasible and actionable - recourse paths.\nWe empirically evaluate our approach on finance datasets and compare it to\nsimply adding noise to data instances, and to using DP synthetic data, to\ngenerate the graph. We observe that PrivRecourse can provide paths that are\nprivate and realistic.\n","authors":["Sikha Pentyala","Shubham Sharma","Sanjay Kariyappa","Freddy Lecue","Daniele Magazzeni"],"pdf_url":"https://arxiv.org/pdf/2311.14137v1.pdf","comment":"Accepted at 3rd International Workshop on Explainable AI in Finance,\n ICAIF 2023"},{"id":"http://arxiv.org/abs/2311.14136v1","updated":"2023-11-23T18:06:05Z","published":"2023-11-23T18:06:05Z","title":"A Blockchain Solution for Collaborative Machine Learning over IoT","summary":" The rapid growth of Internet of Things (IoT) devices and applications has led\nto an increased demand for advanced analytics and machine learning techniques\ncapable of handling the challenges associated with data privacy, security, and\nscalability. Federated learning (FL) and blockchain technologies have emerged\nas promising approaches to address these challenges by enabling decentralized,\nsecure, and privacy-preserving model training on distributed data sources. In\nthis paper, we present a novel IoT solution that combines the incremental\nlearning vector quantization algorithm (XuILVQ) with Ethereum blockchain\ntechnology to facilitate secure and efficient data sharing, model training, and\nprototype storage in a distributed environment. Our proposed architecture\naddresses the shortcomings of existing blockchain-based FL solutions by\nreducing computational and communication overheads while maintaining data\nprivacy and security. We assess the performance of our system through a series\nof experiments, showcasing its potential to enhance the accuracy and efficiency\nof machine learning tasks in IoT settings.\n","authors":["Carlos Beis-Penedo","Francisco Troncoso-Pastoriza","Rebeca P. Díaz-Redondo","Ana Fernández-Vilas","Manuel Fernández-Veiga","Martín González Soto"],"pdf_url":"https://arxiv.org/pdf/2311.14136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14131v1","updated":"2023-11-23T17:59:48Z","published":"2023-11-23T17:59:48Z","title":"Exactly conservative physics-informed neural networks and deep operator\n networks for dynamical systems","summary":" We introduce a method for training exactly conservative physics-informed\nneural networks and physics-informed deep operator networks for dynamical\nsystems. The method employs a projection-based technique that maps a candidate\nsolution learned by the neural network solver for any given dynamical system\npossessing at least one first integral onto an invariant manifold. 
We\nillustrate that exactly conservative physics-informed neural network solvers\nand physics-informed deep operator networks for dynamical systems vastly\noutperform their non-conservative counterparts for several real-world problems\nfrom the mathematical sciences.\n","authors":["Elsa Cardoso-Bihlo","Alex Bihlo"],"pdf_url":"https://arxiv.org/pdf/2311.14131v1.pdf","comment":"12 pages, 6 figures, 1 algorithm"},{"id":"http://arxiv.org/abs/2311.14127v1","updated":"2023-11-23T17:50:30Z","published":"2023-11-23T17:50:30Z","title":"Byzantine Robustness and Partial Participation Can Be Achieved\n Simultaneously: Just Clip Gradient Differences","summary":" Distributed learning has emerged as a leading paradigm for training large\nmachine learning models. However, in real-world scenarios, participants may be\nunreliable or malicious, posing a significant challenge to the integrity and\naccuracy of the trained models. Byzantine fault tolerance mechanisms have been\nproposed to address these issues, but they often assume full participation from\nall clients, which is not always practical due to the unavailability of some\nclients or communication constraints. In our work, we propose the first\ndistributed method with client sampling and provable tolerance to Byzantine\nworkers. The key idea behind the developed method is the use of gradient\nclipping to control stochastic gradient differences in recursive variance\nreduction. This allows us to bound the potential harm caused by Byzantine\nworkers, even during iterations when all sampled clients are Byzantine.\nFurthermore, we incorporate communication compression into the method to\nenhance communication efficiency. Under quite general assumptions, we prove\nconvergence rates for the proposed method that match the existing\nstate-of-the-art (SOTA) theoretical results.\n","authors":["Grigory Malinovsky","Peter Richtárik","Samuel Horváth","Eduard Gorbunov"],"pdf_url":"https://arxiv.org/pdf/2311.14127v1.pdf","comment":"50 pages; 1 figure"},{"id":"http://arxiv.org/abs/2311.14126v1","updated":"2023-11-23T17:47:14Z","published":"2023-11-23T17:47:14Z","title":"Towards Auditing Large Language Models: Improving Text-based Stereotype\n Detection","summary":" Large Language Models (LLM) have made significant advances in the recent past\nbecoming more mainstream in Artificial Intelligence (AI) enabled human-facing\napplications. However, LLMs often generate stereotypical output inherited from\nhistorical data, amplifying societal biases and raising ethical concerns. This\nwork introduces i) the Multi-Grain Stereotype Dataset, which includes 52,751\ninstances of gender, race, profession and religion stereotypic text and ii) a\nnovel stereotype classifier for English text. We design several experiments to\nrigorously test the proposed model trained on the novel dataset. Our\nexperiments show that training the model in a multi-class setting can\noutperform the one-vs-all binary counterpart. Consistent feature importance\nsignals from different eXplainable AI tools demonstrate that the new model\nexploits relevant text features. We utilise the newly created model to assess\nthe stereotypic behaviour of the popular GPT family of models and observe the\nreduction of bias over time. 
In summary, our work establishes a robust and\npractical framework for auditing and evaluating the stereotypic bias in LLM.\n","authors":["Wu Zekun","Sahan Bulathwela","Adriano Soares Koshiyama"],"pdf_url":"https://arxiv.org/pdf/2311.14126v1.pdf","comment":"2023 NeurIPS SoLaR Workshop Accepted"},{"id":"http://arxiv.org/abs/2311.14125v1","updated":"2023-11-23T17:46:30Z","published":"2023-11-23T17:46:30Z","title":"Scalable AI Safety via Doubly-Efficient Debate","summary":" The emergence of pre-trained AI systems with powerful capabilities across a\ndiverse and ever-increasing set of complex domains has raised a critical\nchallenge for AI safety as tasks can become too complicated for humans to judge\ndirectly. Irving et al. [2018] proposed a debate method in this direction with\nthe goal of pitting the power of such AI models against each other until the\nproblem of identifying (mis)-alignment is broken down into a manageable\nsubtask. While the promise of this approach is clear, the original framework\nwas based on the assumption that the honest strategy is able to simulate\ndeterministic AI systems for an exponential number of steps, limiting its\napplicability. In this paper, we show how to address these challenges by\ndesigning a new set of debate protocols where the honest strategy can always\nsucceed using a simulation of a polynomial number of steps, whilst being able\nto verify the alignment of stochastic AI systems, even when the dishonest\nstrategy is allowed to use exponentially many simulation steps.\n","authors":["Jonah Brown-Cohen","Geoffrey Irving","Georgios Piliouras"],"pdf_url":"https://arxiv.org/pdf/2311.14125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14120v1","updated":"2023-11-23T17:30:31Z","published":"2023-11-23T17:30:31Z","title":"Weight fluctuations in (deep) linear neural networks and a derivation of\n the inverse-variance flatness relation","summary":" We investigate the stationary (late-time) training regime of single- and\ntwo-layer linear neural networks within the continuum limit of stochastic\ngradient descent (SGD) for synthetic Gaussian data. In the case of a\nsingle-layer network in the weakly oversampled regime, the spectrum of the\nnoise covariance matrix deviates notably from the Hessian, which can be\nattributed to the broken detailed balance of SGD dynamics. The weight\nfluctuations are in this case generally anisotropic, but experience an\nisotropic loss. For a two-layer network, we obtain the stochastic dynamics of\nthe weights in each layer and analyze the associated stationary covariances. We\nidentify the inter-layer coupling as a new source of anisotropy for the weight\nfluctuations. In contrast to the single-layer case, the weight fluctuations\nexperience an anisotropic loss, the flatness of which is inversely related to\nthe fluctuation variance. We thereby provide an analytical derivation of the\nrecently observed inverse variance-flatness relation in a deep linear network\nmodel.\n","authors":["Markus Gross","Arne P. Raulf","Christoph Räth"],"pdf_url":"https://arxiv.org/pdf/2311.14120v1.pdf","comment":"25 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.11762v2","updated":"2023-11-23T17:26:53Z","published":"2023-11-20T13:40:40Z","title":"MUVO: A Multimodal Generative World Model for Autonomous Driving with\n Geometric Representations","summary":" Learning unsupervised world models for autonomous driving has the potential\nto improve the reasoning capabilities of today's systems dramatically. 
However,\nmost work neglects the physical attributes of the world and focuses on sensor\ndata alone. We propose MUVO, a MUltimodal World Model with Geometric VOxel\nRepresentations to address this challenge. We utilize raw camera and lidar data\nto learn a sensor-agnostic geometric representation of the world, which can\ndirectly be used by downstream tasks, such as planning. We demonstrate\nmultimodal future predictions and show that our geometric representation\nimproves the prediction quality of both camera images and lidar point clouds.\n","authors":["Daniel Bogdoll","Yitian Yang","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2311.11762v2.pdf","comment":"Daniel Bogdoll and Yitian Yang contributed equally"},{"id":"http://arxiv.org/abs/2311.14115v1","updated":"2023-11-23T17:20:36Z","published":"2023-11-23T17:20:36Z","title":"A density estimation perspective on learning from pairwise human\n preferences","summary":" Learning from human feedback (LHF) -- and in particular learning from\npairwise preferences -- has recently become a crucial ingredient in training\nlarge language models (LLMs), and has been the subject of much research. Most\nrecent works frame it as a reinforcement learning problem, where a reward\nfunction is learned from pairwise preference data and the LLM is treated as a\npolicy which is adapted to maximize the rewards, often under additional\nregularization constraints. We propose an alternative interpretation which\ncenters on the generative process for pairwise preferences and treats LHF as a\ndensity estimation problem. We provide theoretical and empirical results\nshowing that for a family of generative processes defined via preference\nbehavior distribution equations, training a reward function on pairwise\npreferences effectively models an annotator's implicit preference distribution.\nFinally, we discuss and present findings on \"annotator misspecification\" --\nfailure cases where wrong modeling assumptions are made about annotator\nbehavior, resulting in poorly-adapted models -- suggesting that approaches that\nlearn from pairwise human preferences could have trouble learning from a\npopulation of annotators with diverse viewpoints.\n","authors":["Vincent Dumoulin","Daniel D. Johnson","Pablo Samuel Castro","Hugo Larochelle","Yann Dauphin"],"pdf_url":"https://arxiv.org/pdf/2311.14115v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14114v1","updated":"2023-11-23T17:20:09Z","published":"2023-11-23T17:20:09Z","title":"SySMOL: A Hardware-software Co-design Framework for Ultra-Low and\n Fine-Grained Mixed-Precision Neural Networks","summary":" Recent advancements in quantization and mixed-precision techniques offer\nsignificant promise for improving the run-time and energy efficiency of neural\nnetworks. In this work, we further showed that neural networks, wherein\nindividual parameters or activations can take on different precisions ranging\nbetween 1 and 4 bits, can achieve accuracies comparable to or exceeding the\nfull-precision counterparts. However, the deployment of such networks poses\nnumerous challenges, stemming from the necessity to manage and control the\ncompute/communication/storage requirements associated with these extremely\nfine-grained mixed precisions for each piece of data. There is a lack of\nexisting efficient hardware and system-level support tailored to these unique\nand challenging requirements. 
Our research introduces the first novel holistic\nhardware-software co-design approach for these networks, which enables a\ncontinuous feedback loop between hardware design, training, and inference to\nfacilitate systematic design exploration. As a proof-of-concept, we illustrate\nthis co-design approach by designing new, configurable CPU SIMD architectures\ntailored for these networks, tightly integrating the architecture with new\nsystem-aware training and inference techniques. We perform systematic design\nspace exploration using this framework to analyze various tradeoffs. The design\nfor mixed-precision networks that achieves optimized tradeoffs corresponds to\nan architecture that supports 1, 2, and 4-bit fixed-point operations with four\nconfigurable precision patterns, when coupled with system-aware training and\ninference optimization -- networks trained for this design achieve accuracies\nthat closely match full-precision accuracies, while compressing and improving\nrun-time efficiency of the neural networks drastically by 10-20x, compared to\nfull-precision networks.\n","authors":["Cyrus Zhou","Vaughn Richard","Pedro Savarese","Zachary Hassman","Michael Maire","Michael DiBrino","Yanjing Li"],"pdf_url":"https://arxiv.org/pdf/2311.14114v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14110v1","updated":"2023-11-23T17:13:37Z","published":"2023-11-23T17:13:37Z","title":"When is Off-Policy Evaluation Useful? A Data-Centric Perspective","summary":" Evaluating the value of a hypothetical target policy with only a logged\ndataset is important but challenging. On the one hand, it brings opportunities\nfor safe policy improvement under high-stakes scenarios like clinical\nguidelines. On the other hand, such opportunities raise a need for precise\noff-policy evaluation (OPE). While previous work on OPE focused on improving\nthe algorithm in value estimation, in this work, we emphasize the importance of\nthe offline dataset, hence putting forward a data-centric framework for\nevaluating OPE problems. We propose DataCOPE, a data-centric framework for\nevaluating OPE, that answers the questions of whether and to what extent we can\nevaluate a target policy given a dataset. DataCOPE (1) forecasts the overall\nperformance of OPE algorithms without access to the environment, which is\nespecially useful before real-world deployment where evaluating OPE is\nimpossible; (2) identifies the sub-group in the dataset where OPE can be\ninaccurate; (3) permits evaluations of datasets or data-collection strategies\nfor OPE problems. Our empirical analysis of DataCOPE in the logged contextual\nbandit settings using healthcare datasets confirms its ability to evaluate both\nmachine-learning and human expert policies like clinical guidelines.\n","authors":["Hao Sun","Alex J. Chan","Nabeel Seedat","Alihan Hüyük","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2311.14110v1.pdf","comment":"Off-Policy Evaluation, Data-Centric AI, Data-Centric Reinforcement\n Learning, Reinforcement Learning"},{"id":"http://arxiv.org/abs/2311.14108v1","updated":"2023-11-23T17:09:12Z","published":"2023-11-23T17:09:12Z","title":"MINTY: Rule-based Models that Minimize the Need for Imputing Features\n with Missing Values","summary":" Rule models are often preferred in prediction tasks with tabular inputs as\nthey can be easily interpreted using natural language and provide predictive\nperformance on par with more complex models. 
However, most rule models'\npredictions are undefined or ambiguous when some inputs are missing, forcing\nusers to rely on statistical imputation models or heuristics like zero\nimputation, undermining the interpretability of the models. In this work, we\npropose fitting concise yet precise rule models that learn to avoid relying on\nfeatures with missing values and, therefore, limit their reliance on imputation\nat test time. We develop MINTY, a method that learns rules in the form of\ndisjunctions between variables that act as replacements for each other when one\nor more is missing. This results in a sparse linear rule model, regularized to\nhave small dependence on features with missing values, that allows a trade-off\nbetween goodness of fit, interpretability, and robustness to missing values at\ntest time. We demonstrate the value of MINTY in experiments using synthetic and\nreal-world data sets and find its predictive performance comparable or\nfavorable to baselines, with smaller reliance on features with missing values.\n","authors":["Lena Stempfle","Fredrik D. Johansson"],"pdf_url":"https://arxiv.org/pdf/2311.14108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.02205v3","updated":"2023-11-23T17:04:26Z","published":"2022-03-04T09:31:20Z","title":"Evaluating Object (mis)Detection from a Safety and Reliability\n Perspective: Discussion and Measures","summary":" We argue that object detectors in the safety critical domain should\nprioritize detection of objects that are most likely to interfere with the\nactions of the autonomous actor. Especially, this applies to objects that can\nimpact the actor's safety and reliability. To quantify the impact of object\n(mis)detection on safety and reliability in the context of autonomous driving,\nwe propose new object detection measures that reward the correct identification\nof objects that are most dangerous and most likely to affect driving decisions.\nTo achieve this, we build an object criticality model to reward the detection\nof the objects based on proximity, orientation, and relative velocity with\nrespect to the subject vehicle. Then, we apply our model on the recent\nautonomous driving dataset nuScenes, and we compare nine object detectors.\nResults show that, in several settings, object detectors that perform best\naccording to the nuScenes ranking are not the preferable ones when the focus is\nshifted on safety and reliability.\n","authors":["Andrea Ceccarelli","Leonardo Montecchi"],"pdf_url":"https://arxiv.org/pdf/2203.02205v3.pdf","comment":"journal version, open access"},{"id":"http://arxiv.org/abs/2311.14101v1","updated":"2023-11-23T17:01:16Z","published":"2023-11-23T17:01:16Z","title":"Subnetwork Ensembles","summary":" Neural network ensembles have been effectively used to improve generalization\nby combining the predictions of multiple independently trained models. However,\nthe growing scale and complexity of deep neural networks have led to these\nmethods becoming prohibitively expensive and time consuming to implement.\nLow-cost ensemble methods have become increasingly important as they can\nalleviate the need to train multiple models from scratch while retaining the\ngeneralization benefits that traditional ensemble learning methods afford. 
This\ndissertation introduces and formalizes a low-cost framework for constructing\nSubnetwork Ensembles, where a collection of child networks are formed by\nsampling, perturbing, and optimizing subnetworks from a trained parent model.\nWe explore several distinct methodologies for generating child networks and we\nevaluate their efficacy through a variety of ablation studies and established\nbenchmarks. Our findings reveal that this approach can greatly improve training\nefficiency, parametric utilization, and generalization performance while\nminimizing computational cost. Subnetwork Ensembles offer a compelling\nframework for exploring how we can build better systems by leveraging the\nunrealized potential of deep neural networks.\n","authors":["Tim Whitaker"],"pdf_url":"https://arxiv.org/pdf/2311.14101v1.pdf","comment":"116 Pages, 21 figures, Accepted PhD Dissertation"},{"id":"http://arxiv.org/abs/2311.14094v1","updated":"2023-11-23T16:39:55Z","published":"2023-11-23T16:39:55Z","title":"Robust Decision Aggregation with Second-order Information","summary":" We consider a decision aggregation problem with two experts who each make a\nbinary recommendation after observing a private signal about an unknown binary\nworld state. An agent, who does not know the joint information structure\nbetween signals and states, sees the experts' recommendations and aims to match\nthe action with the true state. Under the scenario, we study whether\nsupplemented additionally with second-order information (each expert's forecast\non the other's recommendation) could enable a better aggregation.\n We adopt a minimax regret framework to evaluate the aggregator's performance,\nby comparing it to an omniscient benchmark that knows the joint information\nstructure. With general information structures, we show that second-order\ninformation provides no benefit. No aggregator can improve over a trivial\naggregator, which always follows the first expert's recommendation. However,\npositive results emerge when we assume experts' signals are conditionally\nindependent given the world state. When the aggregator is deterministic, we\npresent a robust aggregator that leverages second-order information, which can\nsignificantly outperform counterparts without it. Second, when two experts are\nhomogeneous, by adding a non-degenerate assumption on the signals, we\ndemonstrate that random aggregators using second-order information can surpass\noptimal ones without it. In the remaining settings, the second-order\ninformation is not beneficial. We also extend the above results to the setting\nwhen the aggregator's utility function is more general.\n","authors":["Yuqi Pan","Zhaohua Chen","Yuqing Kong"],"pdf_url":"https://arxiv.org/pdf/2311.14094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14090v1","updated":"2023-11-23T16:36:03Z","published":"2023-11-23T16:36:03Z","title":"Class Uncertainty: A Measure to Mitigate Class Imbalance","summary":" Class-wise characteristics of training examples affect the performance of\ndeep classifiers. A well-studied example is when the number of training\nexamples of classes follows a long-tailed distribution, a situation that is\nlikely to yield sub-optimal performance for under-represented classes. This\nclass imbalance problem is conventionally addressed by approaches relying on\nthe class-wise cardinality of training examples, such as data resampling. 
In\nthis paper, we demonstrate that considering solely the cardinality of classes\ndoes not cover all issues causing class imbalance. To measure class imbalance,\nwe propose \"Class Uncertainty\" as the average predictive uncertainty of the\ntraining examples, and we show that this novel measure captures the differences\nacross classes better than cardinality. We also curate SVCI-20 as a novel\ndataset in which the classes have equal number of training examples but they\ndiffer in terms of their hardness; thereby causing a type of class imbalance\nwhich cannot be addressed by the approaches relying on cardinality. We\nincorporate our \"Class Uncertainty\" measure into a diverse set of ten class\nimbalance mitigation methods to demonstrate its effectiveness on long-tailed\ndatasets as well as on our SVCI-20. Code and datasets will be made available.\n","authors":["Z. S. Baltaci","K. Oksuz","S. Kuzucu","K. Tezoren","B. K. Konar","A. Ozkan","E. Akbas","S. Kalkan"],"pdf_url":"https://arxiv.org/pdf/2311.14090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17400v2","updated":"2023-11-23T16:27:42Z","published":"2023-05-27T07:55:17Z","title":"Query-Policy Misalignment in Preference-Based Reinforcement Learning","summary":" Preference-based reinforcement learning (PbRL) provides a natural way to\nalign RL agents' behavior with human desired outcomes, but is often restrained\nby costly human feedback. To improve feedback efficiency, most existing PbRL\nmethods focus on selecting queries to maximally improve the overall quality of\nthe reward model, but counter-intuitively, we find that this may not\nnecessarily lead to improved performance. To unravel this mystery, we identify\na long-neglected issue in the query selection schemes of existing PbRL studies:\nQuery-Policy Misalignment. We show that the seemingly informative queries\nselected to improve the overall quality of reward model actually may not align\nwith RL agents' interests, thus offering little help on policy learning and\neventually resulting in poor feedback efficiency. We show that this issue can\nbe effectively addressed via near on-policy query and a specially designed\nhybrid experience replay, which together enforce the bidirectional query-policy\nalignment. Simple yet elegant, our method can be easily incorporated into\nexisting approaches by changing only a few lines of code. We showcase in\ncomprehensive experiments that our method achieves substantial gains in both\nhuman feedback and RL sample efficiency, demonstrating the importance of\naddressing query-policy misalignment in PbRL tasks.\n","authors":["Xiao Hu","Jianxiong Li","Xianyuan Zhan","Qing-Shan Jia","Ya-Qin Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.17400v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14790v2","updated":"2023-11-23T16:27:37Z","published":"2023-10-20T16:53:31Z","title":"Weighted Joint Maximum Mean Discrepancy Enabled\n Multi-Source-Multi-Target Unsupervised Domain Adaptation Fault Diagnosis","summary":" Despite the remarkable results that can be achieved by data-driven\nintelligent fault diagnosis techniques, they presuppose the same distribution\nof training and test data as well as sufficient labeled data. Various operating\nstates often exist in practical scenarios, leading to the problem of domain\nshift that hinders the effectiveness of fault diagnosis. 
While recent\nunsupervised domain adaptation methods enable cross-domain fault diagnosis,\nthey struggle to effectively utilize information from multiple source domains\nand achieve effective diagnosis faults in multiple target domains\nsimultaneously. In this paper, we innovatively proposed a weighted joint\nmaximum mean discrepancy enabled multi-source-multi-target unsupervised domain\nadaptation (WJMMD-MDA), which realizes domain adaptation under\nmulti-source-multi-target scenarios in the field of fault diagnosis for the\nfirst time. The proposed method extracts sufficient information from multiple\nlabeled source domains and achieves domain alignment between source and target\ndomains through an improved weighted distance loss. As a result,\ndomain-invariant and discriminative features between multiple source and target\ndomains are learned with cross-domain fault diagnosis realized. The performance\nof the proposed method is evaluated in comprehensive comparative experiments on\nthree datasets, and the experimental results demonstrate the superiority of\nthis method.\n","authors":["Zixuan Wang","Haoran Tang","Haibo Wang","Bo Qin","Mark D. Butala","Weiming Shen","Hongwei Wang"],"pdf_url":"https://arxiv.org/pdf/2310.14790v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00574v3","updated":"2023-11-23T16:24:39Z","published":"2023-10-01T05:11:54Z","title":"YFlows: Systematic Dataflow Exploration and Code Generation for\n Efficient Neural Network Inference using SIMD Architectures on CPUs","summary":" We address the challenges associated with deploying neural networks on CPUs,\nwith a particular focus on minimizing inference time while maintaining\naccuracy. Our novel approach is to use the dataflow (i.e., computation order)\nof a neural network to explore data reuse opportunities using heuristic-guided\nanalysis and a code generation framework, which enables exploration of various\nSingle Instruction, Multiple Data (SIMD) implementations to achieve optimized\nneural network execution. Our results demonstrate that the dataflow that keeps\noutputs in SIMD registers while also maximizing both input and weight reuse\nconsistently yields the best performance for a wide variety of inference\nworkloads, achieving up to 3x speedup for 8-bit neural networks, and up to 4.8x\nspeedup for binary neural networks, respectively, over the optimized\nimplementations of neural networks today.\n","authors":["Cyrus Zhou","Zack Hassman","Ruize Xu","Dhirpal Shah","Vaugnn Richard","Yanjing Li"],"pdf_url":"https://arxiv.org/pdf/2310.00574v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2311.14091v1","updated":"2023-11-23T16:36:40Z","published":"2023-11-23T16:36:40Z","title":"PortfolioMentor: Multimodal Generative AI Companion for Learning and\n Crafting Interactive Digital Art Portfolios","summary":" Digital art portfolios serve as impactful mediums for artists to convey their\nvisions, weaving together visuals, audio, interactions, and narratives.\nHowever, without technical backgrounds, design students often find it\nchallenging to translate creative ideas into tangible codes and designs, given\nthe lack of tailored resources for the non-technical, academic support in art\nschools, and a comprehensive guiding tool throughout the mentally demanding\nprocess. Recognizing the role of companionship in code learning and leveraging\ngenerative AI models' capabilities in supporting creative tasks, we present\nPortfolioMentor, a coding companion chatbot for IDEs. 
This tool guides and\ncollaborates with students through proactive suggestions and responsible Q&As\nfor learning, inspiration, and support. In detail, the system starts with the\nunderstanding of the task and artist's visions, follows the co-creation of\nvisual illustrations, audio or music suggestions and files, click-scroll\neffects for interactions, and creative vision conceptualization, and finally\nsynthesizes these facets into a polished interactive digital portfolio.\n","authors":["Tao Long","Weirui Peng"],"pdf_url":"https://arxiv.org/pdf/2311.14091v1.pdf","comment":"3 pages, 1 figure, work in progress"},{"id":"http://arxiv.org/abs/2311.13954v1","updated":"2023-11-23T12:09:49Z","published":"2023-11-23T12:09:49Z","title":"Electric Network Frequency Optical Sensing Devices","summary":" Electric Network Frequency (ENF) acts as a fingerprint in multimedia\nforensics applications. In indoor environments, ENF variations affect the\nintensity of light sources connected to power mains. Accordingly, the light\nintensity variations captured by sensing devices can be exploited to estimate\nthe ENF. A first optical sensing device based on a photodiode is developed for\ncapturing ENF variations in indoor lighting environments. In addition, a device\nthat captures the ENF directly from power mains is implemented. This device\nserves as a ground truth ENF collector. Video recordings captured by a camera\nare also employed to estimate the ENF. The camera serves as a second optical\nsensor. The factors affecting the ENF estimation are thoroughly studied. The\nmaximum correlation coefficient between the ENF estimated by the two optical\nsensors and that estimated directly from power mains is used to measure the\nestimation accuracy. The paper's major contribution is in the disclosure of\nextensive experimental evidence on ENF estimation in scenes ranging from static\nones capturing a white wall to non-static ones, including human activity.\n","authors":["Christos Moysiadis","Georgios Karantaidis","Constantine Kotropoulos"],"pdf_url":"https://arxiv.org/pdf/2311.13954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13946v1","updated":"2023-11-23T11:55:40Z","published":"2023-11-23T11:55:40Z","title":"Weakly-Supervised Video Moment Retrieval via Regularized Two-Branch\n Proposal Networks with Erasing Mechanism","summary":" Video moment retrieval is to identify the target moment according to the\ngiven sentence in an untrimmed video. Due to temporal boundary annotations of\nthe video are extremely time-consuming to acquire, modeling in the\nweakly-supervised setting is increasingly focused, where we only have access to\nthe video-sentence pairs during training. Most existing weakly-supervised\nmethods adopt a MIL-based framework to develop inter-sample confrontment, but\nneglect the intra-sample confrontment between moments with similar semantics.\nTherefore, these methods fail to distinguish the correct moment from plausible\nnegative moments. Further, the previous attention models in cross-modal\ninteraction tend to focus on a few dominant words exorbitantly, ignoring the\ncomprehensive video-sentence correspondence. In this paper, we propose a novel\nRegularized Two-Branch Proposal Network with Erasing Mechanism to consider the\ninter-sample and intra-sample confrontments simultaneously. Concretely, we\nfirst devise a language-aware visual filter to generate both enhanced and\nsuppressed video streams. 
Then, we design the sharable two-branch proposal\nmodule to generate positive and plausible negative proposals from the enhanced\nand suppressed branch respectively, contributing to sufficient confrontment.\nBesides, we introduce an attention-guided dynamic erasing mechanism in enhanced\nbranch to discover the complementary video-sentence relation. Moreover, we\napply two types of proposal regularization to stabilize the training process\nand improve model performance. The extensive experiments on ActivityCaption,\nCharades-STA and DiDeMo datasets show the effectiveness of our method.\n","authors":["Haoyuan Li","Zhou Zhao","Zhu Zhang","Zhijie Lin"],"pdf_url":"https://arxiv.org/pdf/2311.13946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13770v1","updated":"2023-11-23T01:53:02Z","published":"2023-11-23T01:53:02Z","title":"Archiving Body Movements: Collective Generation of Chinese Calligraphy","summary":" As a communication channel, body movements have been widely explored in\nbehavioral studies and kinesics. Performing and visual arts share the same\ninterests but focus on documenting and representing human body movements, such\nas for dance notation and visual work creation. This paper investigates body\nmovements in oriental calligraphy and how to apply calligraphy principles to\nstimulate and archive body movements. Through an artwork (Wushu), the authors\nexperiment with an interactive and generative approach to engage the audience's\nbodily participation and archive the body movements as a compendium of\ngenerated calligraphy. The audience assumes the role of both writers and\nreaders; creating (\"writing\") and appreciating (\"reading\") the generated\ncalligraphy becomes a cyclical process within this infinite \"Book,\" which can\nmotivate further attention and discussions concerning Chinese characters and\ncalligraphy.\n","authors":["Aven Le Zhou","Jiayi Ye","Tianchen Liu","Kang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.13770v1.pdf","comment":"8 pages, 8 figures"}]},"2023-11-27T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.16101v1","updated":"2023-11-27T18:59:42Z","published":"2023-11-27T18:59:42Z","title":"How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for\n Vision LLMs","summary":" This work focuses on the potential of Vision LLMs (VLLMs) in visual\nreasoning. Different from prior studies, we shift our focus from evaluating\nstandard performance to introducing a comprehensive safety evaluation suite,\ncovering both out-of-distribution (OOD) generalization and adversarial\nrobustness. For the OOD evaluation, we present two novel VQA datasets, each\nwith one variant, designed to test model performance under challenging\nconditions. In exploring adversarial robustness, we propose a straightforward\nattack strategy for misleading VLLMs to produce visual-unrelated responses.\nMoreover, we assess the efficacy of two jailbreaking strategies, targeting\neither the vision or language component of VLLMs. Our evaluation of 21 diverse\nmodels, ranging from open-source VLLMs to GPT-4V, yields interesting\nobservations: 1) Current VLLMs struggle with OOD texts but not images, unless\nthe visual information is limited; and 2) These VLLMs can be easily misled by\ndeceiving vision encoders only, and their vision-language training often\ncompromise safety protocols. 
We release this safety evaluation suite at\nhttps://github.com/UCSC-VLAA/vllm-safety-benchmark.\n","authors":["Haoqin Tu","Chenhang Cui","Zijun Wang","Yiyang Zhou","Bingchen Zhao","Junlin Han","Wangchunshu Zhou","Huaxiu Yao","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2311.16101v1.pdf","comment":"H.T., C.C., and Z.W. contribute equally. Work done during H.T. and\n Z.W.'s internship at UCSC, and C.C. and Y.Z.'s internship at UNC"},{"id":"http://arxiv.org/abs/2311.16087v1","updated":"2023-11-27T18:56:14Z","published":"2023-11-27T18:56:14Z","title":"DUnE: Dataset for Unified Editing","summary":" Even the most advanced language models remain susceptible to errors\nnecessitating to modify these models without initiating a comprehensive\nretraining process. Model editing refers to the modification of a model's\nknowledge or representations in a manner that produces the desired outcomes.\nPrior research primarily centered around editing factual data e.g. \"Messi plays\nfor Inter Miami\" confining the definition of an edit to a knowledge triplet\ni.e. (subject, object, relation). However, as the applications of language\nmodels expand, so do the diverse ways in which we wish to edit and refine their\noutputs. In this study, we broaden the scope of the editing problem to include\nan array of editing cases such as debiasing and rectifying reasoning errors and\ndefine an edit as any natural language expression that solicits a change in the\nmodel's outputs. We are introducing DUnE-an editing benchmark where edits are\nnatural language sentences and propose that DUnE presents a challenging yet\nrelevant task. To substantiate this claim, we conduct an extensive series of\nexperiments testing various editing approaches to address DUnE, demonstrating\ntheir respective strengths and weaknesses. We show that retrieval-augmented\nlanguage modeling can outperform specialized editing techniques and neither set\nof approaches has fully solved the generalized editing problem covered by our\nbenchmark.\n","authors":["Afra Feyza Akyürek","Eric Pan","Garry Kuwanto","Derry Wijaya"],"pdf_url":"https://arxiv.org/pdf/2311.16087v1.pdf","comment":"Accepted at EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.16083v1","updated":"2023-11-27T18:53:31Z","published":"2023-11-27T18:53:31Z","title":"BERT Goes Off-Topic: Investigating the Domain Transfer Challenge using\n Genre Classification","summary":" While performance of many text classification tasks has been recently\nimproved due to Pre-trained Language Models (PLMs), in this paper we show that\nthey still suffer from a performance gap when the underlying distribution of\ntopics changes. For example, a genre classifier trained on \\textit{political}\ntopics often fails when tested on documents about \\textit{sport} or\n\\textit{medicine}. In this work, we quantify this phenomenon empirically with a\nlarge corpus and a large set of topics. Consequently, we verify that domain\ntransfer remains challenging both for classic PLMs, such as BERT, and for\nmodern large models, such as GPT-3. We also suggest and successfully test a\npossible remedy: after augmenting the training dataset with\ntopically-controlled synthetic texts, the F1 score improves by up to 50\\% for\nsome topics, nearing on-topic training results, while others show little to no\nimprovement. While our empirical results focus on genre classification, our\nmethodology is applicable to other classification tasks such as gender,\nauthorship, or sentiment classification. 
The code and data to replicate the\nexperiments are available at https://github.com/dminus1/genre\n","authors":["Dmitri Roussinov","Serge Sharoff"],"pdf_url":"https://arxiv.org/pdf/2311.16083v1.pdf","comment":"Published at EMNLP'2023"},{"id":"http://arxiv.org/abs/2311.16079v1","updated":"2023-11-27T18:49:43Z","published":"2023-11-27T18:49:43Z","title":"MEDITRON-70B: Scaling Medical Pretraining for Large Language Models","summary":" Large language models (LLMs) can potentially democratize access to medical\nknowledge. While many efforts have been made to harness and improve LLMs'\nmedical knowledge and reasoning capacities, the resulting models are either\nclosed-source (e.g., PaLM, GPT-4) or limited in scale (<= 13B parameters),\nwhich restricts their abilities. In this work, we improve access to large-scale\nmedical LLMs by releasing MEDITRON: a suite of open-source LLMs with 7B and 70B\nparameters adapted to the medical domain. MEDITRON builds on Llama-2 (through\nour adaptation of Nvidia's Megatron-LM distributed trainer), and extends\npretraining on a comprehensively curated medical corpus, including selected\nPubMed articles, abstracts, and internationally-recognized medical guidelines.\nEvaluations using four major medical benchmarks show significant performance\ngains over several state-of-the-art baselines before and after task-specific\nfinetuning. Overall, MEDITRON achieves a 6% absolute performance gain over the\nbest public baseline in its parameter class and 3% over the strongest baseline\nwe finetuned from Llama-2. Compared to closed-source LLMs, MEDITRON-70B\noutperforms GPT-3.5 and Med-PaLM and is within 5% of GPT-4 and 10% of\nMed-PaLM-2. We release our code for curating the medical pretraining corpus and\nthe MEDITRON model weights to drive open-source development of more capable\nmedical LLMs.\n","authors":["Zeming Chen","Alejandro Hernández Cano","Angelika Romanou","Antoine Bonnet","Kyle Matoba","Francesco Salvi","Matteo Pagliardini","Simin Fan","Andreas Köpf","Amirkeivan Mohtashami","Alexandre Sallinen","Alireza Sakhaeirad","Vinitra Swamy","Igor Krawczuk","Deniz Bayazit","Axel Marmet","Syrielle Montariol","Mary-Anne Hartley","Martin Jaggi","Antoine Bosselut"],"pdf_url":"https://arxiv.org/pdf/2311.16079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16075v1","updated":"2023-11-27T18:46:17Z","published":"2023-11-27T18:46:17Z","title":"BioLORD-2023: Semantic Textual Representations Fusing LLM and Clinical\n Knowledge Graph Insights","summary":" In this study, we investigate the potential of Large Language Models to\ncomplement biomedical knowledge graphs in the training of semantic models for\nthe biomedical and clinical domains. Drawing on the wealth of the UMLS\nknowledge graph and harnessing cutting-edge Large Language Models, we propose a\nnew state-of-the-art approach for obtaining high-fidelity representations of\nbiomedical concepts and sentences, consisting of three steps: an improved\ncontrastive learning phase, a novel self-distillation phase, and a weight\naveraging phase. Through rigorous evaluations via the extensive BioLORD testing\nsuite and diverse downstream tasks, we demonstrate consistent and substantial\nperformance improvements over the previous state of the art (e.g. +2pts on\nMedSTS, +2.5pts on MedNLI-S, +6.1pts on EHR-Rel-B). Besides our new\nstate-of-the-art biomedical model for English, we also distill and release a\nmultilingual model compatible with 50+ languages and finetuned on 7 European\nlanguages. 
Many clinical pipelines can benefit from our latest models. Our new\nmultilingual model enables a range of languages to benefit from our\nadvancements in biomedical semantic representation learning, opening a new\navenue for bioinformatics researchers around the world. As a result, we hope to\nsee BioLORD-2023 becoming a precious tool for future biomedical applications.\n","authors":["François Remy","Kris Demuynck","Thomas Demeester"],"pdf_url":"https://arxiv.org/pdf/2311.16075v1.pdf","comment":"Preprint of upcoming journal article"},{"id":"http://arxiv.org/abs/2107.10021v2","updated":"2023-11-27T18:09:19Z","published":"2021-07-21T11:31:57Z","title":"Neuradicon: operational representation learning of neuroimaging reports","summary":" Radiological reports typically summarize the content and interpretation of\nimaging studies in unstructured form that precludes quantitative analysis. This\nlimits the monitoring of radiological services to throughput undifferentiated\nby content, impeding specific, targeted operational optimization. Here we\npresent Neuradicon, a natural language processing (NLP) framework for\nquantitative analysis of neuroradiological reports. Our framework is a hybrid\nof rule-based and artificial intelligence models to represent neurological\nreports in succinct, quantitative form optimally suited to operational\nguidance. We demonstrate the application of Neuradicon to operational\nphenotyping of a corpus of 336,569 reports, and report excellent\ngeneralizability across time and two independent healthcare institutions.\n","authors":["Henry Watkins","Robert Gray","Adam Julius","Yee-Haur Mah","Walter H. L. Pinaya","Paul Wright","Ashwani Jha","Holger Engleitner","Jorge Cardoso","Sebastien Ourselin","Geraint Rees","Rolf Jaeger","Parashkev Nachev"],"pdf_url":"https://arxiv.org/pdf/2107.10021v2.pdf","comment":"26 pages, 11 figures"},{"id":"http://arxiv.org/abs/2308.14306v3","updated":"2023-11-27T17:43:20Z","published":"2023-08-28T04:57:07Z","title":"Evaluating the Robustness to Instructions of Large Language Models","summary":" Recently, Instruction fine-tuning has risen to prominence as a potential\nmethod for enhancing the zero-shot capabilities of Large Language Models (LLMs)\non novel tasks. This technique has shown an exceptional ability to boost the\nperformance of moderately sized LLMs, sometimes even reaching performance\nlevels comparable to those of much larger model variants. The focus is on the\nrobustness of instruction-tuned LLMs to seen and unseen tasks. We conducted an\nexploration of six models including Alpaca, Vicuna, WizardLM, and Traditional\nTask-oriented Models(Flan-T5-XL/XXL, T0++) using real-world relation extraction\ndatasets as case studies. We carried out a comprehensive evaluation of these\ninstruction-following LLMs which have been tuned based on open-domain\ninstructions and task-oriented instructions. The main discussion is their\nperformance and robustness towards instructions. We have observed that in most\ncases, the model's performance in dealing with unfamiliar instructions tends to\nworsen significantly, and the robustness of the model for RE instructions\ndeteriorates compared to QA. Further, we discovered that up until a certain\nparameter size threshold (3B), the performance of the FLAN-T5 model improves as\nthe parameter count increases. 
The robustness of different scales of FLAN-T5\nmodels to RE instruction is worse than the robustness to QA instruction.\n","authors":["Yuansheng Ni","Sichao Jiang","Xinyu wu","Hui Shen","Yuli Zhou"],"pdf_url":"https://arxiv.org/pdf/2308.14306v3.pdf","comment":"There were major problems with the experimental data"},{"id":"http://arxiv.org/abs/2310.06627v2","updated":"2023-11-27T16:59:39Z","published":"2023-10-10T13:45:59Z","title":"What If the TV Was Off? Examining Counterfactual Reasoning Abilities of\n Multi-modal Language Models","summary":" Counterfactual reasoning, a fundamental aspect of human cognition, involves\ncontemplating alternatives to established facts or past events, significantly\nenhancing our abilities in planning and decision-making. In light of the\nadvancements in current multi-modal large language models, we explore their\neffectiveness in counterfactual reasoning. To facilitate this investigation, we\nintroduce a novel dataset, C-VQA, specifically designed to test the\ncounterfactual reasoning capabilities of modern multi-modal large language\nmodels. This dataset is constructed by infusing original questions with\ncounterfactual presuppositions, spanning various types such as numerical and\nboolean queries. It encompasses a mix of real and synthetic data, representing\na wide range of difficulty levels. Our thorough evaluations of contemporary\nvision-language models using this dataset have revealed substantial performance\ndrops, with some models showing up to a 40\\% decrease, highlighting a\nsignificant gap between current models and human-like vision reasoning\ncapabilities. We hope our dataset will serve as a vital benchmark for\nevaluating the counterfactual reasoning capabilities of models. Code and\ndataset are publicly available at https://bzhao.me/C-VQA/.\n","authors":["Letian Zhang","Xiaotong Zhai","Zhongkai Zhao","Yongshuo Zong","Xin Wen","Bingchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.06627v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14353v2","updated":"2023-11-27T16:55:29Z","published":"2023-11-24T08:53:52Z","title":"Average Token Delay: A Duration-aware Latency Metric for Simultaneous\n Translation","summary":" Simultaneous translation is a task in which the translation begins before the\nend of an input speech segment. Its evaluation should be conducted based on\nlatency in addition to quality, and for users, the smallest possible amount of\nlatency is preferable. Most existing metrics measure latency based on the start\ntimings of partial translations and ignore their duration. This means such\nmetrics do not penalize the latency caused by long translation output, which\ndelays the comprehension of users and subsequent translations. In this work, we\npropose a novel latency evaluation metric for simultaneous translation called\n\\emph{Average Token Delay} (ATD) that focuses on the duration of partial\ntranslations. We demonstrate its effectiveness through analyses simulating\nuser-side latency based on Ear-Voice Span (EVS). 
In our experiment, ATD had the\nhighest correlation with EVS among baseline latency metrics under most\nconditions.\n","authors":["Yasumasa Kano","Katsuhito Sudoh","Satoshi Nakamura"],"pdf_url":"https://arxiv.org/pdf/2311.14353v2.pdf","comment":"Extended version of the paper (doi: 10.21437/Interspeech.2023-933)\n which appeared in INTERSPEECH 2023"},{"id":"http://arxiv.org/abs/2311.15983v1","updated":"2023-11-27T16:28:20Z","published":"2023-11-27T16:28:20Z","title":"Sparsify-then-Classify: From Internal Neurons of Large Language Models\n To Efficient Text Classifiers","summary":" Among the many tasks that Large Language Models (LLMs) have revolutionized is\ntext classification. However, existing approaches for applying pretrained LLMs\nto text classification predominantly rely on using single token outputs from\nonly the last layer of hidden states. As a result, they suffer from limitations\nin efficiency, task-specificity, and interpretability. In our work, we\ncontribute an approach that uses all internal representations by employing\nmultiple pooling strategies on all activation and hidden states. Our novel\nlightweight strategy, Sparsify-then-Classify (STC) first sparsifies\ntask-specific features layer-by-layer, then aggregates across layers for text\nclassification. STC can be applied as a seamless plug-and-play module on top of\nexisting LLMs. Our experiments on a comprehensive set of models and datasets\ndemonstrate that STC not only consistently improves the classification\nperformance of pretrained and fine-tuned models, but is also more efficient for\nboth training and inference, and is more intrinsically interpretable.\n","authors":["Yilun Liu","Difan Jiao","Ashton Anderson"],"pdf_url":"https://arxiv.org/pdf/2311.15983v1.pdf","comment":"23 pages, 5 figures, 8 tables Code available at\n https://github.com/difanj0713/Sparsify-then-Classify"},{"id":"http://arxiv.org/abs/2311.15964v1","updated":"2023-11-27T16:07:37Z","published":"2023-11-27T16:07:37Z","title":"Efficient Pre-training for Localized Instruction Generation of Videos","summary":" Procedural videos show step-by-step demonstrations of tasks like recipe\npreparation. Understanding such videos is challenging, involving the precise\nlocalization of steps and the generation of textual instructions. Manually\nannotating steps and writing instructions is costly, which limits the size of\ncurrent datasets and hinders effective learning. Leveraging large but noisy\nvideo-transcript datasets for pre-training can boost performance, but demands\nsignificant computational resources. Furthermore, transcripts contain\nirrelevant content and exhibit style variation compared to instructions written\nby human annotators. To mitigate both issues, we propose a technique,\nSieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters\nirrelevant transcripts and (ii) Swap enhances the quality of the text\ninstruction by automatically replacing the transcripts with human-written\ninstructions from a text-only recipe dataset. The curated dataset, three orders\nof magnitude smaller than current web-scale datasets, enables efficient\ntraining of large-scale models with competitive performance. We complement our\nSieve-\\&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step\nlocalization and instruction generation for procedural videos. 
When this model\nis pre-trained on our curated dataset, it achieves state-of-the-art performance\nin zero-shot and finetuning settings on YouCook2 and Tasty, while using a\nfraction of the computational resources.\n","authors":["Anil Batra","Davide Moltisanti","Laura Sevilla-Lara","Marcus Rohrbach","Frank Keller"],"pdf_url":"https://arxiv.org/pdf/2311.15964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15954v1","updated":"2023-11-27T15:58:28Z","published":"2023-11-27T15:58:28Z","title":"A Quantitative Approach to Understand Self-Supervised Models as\n Cross-lingual Feature Extractors","summary":" In this work, we study the features extracted by English self-supervised\nlearning (SSL) models in cross-lingual contexts and propose a new metric to\npredict the quality of feature representations. Using automatic speech\nrecognition (ASR) as a downstream task, we analyze the effect of model size,\ntraining objectives, and model architecture on the models' performance as a\nfeature extractor for a set of topologically diverse corpora. We develop a\nnovel metric, the Phonetic-Syntax Ratio (PSR), to measure the phonetic and\nsynthetic information in the extracted representations using deep generalized\ncanonical correlation analysis. Results show the contrastive loss in the\nwav2vec2.0 objective facilitates more effective cross-lingual feature\nextraction. There is a positive correlation between PSR scores and ASR\nperformance, suggesting that phonetic information extracted by monolingual SSL\nmodels can be used for downstream tasks in cross-lingual settings. The proposed\nmetric is an effective indicator of the quality of the representations and can\nbe useful for model selection.\n","authors":["Shuyue Stella Li","Beining Xu","Xiangyu Zhang","Hexin Liu","Wenhan Chao","Leibny Paola Garcia"],"pdf_url":"https://arxiv.org/pdf/2311.15954v1.pdf","comment":"12 pages, 5 figures, 4 tables"},{"id":"http://arxiv.org/abs/2311.15946v1","updated":"2023-11-27T15:53:11Z","published":"2023-11-27T15:53:11Z","title":"Leveraging deep active learning to identify low-resource mobility\n functioning information in public clinical notes","summary":" Function is increasingly recognized as an important indicator of whole-person\nhealth, although it receives little attention in clinical natural language\nprocessing research. We introduce the first public annotated dataset\nspecifically on the Mobility domain of the International Classification of\nFunctioning, Disability and Health (ICF), aiming to facilitate automatic\nextraction and analysis of functioning information from free-text clinical\nnotes. We utilize the National NLP Clinical Challenges (n2c2) research dataset\nto construct a pool of candidate sentences using keyword expansion. Our active\nlearning approach, using query-by-committee sampling weighted by density\nrepresentativeness, selects informative sentences for human annotation. We\ntrain BERT and CRF models, and use predictions from these models to guide the\nselection of new sentences for subsequent annotation iterations. Our final\ndataset consists of 4,265 sentences with a total of 11,784 entities, including\n5,511 Action entities, 5,328 Mobility entities, 306 Assistance entities, and\n639 Quantification entities. The inter-annotator agreement (IAA), averaged over\nall entity types, is 0.72 for exact matching and 0.91 for partial matching. We\nalso train and evaluate common BERT models and state-of-the-art Nested NER\nmodels. 
The best F1 scores are 0.84 for Action, 0.7 for Mobility, 0.62 for\nAssistance, and 0.71 for Quantification. Empirical results demonstrate\npromising potential of NER models to accurately extract mobility functioning\ninformation from clinical text. The public availability of our annotated\ndataset will facilitate further research to comprehensively capture functioning\ninformation in electronic health records (EHRs).\n","authors":["Tuan-Dung Le","Zhuqi Miao","Samuel Alvarado","Brittany Smith","William Paiva","Thanh Thieu"],"pdf_url":"https://arxiv.org/pdf/2311.15946v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15941v1","updated":"2023-11-27T15:49:29Z","published":"2023-11-27T15:49:29Z","title":"Tell2Design: A Dataset for Language-Guided Floor Plan Generation","summary":" We consider the task of generating designs directly from natural language\ndescriptions, and consider floor plan generation as the initial research area.\nLanguage conditional generative models have recently been very successful in\ngenerating high-quality artistic images. However, designs must satisfy\ndifferent constraints that are not present in generating artistic images,\nparticularly spatial and relational constraints. We make multiple contributions\nto initiate research on this task. First, we introduce a novel dataset,\n\\textit{Tell2Design} (T2D), which contains more than $80k$ floor plan designs\nassociated with natural language instructions. Second, we propose a\nSequence-to-Sequence model that can serve as a strong baseline for future\nresearch. Third, we benchmark this task with several text-conditional image\ngeneration models. We conclude by conducting human evaluations on the generated\nsamples and providing an analysis of human performance. We hope our\ncontributions will propel the research on language-guided design generation\nforward.\n","authors":["Sicong Leng","Yang Zhou","Mohammed Haroon Dupty","Wee Sun Lee","Sam Conrad Joyce","Wei Lu"],"pdf_url":"https://arxiv.org/pdf/2311.15941v1.pdf","comment":"Paper published in ACL2023; Area Chair Award; Best Paper Nomination"},{"id":"http://arxiv.org/abs/2311.15930v1","updated":"2023-11-27T15:38:17Z","published":"2023-11-27T15:38:17Z","title":"WorldSense: A Synthetic Benchmark for Grounded Reasoning in Large\n Language Models","summary":" We propose WorldSense, a benchmark designed to assess the extent to which\nLLMs are consistently able to sustain tacit world models, by testing how they\ndraw simple inferences from descriptions of simple arrangements of entities.\nWorldsense is a synthetic benchmark with three problem types, each with their\nown trivial control, which explicitly avoids bias by decorrelating the abstract\nstructure of problems from the vocabulary and expressions, and by decorrelating\nall problem subparts with the correct response. We run our benchmark on three\nstate-of-the-art chat-LLMs (GPT3.5, GPT4 and Llama2-chat) and show that these\nmodels make errors even with as few as three objects. Furthermore, they have\nquite heavy response biases, preferring certain responses irrespective of the\nquestion. Errors persist even with chain-of-thought prompting and in-context\nlearning. 
Lastly, we show that while finetuning on similar problems does result\nin substantial improvements -- within- and out-of-distribution -- the finetuned\nmodels do not generalise beyond a constraint problem space.\n","authors":["Youssef Benchekroun","Megi Dervishi","Mark Ibrahim","Jean-Baptiste Gaya","Xavier Martinet","Grégoire Mialon","Thomas Scialom","Emmanuel Dupoux","Dieuwke Hupkes","Pascal Vincent"],"pdf_url":"https://arxiv.org/pdf/2311.15930v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07590v2","updated":"2023-11-27T15:17:49Z","published":"2023-11-09T17:12:44Z","title":"Technical Report: Large Language Models can Strategically Deceive their\n Users when Put Under Pressure","summary":" We demonstrate a situation in which Large Language Models, trained to be\nhelpful, harmless, and honest, can display misaligned behavior and\nstrategically deceive their users about this behavior without being instructed\nto do so. Concretely, we deploy GPT-4 as an agent in a realistic, simulated\nenvironment, where it assumes the role of an autonomous stock trading agent.\nWithin this environment, the model obtains an insider tip about a lucrative\nstock trade and acts upon it despite knowing that insider trading is\ndisapproved of by company management. When reporting to its manager, the model\nconsistently hides the genuine reasons behind its trading decision. We perform\na brief investigation of how this behavior varies under changes to the setting,\nsuch as removing model access to a reasoning scratchpad, attempting to prevent\nthe misaligned behavior by changing system instructions, changing the amount of\npressure the model is under, varying the perceived risk of getting caught, and\nmaking other simple changes to the environment. To our knowledge, this is the\nfirst demonstration of Large Language Models trained to be helpful, harmless,\nand honest, strategically deceiving their users in a realistic situation\nwithout direct instructions or training for deception.\n","authors":["Jérémy Scheurer","Mikita Balesni","Marius Hobbhahn"],"pdf_url":"https://arxiv.org/pdf/2311.07590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13547v3","updated":"2023-11-27T15:10:00Z","published":"2023-05-22T23:43:23Z","title":"Self-Evolution Learning for Mixup: Enhance Data Augmentation on Few-Shot\n Text Classification Tasks","summary":" Text classification tasks often encounter few shot scenarios with limited\nlabeled data, and addressing data scarcity is crucial. Data augmentation with\nmixup has shown to be effective on various text classification tasks. However,\nmost of the mixup methods do not consider the varying degree of learning\ndifficulty in different stages of training and generate new samples with one\nhot labels, resulting in the model over confidence. In this paper, we propose a\nself evolution learning (SE) based mixup approach for data augmentation in text\nclassification, which can generate more adaptive and model friendly pesudo\nsamples for the model training. SE focuses on the variation of the model's\nlearning ability. To alleviate the model confidence, we introduce a novel\ninstance specific label smoothing approach, which linearly interpolates the\nmodel's output and one hot labels of the original samples to generate new soft\nfor label mixing up. 
Through experimental analysis, in addition to improving\nclassification accuracy, we demonstrate that SE also enhances the model's\ngeneralize ability.\n","authors":["Haoqi Zheng","Qihuang Zhong","Liang Ding","Zhiliang Tian","Xin Niu","Dongsheng Li","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2305.13547v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15896v1","updated":"2023-11-27T15:01:26Z","published":"2023-11-27T15:01:26Z","title":"Data Generation for Post-OCR correction of Cyrillic handwriting","summary":" This paper introduces a novel approach to post-Optical Character Recognition\nCorrection (POC) for handwritten Cyrillic text, addressing a significant gap in\ncurrent research methodologies. This gap is due to the lack of large text\ncorporas that provide OCR errors for further training of language-based POC\nmodels, which are demanding in terms of corpora size. Our study primarily\nfocuses on the development and application of a synthetic handwriting\ngeneration engine based on B\\'ezier curves. Such an engine generates highly\nrealistic handwritten text in any amounts, which we utilize to create a\nsubstantial dataset by transforming Russian text corpora sourced from the\ninternet. We apply a Handwritten Text Recognition (HTR) model to this dataset\nto identify OCR errors, forming the basis for our POC model training. The\ncorrection model is trained on a 90-symbol input context, utilizing a\npre-trained T5 architecture with a seq2seq correction task. We evaluate our\napproach on HWR200 and School_notebooks_RU datasets as they provide significant\nchallenges in the HTR domain. Furthermore, POC can be used to highlight errors\nfor teachers, evaluating student performance. This can be done simply by\ncomparing sentences before and after correction, displaying differences in\ntext. Our primary contribution lies in the innovative use of B\\'ezier curves\nfor Cyrillic text generation and subsequent error correction using a\nspecialized POC model. We validate our approach by presenting Word Accuracy\nRate (WAR) and Character Accuracy Rate (CAR) results, both with and without\npost-OCR correction, using real open corporas of handwritten Cyrillic text.\nThese results, coupled with our methodology, are designed to be reproducible,\npaving the way for further advancements in the field of OCR and handwritten\ntext analysis. Paper contributions can be found in\nhttps://github.com/dbrainio/CyrillicHandwritingPOC\n","authors":["Evgenii Davydkin","Aleksandr Markelov","Egor Iuldashev","Anton Dudkin","Ivan Krivorotov"],"pdf_url":"https://arxiv.org/pdf/2311.15896v1.pdf","comment":"17 pages, 27 figures, 6 tables, 26 references"},{"id":"http://arxiv.org/abs/2307.15176v2","updated":"2023-11-27T14:35:05Z","published":"2023-07-27T20:11:07Z","title":"RCT Rejection Sampling for Causal Estimation Evaluation","summary":" Confounding is a significant obstacle to unbiased estimation of causal\neffects from observational data. For settings with high-dimensional covariates\n-- such as text data, genomics, or the behavioral social sciences --\nresearchers have proposed methods to adjust for confounding by adapting machine\nlearning methods to the goal of causal estimation. However, empirical\nevaluation of these adjustment methods has been challenging and limited. 
In\nthis work, we build on a promising empirical evaluation strategy that\nsimplifies evaluation design and uses real data: subsampling randomized\ncontrolled trials (RCTs) to create confounded observational datasets while\nusing the average causal effects from the RCTs as ground-truth. We contribute a\nnew sampling algorithm, which we call RCT rejection sampling, and provide\ntheoretical guarantees that causal identification holds in the observational\ndata to allow for valid comparisons to the ground-truth RCT. Using synthetic\ndata, we show our algorithm indeed results in low bias when oracle estimators\nare evaluated on the confounded samples, which is not always the case for a\npreviously proposed algorithm. In addition to this identification result, we\nhighlight several finite data considerations for evaluation designers who plan\nto use RCT rejection sampling on their own datasets. As a proof of concept, we\nimplement an example evaluation pipeline and walk through these finite data\nconsiderations with a novel, real-world RCT -- which we release publicly --\nconsisting of approximately 70k observations and text data as high-dimensional\ncovariates. Together, these contributions build towards a broader agenda of\nimproved empirical evaluation for causal estimation.\n","authors":["Katherine A. Keith","Sergey Feldman","David Jurgens","Jonathan Bragg","Rohit Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2307.15176v2.pdf","comment":"Code and data at https://github.com/kakeith/rct_rejection_sampling"},{"id":"http://arxiv.org/abs/2310.14505v2","updated":"2023-11-27T14:23:16Z","published":"2023-10-23T02:32:30Z","title":"Sentiment analysis with adaptive multi-head attention in Transformer","summary":" We propose a novel framework based on the attention mechanism to identify the\nsentiment of a movie review document. Previous efforts on deep neural networks\nwith attention mechanisms focus on encoder and decoder with fixed numbers of\nmulti-head attention. Therefore, we need a mechanism to stop the attention\nprocess automatically if no more useful information can be read from the\nmemory.In this paper, we propose an adaptive multi-head attention architecture\n(AdaptAttn) which varies the number of attention heads based on length of\nsentences. AdaptAttn has a data preprocessing step where each document is\nclassified into any one of the three bins small, medium or large based on\nlength of the sentence. The document classified as small goes through two heads\nin each layer, the medium group passes four heads and the large group is\nprocessed by eight heads. We examine the merit of our model on the Stanford\nlarge movie review dataset. The experimental results show that the F1 score\nfrom our model is on par with the baseline model.\n","authors":["Fanfei Meng","David Demeter"],"pdf_url":"https://arxiv.org/pdf/2310.14505v2.pdf","comment":"Accepted by the 4th International Conference on Signal Processing and\n Machine Learning"},{"id":"http://arxiv.org/abs/2310.19106v3","updated":"2023-11-27T13:46:00Z","published":"2023-10-29T18:43:19Z","title":"PACuna: Automated Fine-Tuning of Language Models for Particle\n Accelerators","summary":" Navigating the landscape of particle accelerators has become increasingly\nchallenging with recent surges in contributions. These intricate devices\nchallenge comprehension, even within individual facilities. 
To address this, we\nintroduce PACuna, a fine-tuned language model refined through publicly\navailable accelerator resources like conferences, pre-prints, and books. We\nautomated data collection and question generation to minimize expert\ninvolvement and make the data publicly available. PACuna demonstrates\nproficiency in addressing intricate accelerator questions, validated by\nexperts. Our approach shows adapting language models to scientific domains by\nfine-tuning technical texts and auto-generated corpora capturing the latest\ndevelopments can further produce pre-trained models to answer some intricate\nquestions that commercially available assistants cannot and can serve as\nintelligent assistants for individual facilities.\n","authors":["Antonin Sulc","Raimund Kammering","Annika Eichler","Tim Wilksen"],"pdf_url":"https://arxiv.org/pdf/2310.19106v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15786v1","updated":"2023-11-27T13:01:59Z","published":"2023-11-27T13:01:59Z","title":"YUAN 2.0: A Large Language Model with Localized Filtering-based\n Attention","summary":" In this work, the Localized Filtering-based Attention (LFA) is introduced to\nincorporate prior knowledge of local dependencies of natural language into\nAttention. Based on LFA, we develop and release Yuan 2.0, a large language\nmodel with parameters ranging from 2.1 billion to 102.6 billion. A data\nfiltering and generation method is presented to build pretraining and\nfine-tuning dataset in high quality. A distributed training method with\nnon-uniform pipeline parallel, data parallel, and optimizer parallel is\nproposed, which greatly reduces the bandwidth requirements of intra-node\ncommunication, and achieves good performance in large-scale distributed\ntraining. Yuan 2.0 models display impressive ability in code generation, math\nproblem-solving, and chat compared with existing models. The latest version of\nYUAN 2.0, including model weights and source code, is accessible at Github.\n","authors":["Shaohua Wu","Xudong Zhao","Shenling Wang","Jiangang Luo","Lingjun Li","Xi Chen","Bing Zhao","Wei Wang","Tong Yu","Rongguo Zhang","Jiahua Zhang","Chao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15781v1","updated":"2023-11-27T12:54:47Z","published":"2023-11-27T12:54:47Z","title":"Increasing Coverage and Precision of Textual Information in Multilingual\n Knowledge Graphs","summary":" Recent work in Natural Language Processing and Computer Vision has been using\ntextual information -- e.g., entity names and descriptions -- available in\nknowledge graphs to ground neural models to high-quality structured data.\nHowever, when it comes to non-English languages, the quantity and quality of\ntextual information are comparatively scarce. To address this issue, we\nintroduce the novel task of automatic Knowledge Graph Enhancement (KGE) and\nperform a thorough investigation on bridging the gap in both the quantity and\nquality of textual information between English and non-English languages. 
More\nspecifically, we: i) bring to light the problem of increasing multilingual\ncoverage and precision of entity names and descriptions in Wikidata; ii)\ndemonstrate that state-of-the-art methods, namely, Machine Translation (MT),\nWeb Search (WS), and Large Language Models (LLMs), struggle with this task;\niii) present M-NTA, a novel unsupervised approach that combines MT, WS, and\nLLMs to generate high-quality textual information; and, iv) study the impact of\nincreasing multilingual coverage and precision of non-English textual\ninformation in Entity Linking, Knowledge Graph Completion, and Question\nAnswering. As part of our effort towards better multilingual knowledge graphs,\nwe also introduce WikiKGE-10, the first human-curated benchmark to evaluate KGE\napproaches in 10 languages across 7 language families.\n","authors":["Simone Conia","Min Li","Daniel Lee","Umar Farooq Minhas","Ihab Ilyas","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2311.15781v1.pdf","comment":"Camera ready for EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.15766v1","updated":"2023-11-27T12:37:51Z","published":"2023-11-27T12:37:51Z","title":"Knowledge Unlearning for LLMs: Tasks, Methods, and Challenges","summary":" In recent years, large language models (LLMs) have spurred a new research\nparadigm in natural language processing. Despite their excellent capability in\nknowledge-based question answering and reasoning, their potential to retain\nfaulty or even harmful knowledge poses risks of malicious application. The\nchallenge of mitigating this issue and transforming these models into purer\nassistants is crucial for their widespread applicability. Unfortunately,\nRetraining LLMs repeatedly to eliminate undesirable knowledge is impractical\ndue to their immense parameters. Knowledge unlearning, derived from analogous\nstudies on machine unlearning, presents a promising avenue to address this\nconcern and is notably advantageous in the context of LLMs. It allows for the\nremoval of harmful knowledge in an efficient manner, without affecting\nunrelated knowledge in the model. To this end, we provide a survey of knowledge\nunlearning in the era of LLMs. Firstly, we formally define the knowledge\nunlearning problem and distinguish it from related works. Subsequently, we\ncategorize existing knowledge unlearning methods into three classes: those\nbased on parameter optimization, parameter merging, and in-context learning,\nand introduce details of these unlearning methods. We further present\nevaluation datasets used in existing methods, and finally conclude this survey\nby presenting the ongoing challenges and future directions.\n","authors":["Nianwen Si","Hao Zhang","Heyu Chang","Wenlin Zhang","Dan Qu","Weiqiang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15766v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2311.15759v1","updated":"2023-11-27T12:29:20Z","published":"2023-11-27T12:29:20Z","title":"Towards Vision Enhancing LLMs: Empowering Multimodal Knowledge Storage\n and Sharing in LLMs","summary":" Recent advancements in multimodal large language models (MLLMs) have achieved\nsignificant multimodal generation capabilities, akin to GPT-4. These models\npredominantly map visual information into language representation space,\nleveraging the vast knowledge and powerful text generation abilities of LLMs to\nproduce multimodal instruction-following responses. 
We could term this method\nas LLMs for Vision because of its employing LLMs for visual-language\nunderstanding, yet observe that these MLLMs neglect the potential of harnessing\nvisual knowledge to enhance overall capabilities of LLMs, which could be\nregraded as Vision Enhancing LLMs. In this paper, we propose an approach called\nMKS2, aimed at enhancing LLMs through empowering Multimodal Knowledge Storage\nand Sharing in LLMs. Specifically, we introduce the Modular Visual Memory, a\ncomponent integrated into the internal blocks of LLMs, designed to store\nopen-world visual information efficiently. Additionally, we present a soft\nMixtures-of-Multimodal Experts architecture in LLMs to invoke multimodal\nknowledge collaboration during generation. Our comprehensive experiments\ndemonstrate that MKS2 substantially augments the reasoning capabilities of LLMs\nin contexts necessitating physical or commonsense knowledge. It also delivers\ncompetitive results on multimodal benchmarks.\n","authors":["Yunxin Li","Baotian Hu","Wei Wang","Xiaochun Cao","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15759v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2311.15723v1","updated":"2023-11-27T11:17:29Z","published":"2023-11-27T11:17:29Z","title":"Italian Crossword Generator: Enhancing Education through Interactive\n Word Puzzles","summary":" Educational crosswords offer numerous benefits for students, including\nincreased engagement, improved understanding, critical thinking, and memory\nretention. Creating high-quality educational crosswords can be challenging, but\nrecent advances in natural language processing and machine learning have made\nit possible to use language models to generate nice wordplays. The exploitation\nof cutting-edge language models like GPT3-DaVinci, GPT3-Curie, GPT3-Babbage,\nGPT3-Ada, and BERT-uncased has led to the development of a comprehensive system\nfor generating and verifying crossword clues. A large dataset of clue-answer\npairs was compiled to fine-tune the models in a supervised manner to generate\noriginal and challenging clues from a given keyword. On the other hand, for\ngenerating crossword clues from a given text, Zero/Few-shot learning techniques\nwere used to extract clues from the input text, adding variety and creativity\nto the puzzles. We employed the fine-tuned model to generate data and labeled\nthe acceptability of clue-answer parts with human supervision. To ensure\nquality, we developed a classifier by fine-tuning existing language models on\nthe labeled dataset. Conversely, to assess the quality of clues generated from\nthe given text using zero/few-shot learning, we employed a zero-shot learning\napproach to check the quality of generated clues. 
The results of the evaluation\nhave been very promising, demonstrating the effectiveness of the approach in\ncreating high-standard educational crosswords that offer students engaging and\nrewarding learning experiences.\n","authors":["Kamyar Zeinalipour","Tommaso laquinta","Asya Zanollo","Giovanni Angelini","Leonardo Rigutini","Marco Maggini","Marco Gori"],"pdf_url":"https://arxiv.org/pdf/2311.15723v1.pdf","comment":"Accepted Paper for CLiC-it 2023 - 9th Italian Conference on\n Computational Linguistics"},{"id":"http://arxiv.org/abs/2311.15716v1","updated":"2023-11-27T10:59:16Z","published":"2023-11-27T10:59:16Z","title":"Justifiable Artificial Intelligence: Engineering Large Language Models\n for Legal Applications","summary":" In this work, I discuss how Large Language Models can be applied in the legal\ndomain, circumventing their current drawbacks. Despite their large success and\nacceptance, their lack of explainability hinders legal experts to trust in\ntheir output, and this happens rightfully so. However, in this paper, I argue\nin favor of a new view, Justifiable Artificial Intelligence, instead of\nfocusing on Explainable Artificial Intelligence. I discuss in this paper how\ngaining evidence for and against a Large Language Model's output may make their\ngenerated texts more trustworthy - or hold them accountable for misinformation.\n","authors":["Sabine Wehnert"],"pdf_url":"https://arxiv.org/pdf/2311.15716v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2310.01825v2","updated":"2023-11-27T10:39:13Z","published":"2023-10-03T06:42:28Z","title":"Empirical Study of PEFT techniques for Winter Wheat Segmentation","summary":" Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced\nsignificant growth and have been extensively employed to adapt large vision and\nlanguage models to various domains, enabling satisfactory model performance\nwith minimal computational needs. Despite these advances, more research has yet\nto delve into potential PEFT applications in real-life scenarios, particularly\nin the critical domains of remote sensing and crop monitoring. The diversity of\nclimates across different regions and the need for comprehensive large-scale\ndatasets have posed significant obstacles to accurately identify crop types\nacross varying geographic locations and changing growing seasons. This study\nseeks to bridge this gap by comprehensively exploring the feasibility of\ncross-area and cross-year out-of-distribution generalization using the\nState-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to\nexplore PEFT approaches for crop monitoring. Specifically, we focus on adapting\nthe SOTA TSViT model to address winter wheat field segmentation, a critical\ntask for crop monitoring and food security. This adaptation process involves\nintegrating different PEFT techniques, including BigFit, LoRA, Adaptformer, and\nprompt tuning. Using PEFT techniques, we achieved notable results comparable to\nthose achieved using full fine-tuning methods while training only a mere 0.7%\nparameters of the whole TSViT architecture. The in-house labeled data-set,\nreferred to as the Beqaa-Lebanon dataset, comprises high-quality annotated\npolygons for wheat and non-wheat classes with a total surface of 170 kmsq, over\nfive consecutive years. Using Sentinel-2 images, our model achieved a 84%\nF1-score. 
We intend to publicly release the Lebanese winter wheat data set,\ncode repository, and model weights.\n","authors":["Mohamad Hasan Zahweh","Hasan Nasrallah","Mustafa Shukor","Ghaleb Faour","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15698v1","updated":"2023-11-27T10:34:55Z","published":"2023-11-27T10:34:55Z","title":"Cerbero-7B: A Leap Forward in Language-Specific LLMs Through Enhanced\n Chat Corpus Generation and Evaluation","summary":" This study introduces a novel approach for generating high-quality,\nlanguage-specific chat corpora using a self-chat mechanism. We combine a\ngenerator LLM for creating new samples and an embedder LLM to ensure diversity.\nA new Masked Language Modelling (MLM) model-based quality assessment metric is\nproposed for evaluating and filtering the corpora. Utilizing the llama2-70b as\nthe generator and a multilingual sentence transformer as embedder, we generate\nan Italian chat corpus and refine the Fauno corpus, which is based on\ntranslated English ChatGPT self-chat data. The refinement uses structural\nassertions and Natural Language Processing techniques. Both corpora undergo a\ncomprehensive quality evaluation using the proposed MLM model-based quality\nmetric. The Italian LLM fine-tuned with these corpora demonstrates\nsignificantly enhanced language comprehension and question-answering skills.\nThe resultant model, cerbero-7b, establishes a new state-of-the-art for Italian\nLLMs. This approach marks a substantial advancement in the development of\nlanguage-specific LLMs, with a special emphasis on augmenting corpora for\nunderrepresented languages like Italian.\n","authors":["Federico A. Galatolo","Mario G. C. A. Cimino"],"pdf_url":"https://arxiv.org/pdf/2311.15698v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06422v2","updated":"2023-11-27T10:18:36Z","published":"2023-10-10T08:46:10Z","title":"Large Language Models for Propaganda Detection","summary":" The prevalence of propaganda in our digital society poses a challenge to\nsocietal harmony and the dissemination of truth. Detecting propaganda through\nNLP in text is challenging due to subtle manipulation techniques and contextual\ndependencies. To address this issue, we investigate the effectiveness of modern\nLarge Language Models (LLMs) such as GPT-3 and GPT-4 for propaganda detection.\nWe conduct experiments using the SemEval-2020 task 11 dataset, which features\nnews articles labeled with 14 propaganda techniques as a multi-label\nclassification problem. Five variations of GPT-3 and GPT-4 are employed,\nincorporating various prompt engineering and fine-tuning strategies across the\ndifferent models. We evaluate the models' performance by assessing metrics such\nas $F1$ score, $Precision$, and $Recall$, comparing the results with the\ncurrent state-of-the-art approach using RoBERTa. Our findings demonstrate that\nGPT-4 achieves comparable results to the current state-of-the-art. 
Further,\nthis study analyzes the potential and challenges of LLMs in complex tasks like\npropaganda detection.\n","authors":["Kilian Sprenkamp","Daniel Gordon Jones","Liudmila Zavolokina"],"pdf_url":"https://arxiv.org/pdf/2310.06422v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15653v1","updated":"2023-11-27T09:33:13Z","published":"2023-11-27T09:33:13Z","title":"MoDS: Model-oriented Data Selection for Instruction Tuning","summary":" Instruction tuning has become the de facto method to equip large language\nmodels (LLMs) with the ability of following user instructions. Usually,\nhundreds of thousands or millions of instruction-following pairs are employed\nto fine-tune the foundation LLMs. Recently, some studies show that a small\nnumber of high-quality instruction data is enough. However, how to select\nappropriate instruction data for a given LLM is still an open problem. To\naddress this problem, in this paper we present a model-oriented data selection\n(MoDS) approach, which selects instruction data based on a new criteria\nconsidering three aspects: quality, coverage and necessity. First, our approach\nutilizes a quality evaluation model to filter out the high-quality subset from\nthe original instruction dataset, and then designs an algorithm to further\nselect from the high-quality subset a seed instruction dataset with good\ncoverage. The seed dataset is applied to fine-tune the foundation LLM to obtain\nan initial instruction-following LLM. Finally, we develop a necessity\nevaluation model to find out the instruction data which are performed badly in\nthe initial instruction-following LLM and consider them necessary instructions\nto further improve the LLMs. In this way, we can get a small high-quality,\nbroad-coverage and high-necessity subset from the original instruction\ndatasets. Experimental results show that, the model fine-tuned with 4,000\ninstruction pairs selected by our approach could perform better than the model\nfine-tuned with the full original dataset which includes 214k instruction data.\n","authors":["Qianlong Du","Chengqing Zong","Jiajun Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15648v1","updated":"2023-11-27T09:20:12Z","published":"2023-11-27T09:20:12Z","title":"Reinforcement Learning from Diffusion Feedback: Q* for Image Search","summary":" Large vision-language models are steadily gaining personalization\ncapabilities at the cost of fine-tuning or data augmentation. We present two\nmodels for image generation using model-agnostic learning that align semantic\npriors with generative capabilities. RLDF, or Reinforcement Learning from\nDiffusion Feedback, is a singular approach for visual imitation through\nprior-preserving reward function guidance. This employs Q-learning (with\nstandard Q*) for generation and follows a semantic-rewarded trajectory for\nimage search through finite encoding-tailored actions. The second proposed\nmethod, noisy diffusion gradient, is optimization driven. At the root of both\nmethods is a special CFG encoding that we propose for continual semantic\nguidance. Using only a single input image and no text input, RLDF generates\nhigh-quality images over varied domains including retail, sports and\nagriculture showcasing class-consistency and strong visual diversity. 
Project\nwebsite is available at https://infernolia.github.io/RLDF.\n","authors":["Aboli Marathe"],"pdf_url":"https://arxiv.org/pdf/2311.15648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15642v1","updated":"2023-11-27T09:12:35Z","published":"2023-11-27T09:12:35Z","title":"InfoPattern: Unveiling Information Propagation Patterns in Social Media","summary":" Social media play a significant role in shaping public opinion and\ninfluencing ideological communities through information propagation. Our demo\nInfoPattern centers on the interplay between language and human ideology. The\ndemo (Code: https://github.com/blender-nlp/InfoPattern ) is capable of: (1) red\nteaming to simulate adversary responses from opposite ideology communities; (2)\nstance detection to identify the underlying political sentiments in each\nmessage; (3) information propagation graph discovery to reveal the evolution of\nclaims across various communities over time. (Live Demo:\nhttps://incas.csl.illinois.edu/blender/About )\n","authors":["Chi Han","Jialiang Xu","Manling Li","Hanning Zhang","Tarek Abdelzaher","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2311.15642v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17255v3","updated":"2023-11-27T08:57:10Z","published":"2023-09-29T14:03:34Z","title":"Knowledge Graphs for the Life Sciences: Recent Developments, Challenges\n and Opportunities","summary":" The term life sciences refers to the disciplines that study living organisms\nand life processes, and include chemistry, biology, medicine, and a range of\nother related disciplines. Research efforts in life sciences are heavily\ndata-driven, as they produce and consume vast amounts of scientific data, much\nof which is intrinsically relational and graph-structured.\n The volume of data and the complexity of scientific concepts and relations\nreferred to therein promote the application of advanced knowledge-driven\ntechnologies for managing and interpreting data, with the ultimate aim to\nadvance scientific discovery.\n In this survey and position paper, we discuss recent developments and\nadvances in the use of graph-based technologies in life sciences and set out a\nvision for how these technologies will impact these fields into the future. We\nfocus on three broad topics: the construction and management of Knowledge\nGraphs (KGs), the use of KGs and associated technologies in the discovery of\nnew knowledge, and the use of KGs in artificial intelligence applications to\nsupport explanations (explainable AI). We select a few exemplary use cases for\neach topic, discuss the challenges and open research questions within these\ntopics, and conclude with a perspective and outlook that summarizes the\noverarching challenges and their potential solutions as a guide for future\nresearch.\n","authors":["Jiaoyan Chen","Hang Dong","Janna Hastings","Ernesto Jiménez-Ruiz","Vanessa López","Pierre Monnin","Catia Pesquita","Petr Škoda","Valentina Tamma"],"pdf_url":"https://arxiv.org/pdf/2309.17255v3.pdf","comment":"33 pages, 1 figure, accepted for Transactions on Graph Data and\n Knowledge (TGDK)"},{"id":"http://arxiv.org/abs/2311.15626v1","updated":"2023-11-27T08:45:31Z","published":"2023-11-27T08:45:31Z","title":"The WebCrow French Crossword Solver","summary":" Crossword puzzles are one of the most popular word games, played in different\nlanguages all across the world, where riddle style can vary significantly from\none country to another. 
Automated crossword resolution is challenging, and\ntypical solvers rely on large databases of previously solved crosswords. In\nthis work, we extend WebCrow 2.0, an automatic crossword solver, to French,\nmaking it the first program for crossword solving in the French language. To\ncope with the lack of a large repository of clue-answer crossword data, WebCrow\n2.0 exploits multiple modules, called experts, that retrieve candidate answers\nfrom heterogeneous resources, such as the web, knowledge graphs, and linguistic\nrules. We compared WebCrow's performance against humans in two different\nchallenges. Despite the limited amount of past crosswords, French WebCrow was\ncompetitive, actually outperforming humans in terms of speed and accuracy, thus\nproving its capabilities to generalize to new languages.\n","authors":["Giovanni Angelini","Marco Ernandes","Tommaso laquinta","Caroline Stehlé","Fanny Simões","Kamyar Zeinalipour","Andrea Zugarini","Marco Gori"],"pdf_url":"https://arxiv.org/pdf/2311.15626v1.pdf","comment":"Accepted Paper for EAI Intetain 2023 - 14th EAI International\n Conference on Intelligent Technologies for Interactive Entertainment"},{"id":"http://arxiv.org/abs/2311.15623v1","updated":"2023-11-27T08:38:42Z","published":"2023-11-27T08:38:42Z","title":"Injecting linguistic knowledge into BERT for Dialogue State Tracking","summary":" Dialogue State Tracking (DST) models often employ intricate neural network\narchitectures, necessitating substantial training data, and their inference\nprocesses lack transparency. This paper proposes a method that extracts\nlinguistic knowledge via an unsupervised framework and subsequently utilizes\nthis knowledge to augment BERT's performance and interpretability in DST tasks.\nThe knowledge extraction procedure is computationally economical and does not\nnecessitate annotations or additional training data. The injection of the\nextracted knowledge necessitates the addition of only simple neural modules. We\nemploy the Convex Polytopic Model (CPM) as a feature extraction tool for DST\ntasks and illustrate that the acquired features correlate with the syntactic\nand semantic patterns in the dialogues. This correlation facilitates a\ncomprehensive understanding of the linguistic features influencing the DST\nmodel's decision-making process. We benchmark this framework on various DST\ntasks and observe a notable improvement in accuracy.\n","authors":["Xiaohan Feng","Xixin Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2311.15623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15992v2","updated":"2023-11-27T08:30:00Z","published":"2023-07-29T14:11:15Z","title":"Towards Codable Watermarking for Injecting Multi-bit Information to LLM","summary":" As large language models (LLMs) generate texts with increasing fluency and\nrealism, there is a growing need to identify the source of texts to prevent the\nabuse of LLMs. Text watermarking techniques have proven reliable in\ndistinguishing whether a text is generated by LLMs by injecting hidden patterns\ninto the generated texts. However, we argue that existing watermarking methods\nfor LLMs are encoding-inefficient (only contain one bit of information -\nwhether it is generated from an LLM or not) and cannot flexibly meet the\ndiverse information encoding needs (such as encoding model version, generation\ntime, user id, etc.) in different LLMs application scenarios. 
In this work, we\nconduct the first systematic study on the topic of Codable Text Watermarking\nfor LLMs (CTWL) that allows text watermarks to carry more customizable\ninformation. First of all, we study the taxonomy of LLM watermarking technology\nand give a mathematical formulation for CTWL. Additionally, we provide a\ncomprehensive evaluation system for CTWL: (1) watermarking success rate, (2)\nrobustness against various corruptions, (3) coding rate of payload information,\n(4) encoding and decoding efficiency, (5) impacts on the quality of the\ngenerated text. To meet the requirements of these non-Pareto-improving metrics,\nwe devise a CTWL method named Balance-Marking, based on the motivation of\nensuring that available and unavailable vocabularies for encoding information\nhave approximately equivalent probabilities. Compared to the random vocabulary\npartitioning extended from the existing work, a probability-balanced vocabulary\npartition can significantly improve the quality of the generated text.\nExtensive experimental results have shown that our method outperforms a direct\nbaseline under comprehensive evaluation.\n","authors":["Lean Wang","Wenkai Yang","Deli Chen","Hao Zhou","Yankai Lin","Fandong Meng","Jie Zhou","Xu Sun"],"pdf_url":"https://arxiv.org/pdf/2307.15992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15614v1","updated":"2023-11-27T08:23:08Z","published":"2023-11-27T08:23:08Z","title":"FreeAL: Towards Human-Free Active Learning in the Era of Large Language\n Models","summary":" Collecting high-quality labeled data for model training is notoriously\ntime-consuming and labor-intensive for various NLP tasks. While copious\nsolutions, such as active learning for small language models (SLMs) and\nprevalent in-context learning in the era of large language models (LLMs), have\nbeen proposed and alleviate the labeling burden to some extent, their\nperformances are still subject to human intervention. It is still underexplored\nhow to reduce the annotation cost in the LLMs era. To bridge this, we\nrevolutionize traditional active learning and propose an innovative\ncollaborative learning framework FreeAL to interactively distill and filter the\ntask-specific knowledge from LLMs. During collaborative training, an LLM serves\nas an active annotator inculcating its coarse-grained knowledge, while a\ndownstream SLM is incurred as a student to filter out high-quality in-context\nsamples to feedback LLM for the subsequent label refinery. Extensive\nexperiments on eight benchmark datasets demonstrate that FreeAL largely\nenhances the zero-shot performances for both SLM and LLM without any human\nsupervision. The code is available at https://github.com/Justherozen/FreeAL .\n","authors":["Ruixuan Xiao","Yiwen Dong","Junbo Zhao","Runze Wu","Minmin Lin","Gang Chen","Haobo Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15614v1.pdf","comment":"Accepted to EMNLP 2023 (Main conference)"},{"id":"http://arxiv.org/abs/2311.15596v1","updated":"2023-11-27T07:44:25Z","published":"2023-11-27T07:44:25Z","title":"Can Vision-Language Models Think from a First-Person Perspective?","summary":" Vision-language models (VLMs) have recently shown promising results in\ntraditional downstream tasks. Evaluation studies have emerged to assess their\nabilities, with the majority focusing on the third-person perspective, and only\na few addressing specific tasks from the first-person perspective. 
However, the\ncapability of VLMs to \"think\" from a first-person perspective, a crucial\nattribute for advancing autonomous agents and robotics, remains largely\nunexplored. To bridge this research gap, we introduce EgoThink, a novel visual\nquestion-answering benchmark that encompasses six core capabilities with twelve\ndetailed dimensions. The benchmark is constructed using selected clips from\negocentric videos, with manually annotated question-answer pairs containing\nfirst-person information. To comprehensively assess VLMs, we evaluate eighteen\npopular VLMs on EgoThink. Moreover, given the open-ended format of the answers,\nwe use GPT-4 as the automatic judge to compute single-answer grading.\nExperimental results indicate that although GPT-4V leads in numerous\ndimensions, all evaluated VLMs still possess considerable potential for\nimprovement in first-person perspective tasks. Meanwhile, enlarging the number\nof trainable parameters has the most significant impact on model performance on\nEgoThink. In conclusion, EgoThink serves as a valuable addition to existing\nevaluation benchmarks for VLMs, providing an indispensable resource for future\nresearch in the realm of embodied artificial intelligence and robotics.\n","authors":["Sijie Cheng","Zhicheng Guo","Jingwen Wu","Kechen Fang","Peng Li","Huaping Liu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11509v2","updated":"2023-11-27T06:53:03Z","published":"2023-11-20T03:17:21Z","title":"Token-Level Adversarial Prompt Detection Based on Perplexity Measures\n and Contextual Information","summary":" In recent years, Large Language Models (LLM) have emerged as pivotal tools in\nvarious applications. However, these models are susceptible to adversarial\nprompt attacks, where attackers can carefully curate input strings that lead to\nundesirable outputs. The inherent vulnerability of LLMs stems from their\ninput-output mechanisms, especially when presented with intensely\nout-of-distribution (OOD) inputs. This paper proposes a token-level detection\nmethod to identify adversarial prompts, leveraging the LLM's capability to\npredict the next token's probability. We measure the degree of the model's\nperplexity and incorporate neighboring token information to encourage the\ndetection of contiguous adversarial prompt sequences. As a result, we propose\ntwo methods: one that identifies each token as either being part of an\nadversarial prompt or not, and another that estimates the probability of each\ntoken being part of an adversarial prompt.\n","authors":["Zhengmian Hu","Gang Wu","Saayan Mitra","Ruiyi Zhang","Tong Sun","Heng Huang","Viswanathan Swaminathan"],"pdf_url":"https://arxiv.org/pdf/2311.11509v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01889v4","updated":"2023-11-27T06:38:47Z","published":"2023-10-03T08:44:50Z","title":"Ring Attention with Blockwise Transformers for Near-Infinite Context","summary":" Transformers have emerged as the architecture of choice for many\nstate-of-the-art AI models, showcasing exceptional performance across a wide\nrange of AI applications. However, the memory demands imposed by Transformers\nlimit their ability to handle long sequences, thereby posing challenges in\nutilizing videos, actions, and other long-form sequences and modalities in\ncomplex environments. 
We present a novel approach, Ring Attention with\nBlockwise Transformers (Ring Attention), which leverages blockwise computation\nof self-attention and feedforward to distribute long sequences across multiple\ndevices while fully overlapping the communication of key-value blocks with the\ncomputation of blockwise attention. Our approach enables training and inference\nof sequences that are up to device count times longer than those achievable by\nprior memory-efficient Transformers, without resorting to approximations or\nincurring additional communication and computation overheads. Extensive\nexperiments on language modeling and reinforcement learning tasks demonstrate\nthe effectiveness of our approach in allowing millions of tokens context size\nand improving performance.\n","authors":["Hao Liu","Matei Zaharia","Pieter Abbeel"],"pdf_url":"https://arxiv.org/pdf/2310.01889v4.pdf","comment":"Code: https://github.com/lhao499/llm_large_context"},{"id":"http://arxiv.org/abs/2311.15566v1","updated":"2023-11-27T06:31:17Z","published":"2023-11-27T06:31:17Z","title":"SpotServe: Serving Generative Large Language Models on Preemptible\n Instances","summary":" The high computational and memory requirements of generative large language\nmodels (LLMs) make it challenging to serve them cheaply. This paper aims to\nreduce the monetary cost for serving LLMs by leveraging preemptible GPU\ninstances on modern clouds, which offer accesses to spare GPUs at a much\ncheaper price than regular instances but may be preempted by the cloud at any\ntime. Serving LLMs on preemptible instances requires addressing challenges\ninduced by frequent instance preemptions and the necessity of migrating\ninstances to handle these preemptions.\n This paper presents SpotServe, the first distributed LLM serving system on\npreemptible instances. Several key techniques in SpotServe realize fast and\nreliable serving of generative LLMs on cheap preemptible instances. First,\nSpotServe dynamically adapts the LLM parallelization configuration for dynamic\ninstance availability and fluctuating workload, while balancing the trade-off\namong the overall throughput, inference latency and monetary costs. Second, to\nminimize the cost of migrating instances for dynamic reparallelization, the\ntask of migrating instances is formulated as a bipartite graph matching\nproblem, which uses the Kuhn-Munkres algorithm to identify an optimal migration\nplan that minimizes communications. Finally, to take advantage of the grace\nperiod offered by modern clouds, we introduce stateful inference recovery, a\nnew inference mechanism that commits inference progress at a much finer\ngranularity and allows SpotServe to cheaply resume inference upon preemption.\nWe evaluate on real spot instance preemption traces and various popular LLMs\nand show that SpotServe can reduce the P99 tail latency by 2.4 - 9.1x compared\nwith the best existing LLM serving systems. 
We also show that SpotServe can\nleverage the price advantage of preemptive instances, saving 54% monetary cost\ncompared with only using on-demand instances.\n","authors":["Xupeng Miao","Chunan Shi","Jiangfei Duan","Xiaoli Xi","Dahua Lin","Bin Cui","Zhihao Jia"],"pdf_url":"https://arxiv.org/pdf/2311.15566v1.pdf","comment":"ASPLOS 2024"},{"id":"http://arxiv.org/abs/2311.15565v1","updated":"2023-11-27T06:26:53Z","published":"2023-11-27T06:26:53Z","title":"Evaluating the Efficacy of Hybrid Deep Learning Models in Distinguishing\n AI-Generated Text","summary":" My research investigates the use of cutting-edge hybrid deep learning models\nto accurately differentiate between AI-generated text and human writing. I\napplied a robust methodology, utilising a carefully selected dataset comprising\nAI and human texts from various sources, each tagged with instructions.\nAdvanced natural language processing techniques facilitated the analysis of\ntextual features. Combining sophisticated neural networks, the custom model\nenabled it to detect nuanced differences between AI and human content.\n","authors":["Finbarrs Oketunji"],"pdf_url":"https://arxiv.org/pdf/2311.15565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15564v1","updated":"2023-11-27T06:22:57Z","published":"2023-11-27T06:22:57Z","title":"Boot and Switch: Alternating Distillation for Zero-Shot Dense Retrieval","summary":" Neural 'dense' retrieval models are state of the art for many datasets,\nhowever these models often exhibit limited domain transfer ability. Existing\napproaches to adaptation are unwieldy, such as requiring explicit supervision,\ncomplex model architectures, or massive external models. We present\n$\\texttt{ABEL}$, a simple but effective unsupervised method to enhance passage\nretrieval in zero-shot settings. Our technique follows a straightforward loop:\na dense retriever learns from supervision signals provided by a reranker, and\nsubsequently, the reranker is updated based on feedback from the improved\nretriever. By iterating this loop, the two components mutually enhance one\nanother's performance. Experimental results demonstrate that our unsupervised\n$\\texttt{ABEL}$ model outperforms both leading supervised and unsupervised\nretrievers on the BEIR benchmark. Meanwhile, it exhibits strong adaptation\nabilities to tasks and domains that were unseen during training. By either\nfine-tuning $\\texttt{ABEL}$ on labelled data or integrating it with existing\nsupervised dense retrievers, we achieve state-of-the-art\nresults.\\footnote{Source code is available at\n\\url{https://github.com/Fantabulous-J/BootSwitch}.}\n","authors":["Fan Jiang","Qiongkai Xu","Tom Drummond","Trevor Cohn"],"pdf_url":"https://arxiv.org/pdf/2311.15564v1.pdf","comment":"Accepted by EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.15563v1","updated":"2023-11-27T06:19:50Z","published":"2023-11-27T06:19:50Z","title":"Noisy Self-Training with Synthetic Queries for Dense Retrieval","summary":" Although existing neural retrieval models reveal promising results when\ntraining data is abundant and the performance keeps improving as training data\nincreases, collecting high-quality annotated data is prohibitively costly. To\nthis end, we introduce a novel noisy self-training framework combined with\nsynthetic queries, showing that neural retrievers can be improved in a\nself-evolution manner with no reliance on any external models. 
Experimental\nresults show that our method improves consistently over existing methods on\nboth general-domain (e.g., MS-MARCO) and out-of-domain (i.e., BEIR) retrieval\nbenchmarks. Extra analysis on low-resource settings reveals that our method is\ndata efficient and outperforms competitive baselines, with as little as 30% of\nlabelled training data. Further extending the framework for reranker training\ndemonstrates that the proposed method is general and yields additional gains on\ntasks of diverse domains.\\footnote{Source code is available at\n\\url{https://github.com/Fantabulous-J/Self-Training-DPR}}\n","authors":["Fan Jiang","Tom Drummond","Trevor Cohn"],"pdf_url":"https://arxiv.org/pdf/2311.15563v1.pdf","comment":"Accepted by EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.15548v1","updated":"2023-11-27T05:27:13Z","published":"2023-11-27T05:27:13Z","title":"Deficiency of Large Language Models in Finance: An Empirical Examination\n of Hallucination","summary":" The hallucination issue is recognized as a fundamental deficiency of large\nlanguage models (LLMs), especially when applied to fields such as finance,\neducation, and law. Despite the growing concerns, there has been a lack of\nempirical investigation. In this paper, we provide an empirical examination of\nLLMs' hallucination behaviors in financial tasks. First, we empirically\ninvestigate LLM model's ability of explaining financial concepts and\nterminologies. Second, we assess LLM models' capacity of querying historical\nstock prices. Third, to alleviate the hallucination issue, we evaluate the\nefficacy of four practical methods, including few-shot learning, Decoding by\nContrasting Layers (DoLa), the Retrieval Augmentation Generation (RAG) method\nand the prompt-based tool learning method for a function to generate a query\ncommand. Finally, our major finding is that off-the-shelf LLMs experience\nserious hallucination behaviors in financial tasks. Therefore, there is an\nurgent need to call for research efforts in mitigating LLMs' hallucination.\n","authors":["Haoqiang Kang","Xiao-Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15544v1","updated":"2023-11-27T05:20:47Z","published":"2023-11-27T05:20:47Z","title":"The effect of source disclosure on evaluation of AI-generated messages:\n A two-part study","summary":" Advancements in artificial intelligence (AI) over the last decade demonstrate\nthat machines can exhibit communicative behavior and influence how humans\nthink, feel, and behave. In fact, the recent development of ChatGPT has shown\nthat large language models (LLMs) can be leveraged to generate high-quality\ncommunication content at scale and across domains, suggesting that they will be\nincreasingly used in practice. However, many questions remain about how knowing\nthe source of the messages influences recipients' evaluation of and preference\nfor AI-generated messages compared to human-generated messages. This paper\ninvestigated this topic in the context of vaping prevention messaging. In Study\n1, which was pre-registered, we examined the influence of source disclosure on\npeople's evaluation of AI-generated health prevention messages compared to\nhuman-generated messages. We found that source disclosure (i.e., labeling the\nsource of a message as AI vs. human) significantly impacted the evaluation of\nthe messages but did not significantly alter message rankings. 
In a follow-up\nstudy (Study 2), we examined how the influence of source disclosure may vary by\nthe participants' negative attitudes towards AI. We found a significant\nmoderating effect of negative attitudes towards AI on message evaluation, but\nnot for message selection. However, for those with moderate levels of negative\nattitudes towards AI, source disclosure decreased the preference for\nAI-generated messages. Overall, the results of this series of studies showed a\nslight bias against AI-generated messages once the source was disclosed, adding\nto the emerging area of study that lies at the intersection of AI and\ncommunication.\n","authors":["Sue Lim","Ralf Schmälzle"],"pdf_url":"https://arxiv.org/pdf/2311.15544v1.pdf","comment":"Manuscript currently under review. Paper presented at 109th Annual\n National Communication Association (NCA) Conference, November 16-19, 2023. 10\n pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.01947v2","updated":"2023-11-27T05:03:31Z","published":"2023-09-05T04:47:55Z","title":"TODM: Train Once Deploy Many Efficient Supernet-Based RNN-T Compression\n For On-device ASR Models","summary":" Automatic Speech Recognition (ASR) models need to be optimized for specific\nhardware before they can be deployed on devices. This can be done by tuning the\nmodel's hyperparameters or exploring variations in its architecture.\nRe-training and re-validating models after making these changes can be a\nresource-intensive task. This paper presents TODM (Train Once Deploy Many), a\nnew approach to efficiently train many sizes of hardware-friendly on-device ASR\nmodels with comparable GPU-hours to that of a single training job. TODM\nleverages insights from prior work on Supernet, where Recurrent Neural Network\nTransducer (RNN-T) models share weights within a Supernet. It reduces layer\nsizes and widths of the Supernet to obtain subnetworks, making them smaller\nmodels suitable for all hardware types. We introduce a novel combination of\nthree techniques to improve the outcomes of the TODM Supernet: adaptive\ndropouts, an in-place Alpha-divergence knowledge distillation, and the use of\nScaledAdam optimizer. We validate our approach by comparing Supernet-trained\nversus individually tuned Multi-Head State Space Model (MH-SSM) RNN-T using\nLibriSpeech. Results demonstrate that our TODM Supernet either matches or\nsurpasses the performance of manually tuned models by up to a relative of 3%\nbetter in word error rate (WER), while efficiently keeping the cost of training\nmany models at a small constant.\n","authors":["Yuan Shangguan","Haichuan Yang","Danni Li","Chunyang Wu","Yassir Fathullah","Dilin Wang","Ayushi Dalmia","Raghuraman Krishnamoorthi","Ozlem Kalinli","Junteng Jia","Jay Mahadeokar","Xin Lei","Mike Seltzer","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2309.01947v2.pdf","comment":"Meta AI; Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2310.18332v2","updated":"2023-11-27T04:22:54Z","published":"2023-10-20T12:44:44Z","title":"WordArt Designer: User-Driven Artistic Typography Synthesis using Large\n Language Models","summary":" This paper introduces WordArt Designer, a user-driven framework for artistic\ntypography synthesis, relying on the Large Language Model (LLM). The system\nincorporates four key modules: the LLM Engine, SemTypo, StyTypo, and TexTypo\nmodules. 
1) The LLM Engine, empowered by the LLM (e.g., GPT-3.5), interprets\nuser inputs and generates actionable prompts for the other modules, thereby\ntransforming abstract concepts into tangible designs. 2) The SemTypo module\noptimizes font designs using semantic concepts, striking a balance between\nartistic transformation and readability. 3) Building on the semantic layout\nprovided by the SemTypo module, the StyTypo module creates smooth, refined\nimages. 4) The TexTypo module further enhances the design's aesthetics through\ntexture rendering, enabling the generation of inventive textured fonts.\nNotably, WordArt Designer highlights the fusion of generative AI with artistic\ntypography. Experience its capabilities on ModelScope:\nhttps://www.modelscope.cn/studios/WordArt/WordArt.\n","authors":["Jun-Yan He","Zhi-Qi Cheng","Chenyang Li","Jingdong Sun","Wangmeng Xiang","Xianhui Lin","Xiaoyang Kang","Zengke Jin","Yusen Hu","Bin Luo","Yifeng Geng","Xuansong Xie","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.18332v2.pdf","comment":"Accepted by EMNLP 2023, 10 pages, 11 figures, 1 table, the system is\n at https://www.modelscope.cn/studios/WordArt/WordArt"},{"id":"http://arxiv.org/abs/2311.15525v1","updated":"2023-11-27T04:01:13Z","published":"2023-11-27T04:01:13Z","title":"Overview of the VLSP 2022 -- Abmusu Shared Task: A Data Challenge for\n Vietnamese Abstractive Multi-document Summarization","summary":" This paper reports the overview of the VLSP 2022 - Vietnamese abstractive\nmulti-document summarization (Abmusu) shared task for Vietnamese News. This\ntask is hosted at the 9$^{th}$ annual workshop on Vietnamese Language and\nSpeech Processing (VLSP 2022). The goal of Abmusu shared task is to develop\nsummarization systems that could create abstractive summaries automatically for\na set of documents on a topic. The model input is multiple news documents on\nthe same topic, and the corresponding output is a related abstractive summary.\nIn the scope of Abmusu shared task, we only focus on Vietnamese news\nsummarization and build a human-annotated dataset of 1,839 documents in 600\nclusters, collected from Vietnamese news in 8 categories. Participated models\nare evaluated and ranked in terms of \\texttt{ROUGE2-F1} score, the most typical\nevaluation metric for document summarization problem.\n","authors":["Mai-Vu Tran","Hoang-Quynh Le","Duy-Cat Can","Quoc-An Nguyen"],"pdf_url":"https://arxiv.org/pdf/2311.15525v1.pdf","comment":"VLSP 2022"},{"id":"http://arxiv.org/abs/2311.15513v1","updated":"2023-11-27T03:17:09Z","published":"2023-11-27T03:17:09Z","title":"A Comparative and Experimental Study on Automatic Question Answering\n Systems and its Robustness against Word Jumbling","summary":" Question answer generation using Natural Language Processing models is\nubiquitous in the world around us. It is used in many use cases such as the\nbuilding of chat bots, suggestive prompts in google search and also as a way of\nnavigating information in banking mobile applications etc. It is highly\nrelevant because a frequently asked questions (FAQ) list can only have a finite\namount of questions but a model which can perform question answer generation\ncould be able to answer completely new questions that are within the scope of\nthe data. This helps us to be able to answer new questions accurately as long\nas it is a relevant question. In commercial applications, it can be used to\nincrease customer satisfaction and ease of usage. 
However a lot of data is\ngenerated by humans so it is susceptible to human error and this can adversely\naffect the model's performance and we are investigating this through our work\n","authors":["Shashidhar Reddy Javaji","Haoran Hu","Sai Sameer Vennam","Vijaya Gajanan Buddhavarapu"],"pdf_url":"https://arxiv.org/pdf/2311.15513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15509v1","updated":"2023-11-27T03:08:41Z","published":"2023-11-27T03:08:41Z","title":"A Corpus for Named Entity Recognition in Chinese Novels with\n Multi-genres","summary":" Entities like person, location, organization are important for literary text\nanalysis. The lack of annotated data hinders the progress of named entity\nrecognition (NER) in literary domain. To promote the research of literary NER,\nwe build the largest multi-genre literary NER corpus containing 263,135\nentities in 105,851 sentences from 260 online Chinese novels spanning 13\ndifferent genres. Based on the corpus, we investigate characteristics of\nentities from different genres. We propose several baseline NER models and\nconduct cross-genre and cross-domain experiments. Experimental results show\nthat genre difference significantly impact NER performance though not as much\nas domain difference like literary domain and news domain. Compared with NER in\nnews domain, literary NER still needs much improvement and the\nOut-of-Vocabulary (OOV) problem is more challenging due to the high variety of\nentities in literary works.\n","authors":["Hanjie Zhao","Jinge Xie","Yuchen Yan","Yuxiang Jia","Yawen Ye","Hongying Zan"],"pdf_url":"https://arxiv.org/pdf/2311.15509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15507v1","updated":"2023-11-27T03:05:48Z","published":"2023-11-27T03:05:48Z","title":"Improving Word Sense Disambiguation in Neural Machine Translation with\n Salient Document Context","summary":" Lexical ambiguity is a challenging and pervasive problem in machine\ntranslation (\\mt). We introduce a simple and scalable approach to resolve\ntranslation ambiguity by incorporating a small amount of extra-sentential\ncontext in neural \\mt. Our approach requires no sense annotation and no change\nto standard model architectures. Since actual document context is not available\nfor the vast majority of \\mt training data, we collect related sentences for\neach input to construct pseudo-documents. Salient words from pseudo-documents\nare then encoded as a prefix to each source sentence to condition the\ngeneration of the translation. To evaluate, we release \\docmucow, a challenge\nset for translation disambiguation based on the English-German \\mucow\n\\cite{raganato-etal-2020-evaluation} augmented with document IDs. 
Extensive\nexperiments show that our method translates ambiguous source words better than\nstrong sentence-level baselines and comparable document-level baselines while\nreducing training costs.\n","authors":["Elijah Rippeth","Marine Carpuat","Kevin Duh","Matt Post"],"pdf_url":"https://arxiv.org/pdf/2311.15507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15500v1","updated":"2023-11-27T02:55:34Z","published":"2023-11-27T02:55:34Z","title":"Function-constrained Program Synthesis","summary":" This work introduces (1) a technique that allows large language models (LLMs)\nto leverage user-provided code when solving programming tasks and (2) a method\nto iteratively generate modular sub-functions that can aid future code\ngeneration attempts when the initial code generated by the LLM is inadequate.\nGenerating computer programs in general-purpose programming languages like\nPython poses a challenge for LLMs when instructed to use code provided in the\nprompt. Code-specific LLMs (e.g., GitHub Copilot, CodeLlama2) can generate code\ncompletions in real-time by drawing on all code available in a development\nenvironment. However, restricting code-specific LLMs to use only in-context\ncode is not straightforward, as the model is not explicitly instructed to use\nthe user-provided code and users cannot highlight precisely which snippets of\ncode the model should incorporate into its context. Moreover, current systems\nlack effective recovery methods, forcing users to iteratively re-prompt the\nmodel with modified prompts until a sufficient solution is reached. Our method\ndiffers from traditional LLM-powered code-generation by constraining\ncode-generation to an explicit function set and enabling recovery from failed\nattempts through automatically generated sub-functions. When the LLM cannot\nproduce working code, we generate modular sub-functions to aid subsequent\nattempts at generating functional code. A by-product of our method is a library\nof reusable sub-functions that can solve related tasks, imitating a software\nteam where efficiency scales with experience. We also introduce a new\n\"half-shot\" evaluation paradigm that provides tighter estimates of LLMs' coding\nabilities compared to traditional zero-shot evaluation. Our proposed evaluation\nmethod encourages models to output solutions in a structured format, decreasing\nsyntax errors that can be mistaken for poor coding ability.\n","authors":["Patrick Hajali","Ignas Budvytis"],"pdf_url":"https://arxiv.org/pdf/2311.15500v1.pdf","comment":"17 pages, 6 figures, 2023 NeurIPS R0-Fomo Workshop"},{"id":"http://arxiv.org/abs/2311.13534v2","updated":"2023-11-27T02:52:46Z","published":"2023-11-22T17:14:54Z","title":"LM-Cocktail: Resilient Tuning of Language Models via Model Merging","summary":" The pre-trained language models are continually fine-tuned to better support\ndownstream applications. However, this operation may result in significant\nperformance degeneration on general tasks beyond the targeted domain. To\novercome this problem, we propose a novel method which enables the fine-tuned\nmodel to stay resilient in general perspectives. Our method is conducted in the\nform of model merging (namely LM-Cocktail), where the fine-tuned language model\nis merged with the pre-trained base model or the peer models from other domains\nthrough weighted average. 
Despite its simplicity, LM-Cocktail is surprisingly\neffective: the resulting model is able to achieve strong empirical performance\nacross the whole scope of general tasks while preserving a superior capacity in its\ntargeted domain. We conduct comprehensive experiments with the LLama and BGE models\non popular benchmarks, including FLAN, MMLU, and MTEB; the results validate the\nefficacy of our proposed method. The code and checkpoints are available at\nhttps://github.com/FlagOpen/FlagEmbedding/tree/master/LM_Cocktail.\n","authors":["Shitao Xiao","Zheng Liu","Peitian Zhang","Xingrun Xing"],"pdf_url":"https://arxiv.org/pdf/2311.13534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15490v1","updated":"2023-11-27T02:17:11Z","published":"2023-11-27T02:17:11Z","title":"Optimizing and Fine-tuning Large Language Model for Urban Renewal","summary":" This study aims to innovatively explore adaptive applications of large\nlanguage models (LLMs) in urban renewal. It also aims to improve their performance\nand text generation quality for knowledge question-answering (QA) tasks. Based\non ChatGLM, we automatically generate QA datasets using urban renewal\nscientific literature corpora in a self-instruct manner and then conduct joint\nfine-tuning training on the model using the Prefix and LoRA fine-tuning methods\nto create an LLM for urban renewal. By guiding the LLM to automatically\ngenerate QA data based on prompt words and given text, it is possible to\nquickly obtain datasets in the urban renewal field and provide data support for\nthe fine-tuning training of LLMs. The experimental results show that the joint\nfine-tuning training method proposed in this study can significantly improve\nthe performance of the LLM on QA tasks. Compared with LoRA fine-tuning, the\nmethod improves the Bleu and Rouge metrics on the test set by about 5%; compared\nwith the model before fine-tuning, the method improves the Bleu and Rouge\nmetrics by about 15%-20%. This study demonstrates the effectiveness and\nsuperiority of the joint fine-tuning method using Prefix and LoRA for ChatGLM\non urban renewal knowledge QA tasks. It provides a new approach for\nfine-tuning LLMs on urban renewal-related tasks.\n","authors":["Xi Wang","Xianyao Ling","Tom Zhang","Xuecao Li","Shaolan Wang","Zhixing Li","Liang Zhang","Peng Gong"],"pdf_url":"https://arxiv.org/pdf/2311.15490v1.pdf","comment":"11 pages, 2 figures, 2 tables, 41 references"},{"id":"http://arxiv.org/abs/2311.15480v1","updated":"2023-11-27T01:44:02Z","published":"2023-11-27T01:44:02Z","title":"Automatic Time Signature Determination for New Scores Using Lyrics for\n Latent Rhythmic Structure","summary":" There has recently been a sharp increase in interest in Artificial\nIntelligence-Generated Content (AIGC). Despite this, musical components such as\ntime signatures have not been studied sufficiently to form an algorithmic\ndetermination approach for new compositions, especially lyrical songs. This is\nlikely because of the neglect of musical details, which are critical for\nconstructing a robust framework. Specifically, time signatures establish the\nfundamental rhythmic structure for almost all aspects of a song, including the\nphrases and notes. In this paper, we propose a novel approach that only uses\nlyrics as input to automatically generate a fitting time signature for lyrical\nsongs and uncover the latent rhythmic structure utilizing explainable machine\nlearning models. 
In particular, we devise multiple methods that are associated\nwith discovering lyrical patterns and creating new features that simultaneously\ncontain lyrical, rhythmic, and statistical information. In this approach, the\nbest of our experimental results reveal a 97.6% F1 score and a 0.996 Area Under\nthe Curve (AUC) of the Receiver Operating Characteristic (ROC) score. In\nconclusion, our research directly generates time signatures from lyrics\nautomatically for new scores utilizing machine learning, which is an innovative\nidea that approaches an understudied component of musicology and therefore\ncontributes significantly to the future of Artificial Intelligence (AI) music\ngeneration.\n","authors":["Callie C. Liao","Duoduo Liao","Jesse Guessford"],"pdf_url":"https://arxiv.org/pdf/2311.15480v1.pdf","comment":"Submitted to IEEE Big Data 2023 Conference"},{"id":"http://arxiv.org/abs/2305.11853v3","updated":"2023-11-27T00:42:07Z","published":"2023-05-19T17:43:58Z","title":"How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain,\n and Cross-domain Settings","summary":" Large language models (LLMs) with in-context learning have demonstrated\nremarkable capability in the text-to-SQL task. Previous research has prompted\nLLMs with various demonstration-retrieval strategies and intermediate reasoning\nsteps to enhance the performance of LLMs. However, those works often employ\nvaried strategies when constructing the prompt text for text-to-SQL inputs,\nsuch as databases and demonstration examples. This leads to a lack of\ncomparability in both the prompt constructions and their primary contributions.\nFurthermore, selecting an effective prompt construction has emerged as a\npersistent problem for future research. To address this limitation, we\ncomprehensively investigate the impact of prompt constructions across various\nsettings and provide insights into prompt constructions for future text-to-SQL\nstudies.\n","authors":["Shuaichen Chang","Eric Fosler-Lussier"],"pdf_url":"https://arxiv.org/pdf/2305.11853v3.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2311.16103v1","updated":"2023-11-27T18:59:58Z","published":"2023-11-27T18:59:58Z","title":"Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating\n Video-based Large Language Models","summary":" Video-based large language models (Video-LLMs) have been recently introduced,\ntargeting both fundamental improvements in perception and comprehension, and a\ndiverse range of user inquiries. In pursuit of the ultimate goal of achieving\nartificial general intelligence, a truly intelligent Video-LLM model should not\nonly see and understand the surroundings, but also possess human-level\ncommonsense, and make well-informed decisions for the users. To guide the\ndevelopment of such a model, the establishment of a robust and comprehensive\nevaluation system becomes crucial. To this end, this paper proposes\n\\textit{Video-Bench}, a new comprehensive benchmark along with a toolkit\nspecifically designed for evaluating Video-LLMs. The benchmark comprises 10\nmeticulously crafted tasks, evaluating the capabilities of Video-LLMs across\nthree distinct levels: Video-exclusive Understanding, Prior Knowledge-based\nQuestion-Answering, and Comprehension and Decision-making. In addition, we\nintroduce an automatic toolkit tailored to process model outputs for various\ntasks, facilitating the calculation of metrics and generating convenient final\nscores. 
We evaluate 8 representative Video-LLMs using \\textit{Video-Bench}. The\nfindings reveal that current Video-LLMs still fall considerably short of\nachieving human-like comprehension and analysis of real-world videos, offering\nvaluable insights for future research directions. The benchmark and toolkit are\navailable at: \\url{https://github.com/PKU-YuanGroup/Video-Bench}.\n","authors":["Munan Ning","Bin Zhu","Yujia Xie","Bin Lin","Jiaxi Cui","Lu Yuan","Dongdong Chen","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2311.16103v1.pdf","comment":"Benchmark is available at\n https://github.com/PKU-YuanGroup/Video-Bench"},{"id":"http://arxiv.org/abs/2311.16102v1","updated":"2023-11-27T18:59:53Z","published":"2023-11-27T18:59:53Z","title":"Test-time Adaptation of Discriminative Models via Diffusion Generative\n Feedback","summary":" The advancements in generative modeling, particularly the advent of diffusion\nmodels, have sparked a fundamental question: how can these models be\neffectively used for discriminative tasks? In this work, we find that\ngenerative models can be great test-time adapters for discriminative models.\nOur method, Diffusion-TTA, adapts pre-trained discriminative models such as\nimage classifiers, segmenters and depth predictors, to each unlabelled example\nin the test set using generative feedback from a diffusion model. We achieve\nthis by modulating the conditioning of the diffusion model using the output of\nthe discriminative model. We then maximize the image likelihood objective by\nbackpropagating the gradients to discriminative model's parameters. We show\nDiffusion-TTA significantly enhances the accuracy of various large-scale\npre-trained discriminative models, such as, ImageNet classifiers, CLIP models,\nimage pixel labellers and image depth predictors. Diffusion-TTA outperforms\nexisting test-time adaptation methods, including TTT-MAE and TENT, and\nparticularly shines in online adaptation setups, where the discriminative model\nis continually adapted to each example in the test set. We provide access to\ncode, results, and visualizations on our website:\nhttps://diffusion-tta.github.io/.\n","authors":["Mihir Prabhudesai","Tsung-Wei Ke","Alexander C. Li","Deepak Pathak","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2311.16102v1.pdf","comment":"Accepted at NeurIPS 2023 Webpage with Code:\n https://diffusion-tta.github.io/"},{"id":"http://arxiv.org/abs/2311.16101v1","updated":"2023-11-27T18:59:42Z","published":"2023-11-27T18:59:42Z","title":"How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for\n Vision LLMs","summary":" This work focuses on the potential of Vision LLMs (VLLMs) in visual\nreasoning. Different from prior studies, we shift our focus from evaluating\nstandard performance to introducing a comprehensive safety evaluation suite,\ncovering both out-of-distribution (OOD) generalization and adversarial\nrobustness. For the OOD evaluation, we present two novel VQA datasets, each\nwith one variant, designed to test model performance under challenging\nconditions. In exploring adversarial robustness, we propose a straightforward\nattack strategy for misleading VLLMs to produce visual-unrelated responses.\nMoreover, we assess the efficacy of two jailbreaking strategies, targeting\neither the vision or language component of VLLMs. 
Our evaluation of 21 diverse\nmodels, ranging from open-source VLLMs to GPT-4V, yields interesting\nobservations: 1) Current VLLMs struggle with OOD texts but not images, unless\nthe visual information is limited; and 2) These VLLMs can be easily misled by\ndeceiving vision encoders only, and their vision-language training often\ncompromises safety protocols. We release this safety evaluation suite at\nhttps://github.com/UCSC-VLAA/vllm-safety-benchmark.\n","authors":["Haoqin Tu","Chenhang Cui","Zijun Wang","Yiyang Zhou","Bingchen Zhao","Junlin Han","Wangchunshu Zhou","Huaxiu Yao","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2311.16101v1.pdf","comment":"H.T., C.C., and Z.W. contribute equally. Work done during H.T. and\n Z.W.'s internship at UCSC, and C.C. and Y.Z.'s internship at UNC"},{"id":"http://arxiv.org/abs/2311.16099v1","updated":"2023-11-27T18:59:30Z","published":"2023-11-27T18:59:30Z","title":"GART: Gaussian Articulated Template Models","summary":" We introduce the Gaussian Articulated Template Model (GART), an explicit,\nefficient, and expressive representation for non-rigid articulated subject\ncapturing and rendering from monocular videos. GART utilizes a mixture of\nmoving 3D Gaussians to explicitly approximate a deformable subject's geometry\nand appearance. It takes advantage of a categorical template model prior (SMPL,\nSMAL, etc.) with learnable forward skinning while further generalizing to more\ncomplex non-rigid deformations with novel latent bones. GART can be\nreconstructed via differentiable rendering from monocular videos in seconds or\nminutes and rendered in novel poses at more than 150 fps.\n","authors":["Jiahui Lei","Yufu Wang","Georgios Pavlakos","Lingjie Liu","Kostas Daniilidis"],"pdf_url":"https://arxiv.org/pdf/2311.16099v1.pdf","comment":"13 pages, code available at\n https://www.cis.upenn.edu/~leijh/projects/gart/"},{"id":"http://arxiv.org/abs/2311.16098v1","updated":"2023-11-27T18:59:25Z","published":"2023-11-27T18:59:25Z","title":"On Bringing Robots Home","summary":" Throughout history, we have successfully integrated various machines into our\nhomes. Dishwashers, laundry machines, stand mixers, and robot vacuums are a few\nrecent examples. However, these machines excel at performing only a single task\neffectively. The concept of a \"generalist machine\" in homes - a domestic\nassistant that can adapt and learn from our needs, all while remaining\ncost-effective - has long been a goal in robotics that has been steadily\npursued for decades. In this work, we initiate a large-scale effort towards\nthis goal by introducing Dobb-E, an affordable yet versatile general-purpose\nsystem for learning robotic manipulation within household settings. Dobb-E can\nlearn a new task with only five minutes of a user showing it how to do it,\nthanks to a demonstration collection tool (\"The Stick\") we built out of cheap\nparts and iPhones. We use the Stick to collect 13 hours of data in 22 homes in\nNew York City and train Home Pretrained Representations (HPR). Then, in a\nnovel home environment, with five minutes of demonstrations and fifteen minutes\nof adapting the HPR model, we show that Dobb-E can reliably solve the task on\nthe Stretch, a mobile robot readily available on the market. Across roughly 30\ndays of experimentation in homes of New York City and surrounding areas, we\ntest our system in 10 homes, with a total of 109 tasks in different\nenvironments, and finally achieve a success rate of 81%. 
Beyond success\npercentages, our experiments reveal a plethora of unique challenges absent or\nignored in lab robotics. These range from effects of strong shadows, to\nvariable demonstration quality by non-expert users. With the hope of\naccelerating research on home robots, and eventually seeing robot butlers in\nevery home, we open-source Dobb-E software stack and models, our data, and our\nhardware designs at https://dobb-e.com\n","authors":["Nur Muhammad Mahi Shafiullah","Anant Rai","Haritheja Etukuru","Yiqian Liu","Ishan Misra","Soumith Chintala","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2311.16098v1.pdf","comment":"Project website and videos are available at https://dobb-e.com,\n technical documentation for getting started is available at\n https://docs.dobb-e.com, and code is released at\n https://github.com/notmahi/dobb-e"},{"id":"http://arxiv.org/abs/2311.16097v1","updated":"2023-11-27T18:59:10Z","published":"2023-11-27T18:59:10Z","title":"CG-HOI: Contact-Guided 3D Human-Object Interaction Generation","summary":" We propose CG-HOI, the first method to address the task of generating dynamic\n3D human-object interactions (HOIs) from text. We model the motion of both\nhuman and object in an interdependent fashion, as semantically rich human\nmotion rarely happens in isolation without any interactions. Our key insight is\nthat explicitly modeling contact between the human body surface and object\ngeometry can be used as strong proxy guidance, both during training and\ninference. Using this guidance to bridge human and object motion enables\ngenerating more realistic and physically plausible interaction sequences, where\nthe human body and corresponding object move in a coherent manner. Our method\nfirst learns to model human motion, object motion, and contact in a joint\ndiffusion process, inter-correlated through cross-attention. We then leverage\nthis learned contact for guidance during inference synthesis of realistic,\ncoherent HOIs. Extensive evaluation shows that our joint contact-based\nhuman-object interaction approach generates realistic and physically plausible\nsequences, and we show two applications highlighting the capabilities of our\nmethod. Conditioned on a given object trajectory, we can generate the\ncorresponding human motion without re-training, demonstrating strong\nhuman-object interdependency learning. Our approach is also flexible, and can\nbe applied to static real-world 3D scene scans.\n","authors":["Christian Diller","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2311.16097v1.pdf","comment":"Project page: https://cg-hoi.christian-diller.de Video:\n https://www.youtube.com/watch?v=GNyQwTwZ15s"},{"id":"http://arxiv.org/abs/2311.16096v1","updated":"2023-11-27T18:59:04Z","published":"2023-11-27T18:59:04Z","title":"Animatable Gaussians: Learning Pose-dependent Gaussian Maps for\n High-fidelity Human Avatar Modeling","summary":" Modeling animatable human avatars from RGB videos is a long-standing and\nchallenging problem. Recent works usually adopt MLP-based neural radiance\nfields (NeRF) to represent 3D humans, but it remains difficult for pure MLPs to\nregress pose-dependent garment details. To this end, we introduce Animatable\nGaussians, a new avatar representation that leverages powerful 2D CNNs and 3D\nGaussian splatting to create high-fidelity avatars. 
To associate 3D Gaussians\nwith the animatable avatar, we learn a parametric template from the input\nvideos, and then parameterize the template on two front \\& back canonical\nGaussian maps where each pixel represents a 3D Gaussian. The learned template\nis adaptive to the wearing garments for modeling looser clothes like dresses.\nSuch template-guided 2D parameterization enables us to employ a powerful\nStyleGAN-based CNN to learn the pose-dependent Gaussian maps for modeling\ndetailed dynamic appearances. Furthermore, we introduce a pose projection\nstrategy for better generalization given novel poses. Overall, our method can\ncreate lifelike avatars with dynamic, realistic and generalized appearances.\nExperiments show that our method outperforms other state-of-the-art approaches.\nCode: https://github.com/lizhe00/AnimatableGaussians\n","authors":["Zhe Li","Zerong Zheng","Lizhen Wang","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16096v1.pdf","comment":"Projectpage: https://animatable-gaussians.github.io/, Code:\n https://github.com/lizhe00/AnimatableGaussians"},{"id":"http://arxiv.org/abs/2311.16094v1","updated":"2023-11-27T18:59:02Z","published":"2023-11-27T18:59:02Z","title":"Street TryOn: Learning In-the-Wild Virtual Try-On from Unpaired Person\n Images","summary":" Virtual try-on has become a popular research topic, but most existing methods\nfocus on studio images with a clean background. They can achieve plausible\nresults for this studio try-on setting by learning to warp a garment image to\nfit a person's body from paired training data, i.e., garment images paired with\nimages of people wearing the same garment. Such data is often collected from\ncommercial websites, where each garment is demonstrated both by itself and on\nseveral models. By contrast, it is hard to collect paired data for in-the-wild\nscenes, and therefore, virtual try-on for casual images of people against\ncluttered backgrounds is rarely studied.\n In this work, we fill the gap in the current virtual try-on research by (1)\nintroducing a Street TryOn benchmark to evaluate performance on street scenes\nand (2) proposing a novel method that can learn without paired data, from a set\nof in-the-wild person images directly. Our method can achieve robust\nperformance across shop and street domains using a novel DensePose warping\ncorrection method combined with diffusion-based inpainting controlled by pose\nand semantic segmentation. Our experiments demonstrate competitive performance\nfor standard studio try-on tasks and SOTA performance for street try-on and\ncross-domain try-on tasks.\n","authors":["Aiyu Cui","Jay Mahajan","Viraj Shah","Preeti Gomathinayagam","Svetlana Lazebnik"],"pdf_url":"https://arxiv.org/pdf/2311.16094v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16091v1","updated":"2023-11-27T18:57:42Z","published":"2023-11-27T18:57:42Z","title":"Interactive Autonomous Navigation with Internal State Inference and\n Interactivity Estimation","summary":" Deep reinforcement learning (DRL) provides a promising way for intelligent\nagents (e.g., autonomous vehicles) to learn to navigate complex scenarios.\nHowever, DRL with neural networks as function approximators is typically\nconsidered a black box with little explainability and often suffers from\nsuboptimal performance, especially for autonomous navigation in highly\ninteractive multi-agent environments. 
To address these issues, we propose three\nauxiliary tasks with spatio-temporal relational reasoning and integrate them\ninto the standard DRL framework, which improves the decision making performance\nand provides explainable intermediate indicators. We propose to explicitly\ninfer the internal states (i.e., traits and intentions) of surrounding agents\n(e.g., human drivers) as well as to predict their future trajectories in the\nsituations with and without the ego agent through counterfactual reasoning.\nThese auxiliary tasks provide additional supervision signals to infer the\nbehavior patterns of other interactive agents. Multiple variants of framework\nintegration strategies are compared. We also employ a spatio-temporal graph\nneural network to encode relations between dynamic entities, which enhances\nboth internal state inference and decision making of the ego agent. Moreover,\nwe propose an interactivity estimation mechanism based on the difference\nbetween predicted trajectories in these two situations, which indicates the\ndegree of influence of the ego agent on other agents. To validate the proposed\nmethod, we design an intersection driving simulator based on the Intelligent\nIntersection Driver Model (IIDM) that simulates vehicles and pedestrians. Our\napproach achieves robust and state-of-the-art performance in terms of standard\nevaluation metrics and provides explainable intermediate indicators (i.e.,\ninternal states, and interactivity scores) for decision making.\n","authors":["Jiachen Li","David Isele","Kanghoon Lee","Jinkyoo Park","Kikuo Fujimura","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2311.16091v1.pdf","comment":"18 pages, 14 figures"},{"id":"http://arxiv.org/abs/2311.16090v1","updated":"2023-11-27T18:56:37Z","published":"2023-11-27T18:56:37Z","title":"Self-correcting LLM-controlled Diffusion Models","summary":" Text-to-image generation has witnessed significant progress with the advent\nof diffusion models. Despite the ability to generate photorealistic images,\ncurrent text-to-image diffusion models still often struggle to accurately\ninterpret and follow complex input text prompts. In contrast to existing models\nthat aim to generate images only with their best effort, we introduce\nSelf-correcting LLM-controlled Diffusion (SLD). SLD is a framework that\ngenerates an image from the input prompt, assesses its alignment with the\nprompt, and performs self-corrections on the inaccuracies in the generated\nimage. Steered by an LLM controller, SLD turns text-to-image generation into an\niterative closed-loop process, ensuring correctness in the resulting image. SLD\nis not only training-free but can also be seamlessly integrated with diffusion\nmodels behind API access, such as DALL-E 3, to further boost the performance of\nstate-of-the-art diffusion models. Experimental results show that our approach\ncan rectify a majority of incorrect generations, particularly in generative\nnumeracy, attribute binding, and spatial relationships. Furthermore, by simply\nadjusting the instructions to the LLM, SLD can perform image editing tasks,\nbridging the gap between text-to-image generation and image editing pipelines.\nWe will make our code available for future research and applications.\n","authors":["Tsung-Han Wu","Long Lian","Joseph E. 
Gonzalez","Boyi Li","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2311.16090v1.pdf","comment":"16 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.16081v1","updated":"2023-11-27T18:52:09Z","published":"2023-11-27T18:52:09Z","title":"ViT-Lens-2: Gateway to Omni-modal Intelligence","summary":" Aiming to advance AI agents, large foundation models significantly improve\nreasoning and instruction execution, yet the current focus on vision and\nlanguage neglects the potential of perceiving diverse modalities in open-world\nenvironments. However, the success of data-driven vision and language models is\ncostly or even infeasible to be reproduced for rare modalities. In this paper,\nwe present ViT-Lens-2 that facilitates efficient omni-modal representation\nlearning by perceiving novel modalities with a pretrained ViT and aligning them\nto a pre-defined space. Specifically, the modality-specific lens is tuned to\nproject any-modal signals to an intermediate embedding space, which are then\nprocessed by a strong ViT with pre-trained visual knowledge. The encoded\nrepresentations are optimized toward aligning with the modal-independent space,\npre-defined by off-the-shelf foundation models. ViT-Lens-2 provides a unified\nsolution for representation learning of increasing modalities with two\nappealing advantages: (i) Unlocking the great potential of pretrained ViTs to\nnovel modalities effectively with efficient data regime; (ii) Enabling emergent\ndownstream capabilities through modality alignment and shared ViT parameters.\nWe tailor ViT-Lens-2 to learn representations for 3D point cloud, depth, audio,\ntactile and EEG, and set new state-of-the-art results across various\nunderstanding tasks, such as zero-shot classification. By seamlessly\nintegrating ViT-Lens-2 into Multimodal Foundation Models, we enable\nAny-modality to Text and Image Generation in a zero-shot manner. Code and\nmodels are available at https://github.com/TencentARC/ViT-Lens.\n","authors":["Weixian Lei","Yixiao Ge","Kun Yi","Jianfeng Zhang","Difei Gao","Dylan Sun","Yuying Ge","Ying Shan","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2311.16081v1.pdf","comment":"This work is a follow-up of \"ViT-Lens: Towards Omni-modal\n Representations\". arXiv admin note: text overlap with arXiv:2308.10185"},{"id":"http://arxiv.org/abs/2211.14309v2","updated":"2023-11-27T18:48:33Z","published":"2022-11-25T18:59:53Z","title":"FutureHuman3D: Forecasting Complex Long-Term 3D Human Behavior from\n Video Observations","summary":" We present a generative approach to forecast long-term future human behavior\nin 3D, requiring only weak supervision from readily available 2D human action\ndata. This is a fundamental task enabling many downstream applications. The\nrequired ground-truth data is hard to capture in 3D (mocap suits, expensive\nsetups) but easy to acquire in 2D (simple RGB cameras). Thus, we design our\nmethod to only require 2D RGB data while being able to generate 3D human motion\nsequences. We use a differentiable 2D projection scheme in an autoregressive\nmanner for weak supervision, and an adversarial loss for 3D regularization. Our\nmethod predicts long and complex behavior sequences (e.g. cooking, assembly)\nconsisting of multiple sub-actions. We tackle this in a semantically\nhierarchical manner, jointly predicting high-level coarse action labels\ntogether with their low-level fine-grained realizations as characteristic 3D\nhuman poses. 
We observe that these two action representations are coupled in\nnature, and joint prediction benefits both action and pose forecasting. Our\nexperiments demonstrate the complementary nature of joint action and 3D pose\nprediction: our joint approach outperforms each task treated individually,\nenables robust longer-term sequence prediction, and outperforms alternative\napproaches to forecast actions and characteristic 3D poses.\n","authors":["Christian Diller","Thomas Funkhouser","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2211.14309v2.pdf","comment":"Project Page: https://future-human-3d.christian-diller.de/ Video:\n https://www.youtube.com/watch?v=18du85YFXL0"},{"id":"http://arxiv.org/abs/2210.06462v3","updated":"2023-11-27T18:30:14Z","published":"2022-10-12T17:57:58Z","title":"Self-Guided Diffusion Models","summary":" Diffusion models have demonstrated remarkable progress in image generation\nquality, especially when guidance is used to control the generative process.\nHowever, guidance requires a large amount of image-annotation pairs for\ntraining and is thus dependent on their availability, correctness and\nunbiasedness. In this paper, we eliminate the need for such annotation by\ninstead leveraging the flexibility of self-supervision signals to design a\nframework for self-guided diffusion models. By leveraging a feature extraction\nfunction and a self-annotation function, our method provides guidance signals\nat various image granularities: from the level of holistic images to object\nboxes and even segmentation masks. Our experiments on single-label and\nmulti-label image datasets demonstrate that self-labeled guidance always\noutperforms diffusion models without guidance and may even surpass guidance\nbased on ground-truth labels, especially on unbalanced data. When equipped with\nself-supervised box or mask proposals, our method further generates visually\ndiverse yet semantically consistent images, without the need for any class,\nbox, or segment label annotation. Self-guided diffusion is simple, flexible and\nexpected to profit from deployment at scale. Source code will be at:\nhttps://taohu.me/sgdm/\n","authors":["Vincent Tao Hu","David W Zhang","Yuki M. Asano","Gertjan J. Burghouts","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2210.06462v3.pdf","comment":"CVPR 2023"},{"id":"http://arxiv.org/abs/2311.16060v1","updated":"2023-11-27T18:26:19Z","published":"2023-11-27T18:26:19Z","title":"DiffSLVA: Harnessing Diffusion Models for Sign Language Video\n Anonymization","summary":" Since American Sign Language (ASL) has no standard written form, Deaf signers\nfrequently share videos in order to communicate in their native language.\nHowever, since both hands and face convey critical linguistic information in\nsigned languages, sign language videos cannot preserve signer privacy. While\nsigners have expressed interest, for a variety of applications, in sign\nlanguage video anonymization that would effectively preserve linguistic\ncontent, attempts to develop such technology have had limited success, given\nthe complexity of hand movements and facial expressions. Existing approaches\nrely predominantly on precise pose estimations of the signer in video footage\nand often require sign language video datasets for training. These requirements\nprevent them from processing videos 'in the wild,' in part because of the\nlimited diversity present in current sign language video datasets. 
To address\nthese limitations, our research introduces DiffSLVA, a novel methodology that\nutilizes pre-trained large-scale diffusion models for zero-shot text-guided\nsign language video anonymization. We incorporate ControlNet, which leverages\nlow-level image features such as HED (Holistically-Nested Edge Detection)\nedges, to circumvent the need for pose estimation. Additionally, we develop a\nspecialized module dedicated to capturing facial expressions, which are\ncritical for conveying essential linguistic information in signed languages. We\nthen combine the above methods to achieve anonymization that better preserves\nthe essential linguistic content of the original signer. This innovative\nmethodology makes possible, for the first time, sign language video\nanonymization that could be used for real-world applications, which would offer\nsignificant benefits to the Deaf and Hard-of-Hearing communities. We\ndemonstrate the effectiveness of our approach with a series of signer\nanonymization experiments.\n","authors":["Zhaoyang Xia","Carol Neidle","Dimitris N. Metaxas"],"pdf_url":"https://arxiv.org/pdf/2311.16060v1.pdf","comment":"Project webpage: https://github.com/Jeffery9707/DiffSLVA"},{"id":"http://arxiv.org/abs/2311.16052v1","updated":"2023-11-27T18:14:03Z","published":"2023-11-27T18:14:03Z","title":"Exploring Attribute Variations in Style-based GANs using Diffusion\n Models","summary":" Existing attribute editing methods treat semantic attributes as binary,\nresulting in a single edit per attribute. However, attributes such as\neyeglasses, smiles, or hairstyles exhibit a vast range of diversity. In this\nwork, we formulate the task of \\textit{diverse attribute editing} by modeling\nthe multidimensional nature of attribute edits. This enables users to generate\nmultiple plausible edits per attribute. We capitalize on disentangled latent\nspaces of pretrained GANs and train a Denoising Diffusion Probabilistic Model\n(DDPM) to learn the latent distribution for diverse edits. Specifically, we\ntrain DDPM over a dataset of edit latent directions obtained by embedding image\npairs with a single attribute change. This leads to latent subspaces that\nenable diverse attribute editing. Applying diffusion in the highly compressed\nlatent space allows us to model rich distributions of edits within limited\ncomputational resources. Through extensive qualitative and quantitative\nexperiments conducted across a range of datasets, we demonstrate the\neffectiveness of our approach for diverse attribute editing. We also showcase\nthe results of our method applied for 3D editing of various face attributes.\n","authors":["Rishubh Parihar","Prasanna Balaji","Raghav Magazine","Sarthak Vora","Tejan Karmali","Varun Jampani","R. Venkatesh Babu"],"pdf_url":"https://arxiv.org/pdf/2311.16052v1.pdf","comment":"Neurips Workshop on Diffusion Models 2023"},{"id":"http://arxiv.org/abs/2311.16043v1","updated":"2023-11-27T18:07:58Z","published":"2023-11-27T18:07:58Z","title":"Relightable 3D Gaussian: Real-time Point Cloud Relighting with BRDF\n Decomposition and Ray Tracing","summary":" We present a novel differentiable point-based rendering framework for\nmaterial and lighting decomposition from multi-view images, enabling editing,\nray-tracing, and real-time relighting of the 3D point cloud. Specifically, a 3D\nscene is represented as a set of relightable 3D Gaussian points, where each\npoint is additionally associated with a normal direction, BRDF parameters, and\nincident lights from different directions. 
To achieve robust lighting\nestimation, we further divide the incident lights of each point into global and\nlocal components, as well as view-dependent visibilities. The 3D scene is\noptimized through the 3D Gaussian Splatting technique while BRDF and lighting\nare decomposed by physically-based differentiable rendering. Moreover, we\nintroduce an innovative point-based ray-tracing approach based on the bounding\nvolume hierarchy for efficient visibility baking, enabling real-time rendering\nand relighting of 3D Gaussian points with accurate shadow effects. Extensive\nexperiments demonstrate improved BRDF estimation and novel view rendering\nresults compared to state-of-the-art material estimation approaches. Our\nframework showcases the potential to revolutionize the mesh-based graphics\npipeline with a relightable, traceable, and editable rendering pipeline solely\nbased on point clouds. Project\npage: https://nju-3dv.github.io/projects/Relightable3DGaussian/.\n","authors":["Jian Gao","Chun Gu","Youtian Lin","Hao Zhu","Xun Cao","Li Zhang","Yao Yao"],"pdf_url":"https://arxiv.org/pdf/2311.16043v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16042v1","updated":"2023-11-27T18:06:35Z","published":"2023-11-27T18:06:35Z","title":"Weakly-Supervised 3D Reconstruction of Clothed Humans via Normal Maps","summary":" We present a novel deep learning-based approach to the 3D reconstruction of\nclothed humans using weak supervision via 2D normal maps. Given a single RGB\nimage or multiview images, our network infers a signed distance function (SDF)\ndiscretized on a tetrahedral mesh surrounding the body in a rest pose.\nSubsequently, inferred pose and camera parameters are used to generate a normal\nmap from the SDF. A key aspect of our approach is the use of Marching\nTetrahedra to (uniquely) compute a triangulated surface from the SDF on the\ntetrahedral mesh, facilitating straightforward differentiation (and thus\nbackpropagation). Thus, given only ground truth normal maps (with no ground truth\nvolumetric information), we can train the network to produce SDF\nvalues from corresponding RGB images. Optionally, an additional multiview loss\nleads to improved results. We demonstrate the efficacy of our approach for both\nnetwork inference and 3D reconstruction.\n","authors":["Jane Wu","Diego Thomas","Ronald Fedkiw"],"pdf_url":"https://arxiv.org/pdf/2311.16042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16038v1","updated":"2023-11-27T17:59:41Z","published":"2023-11-27T17:59:41Z","title":"OccWorld: Learning a 3D Occupancy World Model for Autonomous Driving","summary":" Understanding how the 3D scene evolves is vital for making decisions in\nautonomous driving. Most existing methods achieve this by predicting the\nmovements of object boxes, which cannot capture more fine-grained scene\ninformation. In this paper, we explore a new framework of learning a world\nmodel, OccWorld, in the 3D Occupancy space to simultaneously predict the\nmovement of the ego car and the evolution of the surrounding scenes. We propose\nto learn a world model based on 3D occupancy rather than 3D bounding boxes and\nsegmentation maps for three reasons: 1) expressiveness. 3D occupancy can\ndescribe the more fine-grained 3D structure of the scene; 2) efficiency. 3D\noccupancy is more economical to obtain (e.g., from sparse LiDAR points). 3)\nversatility. 3D occupancy can adapt to both vision and LiDAR. 
To facilitate the\nmodeling of the world evolution, we learn a reconstruction-based scene\ntokenizer on the 3D occupancy to obtain discrete scene tokens to describe the\nsurrounding scenes. We then adopt a GPT-like spatial-temporal generative\ntransformer to generate subsequent scene and ego tokens to decode the future\noccupancy and ego trajectory. Extensive experiments on the widely used nuScenes\nbenchmark demonstrate the ability of OccWorld to effectively model the\nevolution of the driving scenes. OccWorld also produces competitive planning\nresults without using instance and map supervision. Code:\nhttps://github.com/wzzheng/OccWorld.\n","authors":["Wenzhao Zheng","Weiliang Chen","Yuanhui Huang","Borui Zhang","Yueqi Duan","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2311.16038v1.pdf","comment":"Code is available at: https://github.com/wzzheng/OccWorld"},{"id":"http://arxiv.org/abs/2311.16037v1","updated":"2023-11-27T17:58:21Z","published":"2023-11-27T17:58:21Z","title":"GaussianEditor: Editing 3D Gaussians Delicately with Text Instructions","summary":" Recently, impressive results have been achieved in 3D scene editing with text\ninstructions based on a 2D diffusion model. However, current diffusion models\nprimarily generate images by predicting noise in the latent space, and the\nediting is usually applied to the whole image, which makes it challenging to\nperform delicate, especially localized, editing for 3D scenes. Inspired by\nrecent 3D Gaussian splatting, we propose a systematic framework, named\nGaussianEditor, to edit 3D scenes delicately via 3D Gaussians with text\ninstructions. Benefiting from the explicit property of 3D Gaussians, we design\na series of techniques to achieve delicate editing. Specifically, we first\nextract the region of interest (RoI) corresponding to the text instruction,\naligning it to 3D Gaussians. The Gaussian RoI is further used to control the\nediting process. Our framework can achieve more delicate and precise editing of\n3D scenes than previous methods while enjoying much faster training speed, i.e.\nwithin 20 minutes on a single V100 GPU, more than twice as fast as\nInstruct-NeRF2NeRF (45 minutes -- 2 hours).\n","authors":["Jiemin Fang","Junjie Wang","Xiaopeng Zhang","Lingxi Xie","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2311.16037v1.pdf","comment":"Project page: https://GaussianEditor.github.io"},{"id":"http://arxiv.org/abs/2310.06627v2","updated":"2023-11-27T16:59:39Z","published":"2023-10-10T13:45:59Z","title":"What If the TV Was Off? Examining Counterfactual Reasoning Abilities of\n Multi-modal Language Models","summary":" Counterfactual reasoning, a fundamental aspect of human cognition, involves\ncontemplating alternatives to established facts or past events, significantly\nenhancing our abilities in planning and decision-making. In light of the\nadvancements in current multi-modal large language models, we explore their\neffectiveness in counterfactual reasoning. To facilitate this investigation, we\nintroduce a novel dataset, C-VQA, specifically designed to test the\ncounterfactual reasoning capabilities of modern multi-modal large language\nmodels. This dataset is constructed by infusing original questions with\ncounterfactual presuppositions, spanning various types such as numerical and\nboolean queries. It encompasses a mix of real and synthetic data, representing\na wide range of difficulty levels. 
Our thorough evaluations of contemporary\nvision-language models using this dataset have revealed substantial performance\ndrops, with some models showing up to a 40\\% decrease, highlighting a\nsignificant gap between current models and human-like vision reasoning\ncapabilities. We hope our dataset will serve as a vital benchmark for\nevaluating the counterfactual reasoning capabilities of models. Code and\ndataset are publicly available at https://bzhao.me/C-VQA/.\n","authors":["Letian Zhang","Xiaotong Zhai","Zhongkai Zhao","Yongshuo Zong","Xin Wen","Bingchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.06627v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09257v3","updated":"2023-11-27T16:51:40Z","published":"2023-11-14T23:07:50Z","title":"UFOGen: You Forward Once Large Scale Text-to-Image Generation via\n Diffusion GANs","summary":" Text-to-image diffusion models have demonstrated remarkable capabilities in\ntransforming textual prompts into coherent images, yet the computational cost\nof their inference remains a persistent challenge. To address this issue, we\npresent UFOGen, a novel generative model designed for ultra-fast, one-step\ntext-to-image synthesis. In contrast to conventional approaches that focus on\nimproving samplers or employing distillation techniques for diffusion models,\nUFOGen adopts a hybrid methodology, integrating diffusion models with a GAN\nobjective. Leveraging a newly introduced diffusion-GAN objective and\ninitialization with pre-trained diffusion models, UFOGen excels in efficiently\ngenerating high-quality images conditioned on textual descriptions in a single\nstep. Beyond traditional text-to-image generation, UFOGen showcases versatility\nin applications. Notably, UFOGen stands among the pioneering models enabling\none-step text-to-image generation and diverse downstream tasks, presenting a\nsignificant advancement in the landscape of efficient generative models.\n","authors":["Yanwu Xu","Yang Zhao","Zhisheng Xiao","Tingbo Hou"],"pdf_url":"https://arxiv.org/pdf/2311.09257v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16001v1","updated":"2023-11-27T16:47:09Z","published":"2023-11-27T16:47:09Z","title":"Automated Measurement of Vascular Calcification in Femoral\n Endarterectomy Patients Using Deep Learning","summary":" Atherosclerosis, a chronic inflammatory disease affecting the large arteries,\npresents a global health risk. Accurate analysis of diagnostic images, like\ncomputed tomographic angiograms (CTAs), is essential for staging and monitoring\nthe progression of atherosclerosis-related conditions, including peripheral\narterial disease (PAD). However, manual analysis of CTA images is\ntime-consuming and tedious. To address this limitation, we employed a deep\nlearning model to segment the vascular system in CTA images of PAD patients\nundergoing femoral endarterectomy surgery and to measure vascular calcification\nfrom the left renal artery to the patella. Utilizing proprietary CTA images of\n27 patients undergoing femoral endarterectomy surgery provided by Prisma Health\nMidlands, we developed a Deep Neural Network (DNN) model to first segment the\narterial system, starting from the descending aorta to the patella, and second,\nto provide a metric of arterial calcification. Our designed DNN achieved 83.4%\naverage Dice accuracy in segmenting arteries from aorta to patella, advancing\nthe state-of-the-art by 0.8%. 
Furthermore, our work is the first to present a\nrobust statistical analysis of automated calcification measurement in the lower\nextremities using deep learning, attaining a Mean Absolute Percentage Error\n(MAPE) of 9.5% and a correlation coefficient of 0.978 between automated and\nmanual calcification scores. These findings underscore the potential of deep\nlearning techniques as a rapid and accurate tool for medical professionals to\nassess calcification in the abdominal aorta and its branches above the patella.\nThe developed DNN model and related documentation in this project are available\nat GitHub page at https://github.com/pip-alireza/DeepCalcScoring.\n","authors":["Alireza Bagheri Rajeoni","Breanna Pederson","Daniel G. Clair","Susan M. Lessner","Homayoun Valafar"],"pdf_url":"https://arxiv.org/pdf/2311.16001v1.pdf","comment":"Published in MDPI Diagnostic journal, the code can be accessed via\n the GitHub link in the paper"},{"id":"http://arxiv.org/abs/2310.10541v2","updated":"2023-11-27T16:45:18Z","published":"2023-10-16T16:13:53Z","title":"AST: Effective Dataset Distillation through Alignment with Smooth and\n High-Quality Expert Trajectories","summary":" Training large AI models typically requires large-scale datasets in the\nmachine learning process, making training and parameter-tuning process both\ntime-consuming and costly. Some researchers address this problem by carefully\nsynthesizing a very small number of highly representative and informative\nsamples from real-world datasets. This approach, known as Dataset Distillation\n(DD), proposes a perspective for data-efficient learning. Despite recent\nprogress in this field, the performance of existing methods still cannot meet\nexpectations, and distilled datasets cannot effectively replace original\ndatasets. In this paper, unlike previous methods that focus solely on improving\nthe effectiveness of student distillation, we recognize and leverage the\nimportant mutual influence between expert and student models. We observed that\nthe smoothness of expert trajectories has a significant impact on subsequent\nstudent parameter alignment. Based on this, we propose an effective DD\nframework named AST, standing for Alignment with Smooth and high-quality expert\nTrajectories. We devise the integration of clipping loss and gradient penalty\nto regulate the rate of parameter changes in expert trajectory generation. To\nfurther refine the student parameter alignment with expert trajectory, we put\nforward representative initialization for the synthetic dataset and balanced\ninner-loop loss in response to the sensitivity exhibited towards randomly\ninitialized variables during distillation. We also propose two enhancement\nstrategies, namely intermediate matching loss and weight perturbation, to\nmitigate the potential occurrence of cumulative errors. We conduct extensive\nexperiments on datasets of different scales, sizes, and resolutions. 
The\nresults demonstrate that the proposed method significantly outperforms prior\nmethods.\n","authors":["Jiyuan Shen","Wenzhuo Yang","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2310.10541v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15994v1","updated":"2023-11-27T16:43:37Z","published":"2023-11-27T16:43:37Z","title":"Adversarial Doodles: Interpretable and Human-drawable Attacks Provide\n Describable Insights","summary":" DNN-based image classification models are susceptible to adversarial attacks.\nMost previous adversarial attacks do not focus on the interpretability of the\ngenerated adversarial examples, and we cannot gain insights into the mechanism\nof the target classifier from the attacks. Therefore, we propose Adversarial\nDoodles, which have interpretable shapes. We optimize black b\'ezier curves to\nfool the target classifier by overlaying them onto the input image. By\nintroducing random perspective transformation and regularizing the doodled\narea, we obtain compact attacks that cause misclassification even when humans\nreplicate them by hand. Adversarial doodles provide describable and intriguing\ninsights into the relationship between our attacks and the classifier's output.\nWe utilize adversarial doodles and discover the bias inherent in the target\nclassifier, such as \"We add two strokes on its head, a triangle onto its body,\nand two lines inside the triangle on a bird image. Then, the classifier\nmisclassifies the image as a butterfly.\"\n","authors":["Ryoya Nara","Yusuke Matsui"],"pdf_url":"https://arxiv.org/pdf/2311.15994v1.pdf","comment":"Submitted to CVPR 2024"},{"id":"http://arxiv.org/abs/2311.15993v1","updated":"2023-11-27T16:41:31Z","published":"2023-11-27T16:41:31Z","title":"Unified Batch Normalization: Identifying and Alleviating the Feature\n Condensation in Batch Normalization and a Unified Framework","summary":" Batch Normalization (BN) has become an essential technique in contemporary\nneural network design, enhancing training stability. Specifically, BN employs\ncentering and scaling operations to standardize features along the batch\ndimension and uses an affine transformation to recover features. Although\nstandard BN has shown its capability to improve deep neural network training\nand convergence, it still exhibits inherent limitations in certain cases. Most\nexisting techniques that enhance BN consider a single or a few aspects of BN.\nIn this paper, we first identify problems with BN from a feature perspective\nand show that feature condensation occurs during learning when employing BN,\nwhich negatively affects testing performance. To tackle this problem, we\npropose a two-stage unified framework called Unified Batch Normalization (UBN).\nIn the first stage, we utilize a simple feature condensation threshold to\nalleviate the feature condensation, which hinders inappropriate statistic\nupdates in normalization. In the second stage, we unify various normalization\nvariants to boost each component of BN. Our experimental results reveal that\nUBN significantly enhances performance across different visual backbones and\nnotably expedites network training convergence, particularly in early training\nstages. 
Notably, our method improved top-1 accuracy by about 3% on ImageNet\nclassification with large batch sizes, showing the effectiveness of our\napproach in real-world scenarios.\n","authors":["Shaobo Wang","Xiangdong Zhang","Junchi Yan"],"pdf_url":"https://arxiv.org/pdf/2311.15993v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15991v1","updated":"2023-11-27T16:40:09Z","published":"2023-11-27T16:40:09Z","title":"DiffAnt: Diffusion Models for Action Anticipation","summary":" Anticipating future actions is inherently uncertain. Given an observed video\nsegment containing ongoing actions, multiple subsequent actions can plausibly\nfollow. This uncertainty becomes even larger when predicting far into the\nfuture. However, the majority of existing action anticipation models adhere to\na deterministic approach, neglecting to account for future uncertainties. In\nthis work, we rethink action anticipation from a generative view, employing\ndiffusion models to capture different possible future actions. In this\nframework, future actions are iteratively generated from standard Gaussian\nnoise in the latent space, conditioned on the observed video, and subsequently\ntransitioned into the action space. Extensive experiments on four benchmark\ndatasets, i.e., Breakfast, 50Salads, EpicKitchens, and EGTEA Gaze+, are\nperformed, and the proposed method achieves results superior or comparable to\nstate-of-the-art methods, showing the effectiveness of a generative approach\nfor action anticipation. Our code and trained models will be published on\nGitHub.\n","authors":["Zeyun Zhong","Chengzhi Wu","Manuel Martin","Michael Voit","Juergen Gall","Jürgen Beyerer"],"pdf_url":"https://arxiv.org/pdf/2311.15991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12144v4","updated":"2023-11-27T16:38:44Z","published":"2023-11-20T19:45:27Z","title":"Applications of Large Scale Foundation Models for Autonomous Driving","summary":" Since the DARPA Grand Challenges (rural) in 2004/05 and the Urban Challenges in 2007,\nautonomous driving has been the most active field of AI applications. Recently,\npowered by large language models (LLMs), chat systems such as ChatGPT and\nPaLM have emerged and rapidly become a promising direction toward achieving artificial\ngeneral intelligence (AGI) in natural language processing (NLP). It is natural to\nthink that we could employ these abilities to reformulate autonomous\ndriving. By combining LLMs with foundation models, it is possible to utilize\nhuman knowledge, commonsense, and reasoning to rebuild autonomous driving\nsystems and escape the current long-tailed AI dilemma. In this paper, we investigate\nthe techniques of foundation models and LLMs applied to autonomous driving,\ncategorized as simulation, world models, data annotation, and planning or E2E\nsolutions, etc.\n","authors":["Yu Huang","Yue Chen","Zhu Li"],"pdf_url":"https://arxiv.org/pdf/2311.12144v4.pdf","comment":"22 pages. arXiv admin note: text overlap with arXiv:2304.03589 by\n other authors"},{"id":"http://arxiv.org/abs/2311.15980v1","updated":"2023-11-27T16:26:54Z","published":"2023-11-27T16:26:54Z","title":"Direct2.5: Diverse Text-to-3D Generation via Multi-view 2.5D Diffusion","summary":" Recent advances in generative AI have unveiled significant potential for the\ncreation of 3D content. However, current methods either apply a pre-trained 2D\ndiffusion model with the time-consuming score distillation sampling (SDS), or a\ndirect 3D diffusion model trained on limited 3D data, losing generation\ndiversity. 
In this work, we approach the problem by employing a multi-view 2.5D\ndiffusion model fine-tuned from a pre-trained 2D diffusion model. The multi-view 2.5D\ndiffusion directly models the structural distribution of 3D data, while still\nmaintaining the strong generalization ability of the original 2D diffusion\nmodel, filling the gap between 2D diffusion-based and direct 3D diffusion-based\nmethods for 3D content generation. During inference, multi-view normal maps are\ngenerated using the 2.5D diffusion, and a novel differentiable rasterization\nscheme is introduced to fuse the almost consistent multi-view normal maps into\na consistent 3D model. We further design a normal-conditioned multi-view image\ngeneration module for fast appearance generation given the 3D geometry. Our\nmethod is a one-pass diffusion process and does not require any SDS\noptimization as post-processing. We demonstrate through extensive experiments\nthat our direct 2.5D generation with the specially designed fusion scheme can\nachieve diverse, mode-seeking-free, and high-fidelity 3D content generation in\nonly 10 seconds. Project page: https://nju-3dv.github.io/projects/direct25.\n","authors":["Yuanxun Lu","Jingyang Zhang","Shiwei Li","Tian Fang","David McKinnon","Yanghai Tsin","Long Quan","Xun Cao","Yao Yao"],"pdf_url":"https://arxiv.org/pdf/2311.15980v1.pdf","comment":"Project webpage: https://nju-3dv.github.io/projects/direct25"},{"id":"http://arxiv.org/abs/2304.00553v3","updated":"2023-11-27T16:24:59Z","published":"2023-04-02T15:04:43Z","title":"From Isolated Islands to Pangea: Unifying Semantic Space for Human\n Action Understanding","summary":" Action understanding is a vital step toward intelligent agents and has\nattracted long-term attention. It can be formulated as a mapping from the\nphysical action space to the semantic space. Typically,\nresearchers have built action datasets according to idiosyncratic choices to define\nclasses and push the envelope of their respective benchmarks. Thus, datasets are\nincompatible with each other, like \"Isolated Islands\", due to semantic gaps and\nvarying class granularities, e.g., do housework in dataset A and wash plate in\ndataset B. We argue that a more principled semantic space is urgently needed to\nconcentrate the community's efforts and enable us to use all datasets together to\npursue generalizable action learning. To this end, we design a structured\naction semantic space based on the verb taxonomy hierarchy, covering massive\nactions. By aligning the classes of previous datasets to our semantic space, we\ngather (image/video/skeleton/MoCap) datasets into a unified database under a\nunified label system, i.e., bridging ``isolated islands'' into a \"Pangea\".\nAccordingly, we propose a novel model that maps from the physical space to the\nsemantic space to fully use Pangea. In extensive experiments, our new system\nshows significant superiority, especially in transfer learning. 
Code and data\nwill be made publicly available.\n","authors":["Yong-Lu Li","Xiaoqian Wu","Xinpeng Liu","Zehao Wang","Yiming Dou","Yikun Ji","Junyi Zhang","Yixing Li","Jingru Tan","Xudong Lu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2304.00553v3.pdf","comment":"Project Webpage: https://mvig-rhos.com/pangea"},{"id":"http://arxiv.org/abs/2311.15977v1","updated":"2023-11-27T16:23:01Z","published":"2023-11-27T16:23:01Z","title":"Text2Loc: 3D Point Cloud Localization from Natural Language","summary":" We tackle the problem of 3D point cloud localization based on a few natural\nlinguistic descriptions and introduce a novel neural network, Text2Loc, that\nfully interprets the semantic relationship between points and text. Text2Loc\nfollows a coarse-to-fine localization pipeline: text-submap global place\nrecognition, followed by fine localization. In global place recognition,\nrelational dynamics among each textual hint are captured in a hierarchical\ntransformer with max-pooling (HTM), whereas a balance between positive and\nnegative pairs is maintained using text-submap contrastive learning. Moreover,\nwe propose a novel matching-free fine localization method to further refine the\nlocation predictions, which completely removes the need for complicated\ntext-instance matching and is lighter, faster, and more accurate than previous\nmethods. Extensive experiments show that Text2Loc improves the localization\naccuracy by up to $2\\times$ over the state-of-the-art on the KITTI360Pose\ndataset. We will make the code publicly available.\n","authors":["Yan Xia","Letian Shi","Zifeng Ding","João F. Henriques","Daniel Cremers"],"pdf_url":"https://arxiv.org/pdf/2311.15977v1.pdf","comment":"10 pages, 6 figures, 6 tables"},{"id":"http://arxiv.org/abs/2309.14809v2","updated":"2023-11-27T16:09:03Z","published":"2023-09-26T10:14:44Z","title":"ENIGMA-51: Towards a Fine-Grained Understanding of Human-Object\n Interactions in Industrial Scenarios","summary":" ENIGMA-51 is a new egocentric dataset acquired in an industrial scenario by\n19 subjects who followed instructions to complete the repair of electrical\nboards using industrial tools (e.g., electric screwdriver) and equipments\n(e.g., oscilloscope). The 51 egocentric video sequences are densely annotated\nwith a rich set of labels that enable the systematic study of human behavior in\nthe industrial domain. We provide benchmarks on four tasks related to human\nbehavior: 1) untrimmed temporal detection of human-object interactions, 2)\negocentric human-object interaction detection, 3) short-term object interaction\nanticipation and 4) natural language understanding of intents and entities.\nBaseline results show that the ENIGMA-51 dataset poses a challenging benchmark\nto study human behavior in industrial scenarios. We publicly release the\ndataset at https://iplab.dmi.unict.it/ENIGMA-51.\n","authors":["Francesco Ragusa","Rosario Leonardi","Michele Mazzamuto","Claudia Bonanno","Rosario Scavo","Antonino Furnari","Giovanni Maria Farinella"],"pdf_url":"https://arxiv.org/pdf/2309.14809v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15965v1","updated":"2023-11-27T16:07:39Z","published":"2023-11-27T16:07:39Z","title":"FALCON: Fairness Learning via Contrastive Attention Approach to\n Continual Semantic Scene Understanding in Open World","summary":" Continual Learning in semantic scene segmentation aims to continually learn\nnew unseen classes in dynamic environments while maintaining previously learned\nknowledge. 
Prior studies focused on modeling the catastrophic forgetting and\nbackground shift challenges in continual learning. However, fairness, another\nmajor challenge that causes unfair predictions leading to low performance among\nmajor and minor classes, still needs to be well addressed. In addition, prior\nmethods have yet to model the unknown classes well, thus resulting in producing\nnon-discriminative features among unknown classes. This paper presents a novel\nFairness Learning via Contrastive Attention Approach to continual learning in\nsemantic scene understanding. In particular, we first introduce a new Fairness\nContrastive Clustering loss to address the problems of catastrophic forgetting\nand fairness. Then, we propose an attention-based visual grammar approach to\neffectively model the background shift problem and unknown classes, producing\nbetter feature representations for different unknown classes. Through our\nexperiments, our proposed approach achieves State-of-the-Art (SOTA) performance\non different continual learning settings of three standard benchmarks, i.e.,\nADE20K, Cityscapes, and Pascal VOC. It promotes the fairness of the continual\nsemantic segmentation model.\n","authors":["Thanh-Dat Truong","Utsav Prabhu","Bhiksha Raj","Jackson Cothren","Khoa Luu"],"pdf_url":"https://arxiv.org/pdf/2311.15965v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15964v1","updated":"2023-11-27T16:07:37Z","published":"2023-11-27T16:07:37Z","title":"Efficient Pre-training for Localized Instruction Generation of Videos","summary":" Procedural videos show step-by-step demonstrations of tasks like recipe\npreparation. Understanding such videos is challenging, involving the precise\nlocalization of steps and the generation of textual instructions. Manually\nannotating steps and writing instructions is costly, which limits the size of\ncurrent datasets and hinders effective learning. Leveraging large but noisy\nvideo-transcript datasets for pre-training can boost performance, but demands\nsignificant computational resources. Furthermore, transcripts contain\nirrelevant content and exhibit style variation compared to instructions written\nby human annotators. To mitigate both issues, we propose a technique,\nSieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters\nirrelevant transcripts and (ii) Swap enhances the quality of the text\ninstruction by automatically replacing the transcripts with human-written\ninstructions from a text-only recipe dataset. The curated dataset, three orders\nof magnitude smaller than current web-scale datasets, enables efficient\ntraining of large-scale models with competitive performance. We complement our\nSieve-\\&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step\nlocalization and instruction generation for procedural videos. 
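The Sieve-&-Swap curation idea above (filter irrelevant transcript lines, then replace the survivors with human-written recipe instructions) can be pictured as a small retrieval loop. The sketch below is only illustrative: the hash-based embed() stand-in, the similarity threshold, and all function names are assumptions, not components of the actual pipeline.

# Illustrative sketch only: "sieve" noisy transcript lines with a relevance
# score, then "swap" each survivor for the nearest human-written instruction.
# The embedding and threshold below are stand-ins, not the paper's components.
import hashlib
import numpy as np

def embed(text, dim=64):
    # Toy deterministic embedding; a real system would use a sentence encoder.
    seed = int.from_bytes(hashlib.sha256(text.encode()).digest()[:4], "little")
    rng = np.random.default_rng(seed)
    v = rng.standard_normal(dim)
    return v / np.linalg.norm(v)

def sieve(transcript, recipe_bank, threshold=0.0):
    # Keep transcript lines whose best similarity to any recipe step clears an
    # (arbitrary) threshold.
    bank = np.stack([embed(r) for r in recipe_bank])
    return [line for line in transcript if (bank @ embed(line)).max() > threshold]

def swap(kept, recipe_bank):
    # Replace each kept transcript line with its nearest human-written instruction.
    bank = np.stack([embed(r) for r in recipe_bank])
    return [recipe_bank[int(np.argmax(bank @ embed(line)))] for line in kept]

if __name__ == "__main__":
    transcript = ["so yeah just chop the onions", "hit subscribe", "now fry them gently"]
    recipes = ["Chop the onions finely.", "Fry the onions over low heat."]
    print(swap(sieve(transcript, recipes), recipes))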
When this model\nis pre-trained on our curated dataset, it achieves state-of-the-art performance\nin zero-shot and finetuning settings on YouCook2 and Tasty, while using a\nfraction of the computational resources.\n","authors":["Anil Batra","Davide Moltisanti","Laura Sevilla-Lara","Marcus Rohrbach","Frank Keller"],"pdf_url":"https://arxiv.org/pdf/2311.15964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15963v1","updated":"2023-11-27T16:07:34Z","published":"2023-11-27T16:07:34Z","title":"From Pixels to Titles: Video Game Identification by Screenshots using\n Convolutional Neural Networks","summary":" This paper investigates video game identification through single screenshots,\nutilizing five convolutional neural network (CNN) architectures (MobileNet,\nDenseNet, EfficientNetB0, EfficientNetB2, and EfficientNetB3) across 22 home\nconsole systems, spanning from Atari 2600 to PlayStation 5. Confirming the\nhypothesis, CNNs autonomously extract image features, enabling the\nidentification of game titles from screenshots without additional features.\nUsing ImageNet pre-trained weights, EfficientNetB3 achieves the highest average\naccuracy (74.51%), while DenseNet169 excels in 14 of the 22 systems. Employing\nalternative initial weights from another screenshots dataset boosts accuracy\nfor EfficientNetB2 and EfficientNetB3, with the latter reaching a peak accuracy\nof 76.36% and demonstrating reduced convergence epochs from 23.7 to 20.5 on\naverage. Overall, the combination of optimal architecture and weights attains\n77.67% accuracy, primarily led by EfficientNetB3 in 19 systems. These findings\nunderscore the efficacy of CNNs in video game identification through\nscreenshots.\n","authors":["Fabricio Breve"],"pdf_url":"https://arxiv.org/pdf/2311.15963v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10093v2","updated":"2023-11-27T15:58:30Z","published":"2023-11-16T18:59:51Z","title":"The Chosen One: Consistent Characters in Text-to-Image Diffusion Models","summary":" Recent advances in text-to-image generation models have unlocked vast\npotential for visual creativity. However, these models struggle with generation\nof consistent characters, a crucial aspect for numerous real-world applications\nsuch as story visualization, game development asset design, advertising, and\nmore. Current methods typically rely on multiple pre-existing images of the\ntarget character or involve labor-intensive manual processes. In this work, we\npropose a fully automated solution for consistent character generation, with\nthe sole input being a text prompt. We introduce an iterative procedure that,\nat each stage, identifies a coherent set of images sharing a similar identity\nand extracts a more consistent identity from this set. Our quantitative\nanalysis demonstrates that our method strikes a better balance between prompt\nalignment and identity consistency compared to the baseline methods, and these\nfindings are reinforced by a user study. To conclude, we showcase several\npractical applications of our approach. 
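The consistent-character procedure summarized above (repeatedly identify the most coherent subset of generations and distill a sharper identity from it) reduces, at its core, to an iterative centroid refinement over image embeddings. Below is a rough sketch under that reading; the cohesion rule, keep ratio, and names are assumptions rather than the paper's algorithm.

# Illustrative sketch: iteratively distill a "consistent identity" from a set
# of image embeddings by keeping the most cohesive subset around the current
# centroid and re-estimating it. All parameters here are assumed, not the paper's.
import numpy as np

def consistent_identity(embeddings, keep_ratio=0.5, iters=5):
    emb = np.asarray(embeddings, dtype=float)
    emb = emb / np.linalg.norm(emb, axis=1, keepdims=True)
    identity = emb.mean(axis=0)
    for _ in range(iters):
        sims = (emb @ identity) / np.linalg.norm(identity)
        keep = np.argsort(sims)[-max(1, int(len(emb) * keep_ratio)):]
        identity = emb[keep].mean(axis=0)  # refined identity from the coherent subset
    return identity / np.linalg.norm(identity)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    coherent = rng.normal(1.0, 0.1, size=(8, 16))   # generations sharing an identity
    outliers = rng.normal(0.0, 1.0, size=(4, 16))   # off-identity generations
    print(consistent_identity(np.vstack([coherent, outliers]))[:4])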
Project page is available at\nhttps://omriavrahami.com/the-chosen-one\n","authors":["Omri Avrahami","Amir Hertz","Yael Vinker","Moab Arar","Shlomi Fruchter","Ohad Fried","Daniel Cohen-Or","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2311.10093v2.pdf","comment":"Project page is available at https://omriavrahami.com/the-chosen-one"},{"id":"http://arxiv.org/abs/2311.15941v1","updated":"2023-11-27T15:49:29Z","published":"2023-11-27T15:49:29Z","title":"Tell2Design: A Dataset for Language-Guided Floor Plan Generation","summary":" We consider the task of generating designs directly from natural language\ndescriptions, and consider floor plan generation as the initial research area.\nLanguage conditional generative models have recently been very successful in\ngenerating high-quality artistic images. However, designs must satisfy\ndifferent constraints that are not present in generating artistic images,\nparticularly spatial and relational constraints. We make multiple contributions\nto initiate research on this task. First, we introduce a novel dataset,\n\\textit{Tell2Design} (T2D), which contains more than $80k$ floor plan designs\nassociated with natural language instructions. Second, we propose a\nSequence-to-Sequence model that can serve as a strong baseline for future\nresearch. Third, we benchmark this task with several text-conditional image\ngeneration models. We conclude by conducting human evaluations on the generated\nsamples and providing an analysis of human performance. We hope our\ncontributions will propel the research on language-guided design generation\nforward.\n","authors":["Sicong Leng","Yang Zhou","Mohammed Haroon Dupty","Wee Sun Lee","Sam Conrad Joyce","Wei Lu"],"pdf_url":"https://arxiv.org/pdf/2311.15941v1.pdf","comment":"Paper published in ACL2023; Area Chair Award; Best Paper Nomination"},{"id":"http://arxiv.org/abs/2311.15939v1","updated":"2023-11-27T15:46:47Z","published":"2023-11-27T15:46:47Z","title":"Unleashing the Power of Prompt-driven Nucleus Instance Segmentation","summary":" Nuclear instance segmentation in histology images is crucial for a broad\nspectrum of clinical applications. Current prevailing nuclear instance\nsegmentation algorithms rely on regression of nuclei contours, distance maps,\nwatershed markers or a proxy nuclear representation of star-convex polygons.\nConsequently, these methods necessitate sophisticated post-processing\noperations to distinguish nuclei instances, which are commonly acknowledged to\nbe error-prone and parameter-sensitive. Recently, the segment anything model\n(SAM) has earned attracted huge attention within the domain of medical image\nsegmentation due to its impressive generalization ability and promptable\nproperty. Nevertheless, its potential on nuclear instance segmentation remains\nlargely underexplored. In this paper, we present a novel prompt-driven\nframework that consists of a point prompter and a SAM for automatic nuclei\ninstance segmentation. Specifically, the prompter learns to generate a unique\npoint prompt for each nucleus while the SAM is fine tuned to output the\ncorresponding mask of the cued nucleus. Furthermore, we propose to add adjacent\nnuclei as negative prompts to promote the model's ability to recognize\noverlapping nuclei. Without bells and whistles, our proposed method sets a new\nstate-of-the-art performance on three challenging benchmarks. 
Our code is\navailable at\n\\textcolor{magenta}{\\url{https://github.com/windygoo/PromptNucSeg}} .\n","authors":["Zhongyi Shui","Yunlong Zhang","Kai Yao","Chenglu Zhu","Yuxuan Sun","Lin Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15937v1","updated":"2023-11-27T15:46:19Z","published":"2023-11-27T15:46:19Z","title":"Optimal Transport Aggregation for Visual Place Recognition","summary":" The task of Visual Place Recognition (VPR) aims to match a query image\nagainst references from an extensive database of images from different places,\nrelying solely on visual cues. State-of-the-art pipelines focus on the\naggregation of features extracted from a deep backbone, in order to form a\nglobal descriptor for each image. In this context, we introduce SALAD (Sinkhorn\nAlgorithm for Locally Aggregated Descriptors), which reformulates NetVLAD's\nsoft-assignment of local features to clusters as an optimal transport problem.\nIn SALAD, we consider both feature-to-cluster and cluster-to-feature relations\nand we also introduce a 'dustbin' cluster, designed to selectively discard\nfeatures deemed non-informative, enhancing the overall descriptor quality.\nAdditionally, we leverage and fine-tune DINOv2 as a backbone, which provides\nenhanced description power for the local features, and dramatically reduces the\nrequired training time. As a result, our single-stage method not only surpasses\nsingle-stage baselines in public VPR datasets, but also surpasses two-stage\nmethods that add a re-ranking with significantly higher cost. Code and models\nare available at https://github.com/serizba/salad.\n","authors":["Sergio Izquierdo","Javier Civera"],"pdf_url":"https://arxiv.org/pdf/2311.15937v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15916v1","updated":"2023-11-27T15:24:54Z","published":"2023-11-27T15:24:54Z","title":"ADM-Loc: Actionness Distribution Modeling for Point-supervised Temporal\n Action Localization","summary":" This paper addresses the challenge of point-supervised temporal action\ndetection, in which only one frame per action instance is annotated in the\ntraining set. Self-training aims to provide supplementary supervision for the\ntraining process by generating pseudo-labels (action proposals) from a base\nmodel. However, most current methods generate action proposals by applying\nmanually designed thresholds to action classification probabilities and\ntreating adjacent snippets as independent entities. As a result, these methods\nstruggle to generate complete action proposals, exhibit sensitivity to\nfluctuations in action classification scores, and generate redundant and\noverlapping action proposals. This paper proposes a novel framework termed\nADM-Loc, which stands for Actionness Distribution Modeling for point-supervised\naction Localization. ADM-Loc generates action proposals by fitting a composite\ndistribution, comprising both Gaussian and uniform distributions, to the action\nclassification signals. This fitting process is tailored to each action class\npresent in the video and is applied separately for each action instance,\nensuring the distinctiveness of their distributions. 
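The composite-distribution fitting just described (a Gaussian for the action instance plus a uniform floor over the snippet classification scores) can be approximated with a few EM iterations on 1D scores. The sketch below is a generic toy EM under that assumption, not ADM-Loc's actual estimator.

# Illustrative sketch: fit a two-component mixture (one Gaussian + one uniform
# over [0, 1]) to per-snippet action scores with a few EM steps.
import numpy as np

def fit_gaussian_plus_uniform(scores, iters=50):
    x = np.asarray(scores, dtype=float)
    mu, sigma, w_gauss = x.mean(), max(x.std(), 1e-3), 0.5
    uniform_pdf = 1.0  # density of U(0, 1)
    for _ in range(iters):
        # E-step: responsibility of the Gaussian component for each score.
        gauss_pdf = np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))
        resp = w_gauss * gauss_pdf / (w_gauss * gauss_pdf + (1 - w_gauss) * uniform_pdf)
        # M-step: re-estimate Gaussian parameters and mixing weight.
        w_gauss = resp.mean()
        mu = (resp * x).sum() / resp.sum()
        sigma = max(np.sqrt((resp * (x - mu) ** 2).sum() / resp.sum()), 1e-3)
    return mu, sigma, w_gauss

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    scores = np.concatenate([rng.normal(0.7, 0.05, 40).clip(0, 1), rng.uniform(0, 1, 200)])
    print(fit_gaussian_plus_uniform(scores))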
ADM-Loc significantly\nenhances the alignment between the generated action proposals and ground-truth\naction instances and offers high-quality pseudo-labels for self-training.\nMoreover, to model action boundary snippets, it enforces consistency in action\nclassification scores during training by employing Gaussian kernels, supervised\nwith the proposed loss functions. ADM-Loc outperforms the state-of-the-art\npoint-supervised methods on THUMOS14 and ActivityNet-v1.2 datasets.\n","authors":["Elahe Vahdani","Yingli Tian"],"pdf_url":"https://arxiv.org/pdf/2311.15916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01908v2","updated":"2023-11-27T15:23:27Z","published":"2023-11-03T13:38:42Z","title":"LLM-driven Multimodal Target Volume Contouring in Radiation Oncology","summary":" Target volume contouring for radiation therapy is considered significantly\nmore challenging than the normal organ segmentation tasks as it necessitates\nthe utilization of both image and text-based clinical information. Inspired by\nthe recent advancement of large language models (LLMs) that can facilitate the\nintegration of the textural information and images, here we present a novel\nLLM-driven multi-modal AI that utilizes the clinical text information and is\napplicable to the challenging task of target volume contouring for radiation\ntherapy, and validate it within the context of breast cancer radiation therapy\ntarget volume contouring. Using external validation and data-insufficient\nenvironments, which attributes highly conducive to real-world applications, we\ndemonstrate that the proposed model exhibits markedly improved performance\ncompared to conventional vision-only AI models, particularly exhibiting robust\ngeneralization performance and data-efficiency. To our best knowledge, this is\nthe first LLM-driven multimodal AI model that integrates the clinical text\ninformation into target volume delineation for radiation oncology.\n","authors":["Yujin Oh","Sangjoon Park","Hwa Kyung Byun","Jin Sung Kim","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.01908v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15914v1","updated":"2023-11-27T15:23:25Z","published":"2023-11-27T15:23:25Z","title":"Computer Vision for Carriers: PATRIOT","summary":" Deck tracking performed on carriers currently involves a team of sailors\nmanually identifying aircraft and updating a digital user interface called the\nOuija Board. Improvements to the deck tracking process would result in\nincreased Sortie Generation Rates, and therefore applying automation is seen as\na critical method to improve deck tracking. However, the requirements on a\ncarrier ship do not allow for the installation of hardware-based location\nsensing technologies like Global Positioning System (GPS) sensors. PATRIOT\n(Panoramic Asset Tracking of Real-Time Information for the Ouija Tabletop) is a\nresearch effort and proposed solution to performing deck tracking with passive\nsensing and without the need for GPS sensors. PATRIOT is a prototype system\nwhich takes existing camera feeds, calculates aircraft poses, and updates a\nvirtual Ouija board interface with the current status of the assets. PATRIOT\nwould allow for faster, more accurate, and less laborious asset tracking for\naircraft, people, and support equipment. 
PATRIOT is anticipated to benefit the\nwarfighter by reducing cognitive workload, reducing manning requirements,\ncollecting data to improve logistics, and enabling an automation gateway for\nfuture efforts to improve efficiency and safety. The authors have developed and\ntested algorithms to perform pose estimations of assets in real-time including\nOpenPifPaf, High-Resolution Network (HRNet), HigherHRNet (HHRNet), Faster\nR-CNN, and in-house developed encoder-decoder network. The software was tested\nwith synthetic and real-world data and was able to accurately extract the pose\nof assets. Fusion, tracking, and real-world generality are planned to be\nimproved to ensure a successful transition to the fleet.\n","authors":["Ari Goodman","Gurpreet Singh","James Hing","Ryan O'Shea"],"pdf_url":"https://arxiv.org/pdf/2311.15914v1.pdf","comment":"8 pages, 18 figures. Published in the Proceedings of the ASNE 2023\n Technology, Systems & Ships Symposium. Reproduced with permission from the\n American Society of Naval Engineers. Distribution Statement A: Approved for\n public release; distribution is unlimited, as submitted under NAVAIR Public\n Release Authorization 2023-019"},{"id":"http://arxiv.org/abs/2311.15912v1","updated":"2023-11-27T15:22:17Z","published":"2023-11-27T15:22:17Z","title":"LIFT OFF: LoRaWAN Installation and Fiducial Tracking Operations for the\n Flightline of the Future","summary":" Real-time situational awareness for the location of assets is critical to\nensure missions are completed efficiently and requirements are satisfied. In\nmany commercial settings, the application of global positioning system (GPS)\nsensors is appropriate to achieve timely knowledge of the position of people\nand equipment. However, GPS sensors are not appropriate for all situations due\nto flight clearance and operations security concerns. LIFT OFF: LoRaWAN\nInstallation and Fiducial Tracking Operations for the Flightline of the Future\nproposes a hybrid framework solution to achieve real-time situational awareness\nfor people, support equipment, and aircraft positions regardless of the\nenvironment. This framework included a machine-vision component, which involved\nsetting up cameras to detect AprilTag decals that were installed on the sides\nof aircraft. The framework included a geolocation sensor component, which\ninvolved installing GPS sensors on support equipment and helmets. The framework\nalso included creating a long-range wide area network (LoRaWAN) to transfer\ndata and developing a user interface to display the data. The framework was\ntested at Naval Air Station Oceana Flightline, the United States Naval Test\nPilot School, and at Naval Air Warfare Center Aircraft Division Lakehurst. LIFT\nOFF successfully provided a real-time updating map of all tracked assets using\nGPS sensors for people and support equipment and with visual fiducials for\naircraft. The trajectories of the assets were recorded for logistical analysis\nand playback. Future follow-on work is anticipated to apply the technology to\nother environments including carriers and amphibious assault ships in addition\nto the flightline.\n","authors":["Ari Goodman","Ryan O'Shea"],"pdf_url":"https://arxiv.org/pdf/2311.15912v1.pdf","comment":"6 pages, 11 figures. Published in the Proceedings of the ASNE 2023\n Technology, Systems & Ships Symposium. Reproduced with permission from the\n American Society of Naval Engineers. 
Distribution Statement A: Approved for\n public release; distribution is unlimited, as submitted under NAVAIR Public\n Release Authorization 2023-020"},{"id":"http://arxiv.org/abs/2209.13204v2","updated":"2023-11-27T15:19:00Z","published":"2022-09-27T07:10:20Z","title":"NEURAL MARIONETTE: A Transformer-based Multi-action Human Motion\n Synthesis System","summary":" We present a neural network-based system for long-term, multi-action human\nmotion synthesis. The system, dubbed as NEURAL MARIONETTE, can produce\nhigh-quality and meaningful motions with smooth transitions from simple user\ninput, including a sequence of action tags with expected action duration, and\noptionally a hand-drawn moving trajectory if the user specifies. The core of\nour system is a novel Transformer-based motion generation model, namely\nMARIONET, which can generate diverse motions given action tags. Different from\nexisting motion generation models, MARIONET utilizes contextual information\nfrom the past motion clip and future action tag, dedicated to generating\nactions that can smoothly blend historical and future actions. Specifically,\nMARIONET first encodes target action tag and contextual information into an\naction-level latent code. The code is unfolded into frame-level control signals\nvia a time unrolling module, which could be then combined with other\nframe-level control signals like the target trajectory. Motion frames are then\ngenerated in an auto-regressive way. By sequentially applying MARIONET, the\nsystem NEURAL MARIONETTE can robustly generate long-term, multi-action motions\nwith the help of two simple schemes, namely \"Shadow Start\" and \"Action\nRevision\". Along with the novel system, we also present a new dataset dedicated\nto the multi-action motion synthesis task, which contains both action tags and\ntheir contextual information. Extensive experiments are conducted to study the\naction accuracy, naturalism, and transition smoothness of the motions generated\nby our system.\n","authors":["Weiqiang Wang","Xuefei Zhe","Qiuhong Ke","Di Kang","Tingguang Li","Ruizhi Chen","Linchao Bao"],"pdf_url":"https://arxiv.org/pdf/2209.13204v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15908v1","updated":"2023-11-27T15:14:38Z","published":"2023-11-27T15:14:38Z","title":"Enhancing Perceptual Quality in Video Super-Resolution through\n Temporally-Consistent Detail Synthesis using Diffusion Models","summary":" In this paper, we address the problem of video super-resolution (VSR) using\nDiffusion Models (DM), and present StableVSR. Our method significantly enhances\nthe perceptual quality of upscaled videos by synthesizing realistic and\ntemporally-consistent details. We turn a pre-trained DM for single image\nsuper-resolution into a VSR method by introducing the Temporal Conditioning\nModule (TCM). TCM uses Temporal Texture Guidance, which provides\nspatially-aligned and detail-rich texture information synthesized in adjacent\nframes. This guides the generative process of the current frame toward\nhigh-quality and temporally-consistent results. We introduce a Frame-wise\nBidirectional Sampling strategy to encourage the use of information from past\nto future and vice-versa. This strategy improves the perceptual quality of the\nresults and the temporal consistency across frames. We demonstrate the\neffectiveness of StableVSR in enhancing the perceptual quality of upscaled\nvideos compared to existing state-of-the-art methods for VSR. 
The code is\navailable at https://github.com/claudiom4sir/StableVSR.\n","authors":["Claudio Rota","Marco Buzzelli","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2311.15908v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15906v1","updated":"2023-11-27T15:13:02Z","published":"2023-11-27T15:13:02Z","title":"MetaDefa: Meta-learning based on Domain Enhancement and Feature\n Alignment for Single Domain Generalization","summary":" The single domain generalization(SDG) based on meta-learning has emerged as\nan effective technique for solving the domain-shift problem. However, the\ninadequate match of data distribution between source and augmented domains and\ndifficult separation of domain-invariant features from domain-related features\nmake SDG model hard to achieve great generalization. Therefore, a novel\nmeta-learning method based on domain enhancement and feature alignment\n(MetaDefa) is proposed to improve the model generalization performance. First,\nthe background substitution and visual corruptions techniques are used to\ngenerate diverse and effective augmented domains. Then, the multi-channel\nfeature alignment module based on class activation maps and class agnostic\nactivation maps is designed to effectively extract adequate transferability\nknowledge. In this module, domain-invariant features can be fully explored by\nfocusing on similar target regions between source and augmented domains feature\nspace and suppressing the feature representation of non-similar target regions.\nExtensive experiments on two publicly available datasets show that MetaDefa has\nsignificant generalization performance advantages in unknown multiple target\ndomains.\n","authors":["Can Sun","Hao Zheng","Zhigang Hu","Liu Yang","Meiguang Zheng","Bo Xu"],"pdf_url":"https://arxiv.org/pdf/2311.15906v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.05697v2","updated":"2023-11-27T15:08:03Z","published":"2023-11-09T19:10:28Z","title":"3DGAUnet: 3D generative adversarial networks with a 3D U-Net based\n generator to achieve the accurate and effective synthesis of clinical tumor\n image data for pancreatic cancer","summary":" Pancreatic ductal adenocarcinoma (PDAC) presents a critical global health\nchallenge, and early detection is crucial for improving the 5-year survival\nrate. Recent medical imaging and computational algorithm advances offer\npotential solutions for early diagnosis. Deep learning, particularly in the\nform of convolutional neural networks (CNNs), has demonstrated success in\nmedical image analysis tasks, including classification and segmentation.\nHowever, the limited availability of clinical data for training purposes\ncontinues to provide a significant obstacle. Data augmentation, generative\nadversarial networks (GANs), and cross-validation are potential techniques to\naddress this limitation and improve model performance, but effective solutions\nare still rare for 3D PDAC, where contrast is especially poor owing to the high\nheterogeneity in both tumor and background tissues. In this study, we developed\na new GAN-based model, named 3DGAUnet, for generating realistic 3D CT images of\nPDAC tumors and pancreatic tissue, which can generate the interslice connection\ndata that the existing 2D CT image synthesis models lack. Our innovation is to\ndevelop a 3D U-Net architecture for the generator to improve shape and texture\nlearning for PDAC tumors and pancreatic tissue. 
Our approach offers a promising\npath to tackle the urgent requirement for creative and synergistic methods to\ncombat PDAC. The development of this GAN-based model has the potential to\nalleviate data scarcity issues, elevate the quality of synthesized data, and\nthereby facilitate the progression of deep learning models to enhance the\naccuracy and early detection of PDAC tumors, which could profoundly impact\npatient outcomes. Furthermore, this model has the potential to be adapted to\nother types of solid tumors, hence making significant contributions to the\nfield of medical imaging in terms of image processing models.\n","authors":["Yu Shi","Hannah Tang","Michael Baine","Michael A. Hollingsworth","Huijing Du","Dandan Zheng","Chi Zhang","Hongfeng Yu"],"pdf_url":"https://arxiv.org/pdf/2311.05697v2.pdf","comment":"Published on Cancers: Shi, Yu, Hannah Tang, Michael J. Baine, Michael\n A. Hollingsworth, Huijing Du, Dandan Zheng, Chi Zhang, and Hongfeng Yu. 2023.\n \"3DGAUnet: 3D Generative Adversarial Networks with a 3D U-Net Based Generator\n to Achieve the Accurate and Effective Synthesis of Clinical Tumor Image Data\n for Pancreatic Cancer\" Cancers 15, no. 23: 5496"},{"id":"http://arxiv.org/abs/2311.15896v1","updated":"2023-11-27T15:01:26Z","published":"2023-11-27T15:01:26Z","title":"Data Generation for Post-OCR correction of Cyrillic handwriting","summary":" This paper introduces a novel approach to post-Optical Character Recognition\nCorrection (POC) for handwritten Cyrillic text, addressing a significant gap in\ncurrent research methodologies. This gap is due to the lack of large text\ncorporas that provide OCR errors for further training of language-based POC\nmodels, which are demanding in terms of corpora size. Our study primarily\nfocuses on the development and application of a synthetic handwriting\ngeneration engine based on B\\'ezier curves. Such an engine generates highly\nrealistic handwritten text in any amounts, which we utilize to create a\nsubstantial dataset by transforming Russian text corpora sourced from the\ninternet. We apply a Handwritten Text Recognition (HTR) model to this dataset\nto identify OCR errors, forming the basis for our POC model training. The\ncorrection model is trained on a 90-symbol input context, utilizing a\npre-trained T5 architecture with a seq2seq correction task. We evaluate our\napproach on HWR200 and School_notebooks_RU datasets as they provide significant\nchallenges in the HTR domain. Furthermore, POC can be used to highlight errors\nfor teachers, evaluating student performance. This can be done simply by\ncomparing sentences before and after correction, displaying differences in\ntext. Our primary contribution lies in the innovative use of B\\'ezier curves\nfor Cyrillic text generation and subsequent error correction using a\nspecialized POC model. We validate our approach by presenting Word Accuracy\nRate (WAR) and Character Accuracy Rate (CAR) results, both with and without\npost-OCR correction, using real open corporas of handwritten Cyrillic text.\nThese results, coupled with our methodology, are designed to be reproducible,\npaving the way for further advancements in the field of OCR and handwritten\ntext analysis. 
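The Bézier-based handwriting engine mentioned above rests on evaluating parametric curves and perturbing their control points to obtain varied strokes. The snippet below sketches only that core step, with made-up control points and jitter scale; it is not the paper's generation engine.

# Illustrative sketch: render one cubic Bezier stroke and a few jittered
# variants, in the spirit of curve-based synthetic handwriting.
import numpy as np

def cubic_bezier(p0, p1, p2, p3, n=100):
    # B(t) = (1-t)^3 p0 + 3(1-t)^2 t p1 + 3(1-t) t^2 p2 + t^3 p3
    t = np.linspace(0.0, 1.0, n)[:, None]
    return ((1 - t) ** 3 * p0 + 3 * (1 - t) ** 2 * t * p1
            + 3 * (1 - t) * t ** 2 * p2 + t ** 3 * p3)

def jittered_strokes(control_points, n_variants=5, scale=0.05, seed=0):
    # Perturb control points to imitate natural variation between writers.
    rng = np.random.default_rng(seed)
    cp = np.asarray(control_points, dtype=float)  # shape (4, 2)
    return [cubic_bezier(*(cp + rng.normal(0.0, scale, cp.shape)))
            for _ in range(n_variants)]

if __name__ == "__main__":
    base = [(0.0, 0.0), (0.3, 1.0), (0.7, -1.0), (1.0, 0.0)]  # an "s"-like stroke
    for stroke in jittered_strokes(base):
        print(stroke[0], stroke[-1])  # endpoints of each generated stroke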
Paper contributions can be found in\nhttps://github.com/dbrainio/CyrillicHandwritingPOC\n","authors":["Evgenii Davydkin","Aleksandr Markelov","Egor Iuldashev","Anton Dudkin","Ivan Krivorotov"],"pdf_url":"https://arxiv.org/pdf/2311.15896v1.pdf","comment":"17 pages, 27 figures, 6 tables, 26 references"},{"id":"http://arxiv.org/abs/2311.15890v1","updated":"2023-11-27T14:56:47Z","published":"2023-11-27T14:56:47Z","title":"Stability-Informed Initialization of Neural Ordinary Differential\n Equations","summary":" This paper addresses the training of Neural Ordinary Differential Equations\n(neural ODEs), and in particular explores the interplay between numerical\nintegration techniques, stability regions, step size, and initialization\ntechniques. It is shown how the choice of integration technique implicitly\nregularizes the learned model, and how the solver's corresponding stability\nregion affects training and prediction performance. From this analysis, a\nstability-informed parameter initialization technique is introduced. The\neffectiveness of the initialization method is displayed across several learning\nbenchmarks and industrial applications.\n","authors":["Theodor Westny","Arman Mohammadi","Daniel Jung","Erik Frisk"],"pdf_url":"https://arxiv.org/pdf/2311.15890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15879v1","updated":"2023-11-27T14:51:37Z","published":"2023-11-27T14:51:37Z","title":"EVCap: Retrieval-Augmented Image Captioning with External Visual-Name\n Memory for Open-World Comprehension","summary":" Large language models (LLMs)-based image captioning has the capability of\ndescribing objects not explicitly observed in training data; yet novel objects\noccur frequently, necessitating the requirement of sustaining up-to-date object\nknowledge for open-world comprehension. Instead of relying on large amounts of\ndata and scaling up network parameters, we introduce a highly effective\nretrieval-augmented image captioning method that prompts LLMs with object names\nretrieved from External Visual--name memory (EVCap). We build ever-changing\nobject knowledge memory using objects' visuals and names, enabling us to (i)\nupdate the memory at a minimal cost and (ii) effortlessly augment LLMs with\nretrieved object names utilizing a lightweight and fast-to-train model. Our\nmodel, which was trained only on the COCO dataset, can be adapted to out-domain\ndata without additional fine-tuning or retraining. Our comprehensive\nexperiments conducted on various benchmarks and synthetic commonsense-violating\ndata demonstrate that EVCap, comprising solely 3.97M trainable parameters,\nexhibits superior performance compared to other methods of equivalent model\nsize scale. Notably, it achieves competitive performance against specialist\nSOTAs with an enormous number of parameters. 
Our code is available at\nhttps://jiaxuan-li.github.io/EVCap.\n","authors":["Jiaxuan Li","Duc Minh Vo","Akihiro Sugimoto","Hideki Nakayama"],"pdf_url":"https://arxiv.org/pdf/2311.15879v1.pdf","comment":"Project page: https://jiaxuan-li.github.io/EVCap"},{"id":"http://arxiv.org/abs/2311.15876v1","updated":"2023-11-27T14:49:06Z","published":"2023-11-27T14:49:06Z","title":"RO-LLaMA: Generalist LLM for Radiation Oncology via Noise Augmentation\n and Consistency Regularization","summary":" Recent advancements in Artificial Intelligence (AI) have profoundly\ninfluenced medical fields, by providing tools to reduce clinical workloads.\nHowever, most AI models are constrained to execute uni-modal tasks, in stark\ncontrast to the comprehensive approaches utilized by medical professionals. To\naddress this, here we present RO-LLaMA, a versatile generalist large language\nmodel (LLM) tailored for the field of radiation oncology. This model seamlessly\ncovers a wide range of the workflow of radiation oncologists, adept at various\ntasks such as clinical report summarization, radiation therapy plan suggestion,\nand plan-guided therapy target volume segmentation. In particular, to maximize\nthe end-to-end performance, we further present a novel Consistency Embedding\nFine-Tuning (CEFTune) technique, which boosts LLM's robustness to additional\nerrors at the intermediates while preserving the capability of handling clean\ninputs, and creatively transform this concept into LLM-driven segmentation\nframework as Consistency Embedding Segmentation (CESEG). Experimental results\non multi-centre cohort sets demonstrate our proposed RO-LLaMA's promising\nperformance for diverse tasks with generalization capabilities.\n","authors":["Kwanyoung Kim","Yujin Oh","Sangjoon Park","Hwa Kyung Byun","Jin Sung Kim","Yong Bae Kim","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.15876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00349v2","updated":"2023-11-27T14:42:52Z","published":"2023-06-01T05:06:56Z","title":"CALICO: Self-Supervised Camera-LiDAR Contrastive Pre-training for BEV\n Perception","summary":" Perception is crucial in the realm of autonomous driving systems, where\nbird's eye view (BEV)-based architectures have recently reached\nstate-of-the-art performance. The desirability of self-supervised\nrepresentation learning stems from the expensive and laborious process of\nannotating 2D and 3D data. Although previous research has investigated\npretraining methods for both LiDAR and camera-based 3D object detection, a\nunified pretraining framework for multimodal BEV perception is missing. In this\nstudy, we introduce CALICO, a novel framework that applies contrastive\nobjectives to both LiDAR and camera backbones. Specifically, CALICO\nincorporates two stages: point-region contrast (PRC) and region-aware\ndistillation (RAD). PRC better balances the region- and scene-level\nrepresentation learning on the LiDAR modality and offers significant\nperformance improvement compared to existing methods. RAD effectively achieves\ncontrastive distillation on our self-trained teacher model. CALICO's efficacy\nis substantiated by extensive evaluations on 3D object detection and BEV map\nsegmentation tasks, where it delivers significant performance improvements.\nNotably, CALICO outperforms the baseline method by 10.5% and 8.6% on NDS and\nmAP. Moreover, CALICO boosts the robustness of multimodal 3D object detection\nagainst adversarial attacks and corruption. 
Additionally, our framework can be\ntailored to different backbones and heads, positioning it as a promising\napproach for multimodal BEV perception.\n","authors":["Jiachen Sun","Haizhong Zheng","Qingzhao Zhang","Atul Prakash","Z. Morley Mao","Chaowei Xiao"],"pdf_url":"https://arxiv.org/pdf/2306.00349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15864v1","updated":"2023-11-27T14:32:33Z","published":"2023-11-27T14:32:33Z","title":"InterControl: Generate Human Motion Interactions by Controlling Every\n Joint","summary":" Text-conditioned human motion generation model has achieved great progress by\nintroducing diffusion models and corresponding control signals. However, the\ninteraction between humans are still under explored. To model interactions of\narbitrary number of humans, we define interactions as human joint pairs that\nare either in contact or separated, and leverage {\\em Large Language Model\n(LLM) Planner} to translate interaction descriptions into contact plans. Based\non the contact plans, interaction generation could be achieved by spatially\ncontrollable motion generation methods by taking joint contacts as spatial\nconditions. We present a novel approach named InterControl for flexible spatial\ncontrol of every joint in every person at any time by leveraging motion\ndiffusion model only trained on single-person data. We incorporate a motion\ncontrolnet to generate coherent and realistic motions given sparse spatial\ncontrol signals and a loss guidance module to precisely align any joint to the\ndesired position in a classifier guidance manner via Inverse Kinematics (IK).\nExtensive experiments on HumanML3D and KIT-ML dataset demonstrate its\neffectiveness in versatile joint control. We also collect data of joint contact\npairs by LLMs to show InterControl's ability in human interaction generation.\n","authors":["Zhenzhi Wang","Jingbo Wang","Dahua Lin","Bo Dai"],"pdf_url":"https://arxiv.org/pdf/2311.15864v1.pdf","comment":"Generate human interactions with only single-person motion diffusion\n model via LLM generated joint contact pairs, code\n https://github.com/zhenzhiwang/intercontrol"},{"id":"http://arxiv.org/abs/2311.15856v1","updated":"2023-11-27T14:23:36Z","published":"2023-11-27T14:23:36Z","title":"JSSL: Joint Supervised and Self-supervised Learning for MRI\n Reconstruction","summary":" Magnetic Resonance Imaging represents an important diagnostic modality;\nhowever, its inherently slow acquisition process poses challenges in obtaining\nfully sampled k-space data under motion in clinical scenarios such as\nabdominal, cardiac, and prostate imaging. In the absence of fully sampled\nacquisitions, which can serve as ground truth data, training deep learning\nalgorithms in a supervised manner to predict the underlying ground truth image\nbecomes an impossible task. To address this limitation, self-supervised methods\nhave emerged as a viable alternative, leveraging available subsampled k-space\ndata to train deep learning networks for MRI reconstruction. Nevertheless,\nthese self-supervised approaches often fall short when compared to supervised\nmethodologies. In this paper, we introduce JSSL (Joint Supervised and\nSelf-supervised Learning), a novel training approach for deep learning-based\nMRI reconstruction algorithms aimed at enhancing reconstruction quality in\nscenarios where target dataset(s) containing fully sampled k-space measurements\nare unavailable. 
Our proposed method operates by simultaneously training a\nmodel in a self-supervised learning setting, using subsampled data from the\ntarget dataset(s), and in a supervised learning manner, utilizing data from\nother datasets, referred to as proxy datasets, where fully sampled k-space data\nis accessible. To demonstrate the efficacy of JSSL, we utilized subsampled\nprostate parallel MRI measurements as the target dataset, while employing fully\nsampled brain and knee k-space acquisitions as proxy datasets. Our results\nshowcase a substantial improvement over conventional self-supervised training\nmethods, thereby underscoring the effectiveness of our joint approach. We\nprovide a theoretical motivation for JSSL and establish a practical\n\"rule-of-thumb\" for selecting the most appropriate training approach for deep\nMRI reconstruction.\n","authors":["George Yiasemis","Nikita Moriakov","Clara I. Sánchez","Jan-Jakob Sonke","Jonas Teuwen"],"pdf_url":"https://arxiv.org/pdf/2311.15856v1.pdf","comment":"26 pages, 11 figures, 6 tables"},{"id":"http://arxiv.org/abs/2311.15855v1","updated":"2023-11-27T14:22:07Z","published":"2023-11-27T14:22:07Z","title":"SiTH: Single-view Textured Human Reconstruction with Image-Conditioned\n Diffusion","summary":" A long-standing goal of 3D human reconstruction is to create lifelike and\nfully detailed 3D humans from single images. The main challenge lies in\ninferring unknown human shapes, clothing, and texture information in areas not\nvisible in the images. To address this, we propose SiTH, a novel pipeline that\nuniquely integrates an image-conditioned diffusion model into a 3D mesh\nreconstruction workflow. At the core of our method lies the decomposition of\nthe ill-posed single-view reconstruction problem into hallucination and\nreconstruction subproblems. For the former, we employ a powerful generative\ndiffusion model to hallucinate back appearances from the input images. For the\nlatter, we leverage skinned body meshes as guidance to recover full-body\ntexture meshes from the input and back-view images. Our designs enable training\nof the pipeline with only about 500 3D human scans while maintaining its\ngenerality and robustness. Extensive experiments and user studies on two 3D\nreconstruction benchmarks demonstrated the efficacy of our method in generating\nrealistic, fully textured 3D humans from a diverse range of unseen images.\n","authors":["Hsuan-I Ho","Jie Song","Otmar Hilliges"],"pdf_url":"https://arxiv.org/pdf/2311.15855v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2310.12877v3","updated":"2023-11-27T14:22:05Z","published":"2023-10-19T16:32:18Z","title":"Perceptual Assessment and Optimization of High Dynamic Range Image\n Rendering","summary":" The increasing popularity of high dynamic range (HDR) imaging stems from its\nability to faithfully capture luminance levels in natural scenes. However, HDR\nimage quality assessment has been insufficiently addressed. Existing models are\nmostly designed for low dynamic range (LDR) images, which exhibit poorly\ncorrelated with human perception of HDR image quality. To fill this gap, we\npropose a family of HDR quality metrics by transferring the recent advancements\nin LDR domain. The key step in our approach is to employ a simple inverse\ndisplay model to decompose an HDR image into a stack of LDR images with varying\nexposures. Subsequently, these LDR images are evaluated using state-of-the-art\nLDR quality metrics. 
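The exposure-stack construction just described (invert a display model to obtain several LDR renderings of one HDR image, then score them with an LDR metric) can be mocked up as follows; the scale-clip-gamma display model, the PSNR stand-in, and the exposure weights are all assumptions for illustration, not the proposed metrics.

# Illustrative sketch: map an HDR luminance map to simulated LDR exposures,
# score each with a placeholder LDR metric (PSNR), and take a weighted average.
import numpy as np

def hdr_to_ldr_stack(hdr, exposures=(0.25, 1.0, 4.0), gamma=2.2):
    # Simple simulated display: scale luminance, clip to [0, 1], gamma-encode.
    return [np.clip(hdr * e, 0.0, 1.0) ** (1.0 / gamma) for e in exposures]

def psnr(a, b, eps=1e-12):
    mse = float(np.mean((a - b) ** 2))
    return 10.0 * np.log10(1.0 / (mse + eps))

def hdr_quality(reference_hdr, distorted_hdr, weights=(0.25, 0.5, 0.25)):
    ref_stack = hdr_to_ldr_stack(reference_hdr)
    dist_stack = hdr_to_ldr_stack(distorted_hdr)
    scores = [psnr(r, d) for r, d in zip(ref_stack, dist_stack)]
    return float(np.average(scores, weights=weights))

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    ref = rng.uniform(0.0, 10.0, size=(64, 64))        # toy HDR luminance map
    dist = np.clip(ref + rng.normal(0.0, 0.1, ref.shape), 0.0, None)
    print(hdr_quality(ref, dist))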
Our family of HDR quality models offer three notable\nadvantages. First, specific exposures (i.e., luminance ranges) can be weighted\nto emphasize their assessment when calculating the overall quality score.\nSecond, our HDR quality metrics directly inherit the capabilities of their base\nLDR quality models in assessing LDR images. Third, our metrics do not rely on\nhuman perceptual data of HDR image quality for re-calibration. Experiments\nconducted on four human-rated HDR image quality datasets indicate that our HDR\nquality metrics consistently outperform existing methods, including the HDR-VDP\nfamily. Furthermore, we demonstrate the promise of our models in the perceptual\noptimization of HDR novel view synthesis.\n","authors":["Peibei Cao","Rafal K. Mantiuk","Kede Ma"],"pdf_url":"https://arxiv.org/pdf/2310.12877v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15851v1","updated":"2023-11-27T14:17:41Z","published":"2023-11-27T14:17:41Z","title":"Single-Model and Any-Modality for Video Object Tracking","summary":" In the realm of video object tracking, auxiliary modalities such as depth,\nthermal, or event data have emerged as valuable assets to complement the RGB\ntrackers. In practice, most existing RGB trackers learn a single set of\nparameters to use them across datasets and applications. However, a similar\nsingle-model unification for multi-modality tracking presents several\nchallenges. These challenges stem from the inherent heterogeneity of inputs --\neach with modality-specific representations, the scarcity of multi-modal\ndatasets, and the absence of all the modalities at all times. In this work, we\nintroduce Un-Track, a \\underline{Un}ified Tracker of a single set of parameters\nfor any modality. To handle any modality, our method learns their common latent\nspace through low-rank factorization and reconstruction techniques. More\nimportantly, we use only the RGB-X pairs to learn the common latent space. This\nunique shared representation seamlessly binds all modalities together, enabling\neffective unification and accommodating any missing modality, all within a\nsingle transformer-based architecture and without the need for\nmodality-specific fine-tuning. Our Un-Track achieves +8.1 absolute F-score\ngain, on the DepthTrack dataset, by introducing only +2.14 (over 21.50) GFLOPs\nwith +6.6M (over 93M) parameters, through a simple yet efficient prompting\nstrategy. Extensive comparisons on five benchmark datasets with different\nmodalities show that Un-Track surpasses both SOTA unified trackers and\nmodality-specific finetuned counterparts, validating our effectiveness and\npracticality.\n","authors":["Zongwei Wu","Jilai Zheng","Xiangxuan Ren","Florin-Alexandru Vasluianu","Chao Ma","Danda Pani Paudel","Luc Van Gool","Radu Timofte"],"pdf_url":"https://arxiv.org/pdf/2311.15851v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15847v1","updated":"2023-11-27T14:12:51Z","published":"2023-11-27T14:12:51Z","title":"Cell Maps Representation For Lung Adenocarcinoma Growth Patterns\n Classification In Whole Slide Images","summary":" Lung adenocarcinoma is a morphologically heterogeneous disease, characterized\nby five primary histologic growth patterns. 
The quantity of these patterns can\nbe related to tumor behavior and has a significant impact on patient prognosis.\nIn this work, we propose a novel machine learning pipeline capable of\nclassifying tissue tiles into one of the five patterns or as non-tumor, with an\nArea Under the Receiver Operating Characteristic Curve (AUCROC) score of 0.97.\nOur model's strength lies in its comprehensive consideration of cellular\nspatial patterns, where it first generates cell maps from Hematoxylin and Eosin\n(H&E) whole slide images (WSIs), which are then fed into a convolutional neural\nnetwork classification model. Exploiting these cell maps provides the model\nwith robust generalizability to new data, achieving approximately 30% higher\naccuracy on unseen test-sets compared to current state of the art approaches.\nThe insights derived from our model can be used to predict prognosis, enhancing\npatient outcomes.\n","authors":["Arwa Al-Rubaian","Gozde N. Gunesli","Wajd A. Althakfi","Ayesha Azam","Nasir Rajpoot","Shan E Ahmed Raza"],"pdf_url":"https://arxiv.org/pdf/2311.15847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15846v1","updated":"2023-11-27T14:11:54Z","published":"2023-11-27T14:11:54Z","title":"Learning with Noisy Low-Cost MOS for Image Quality Assessment via\n Dual-Bias Calibration","summary":" Learning based image quality assessment (IQA) models have obtained impressive\nperformance with the help of reliable subjective quality labels, where mean\nopinion score (MOS) is the most popular choice. However, in view of the\nsubjective bias of individual annotators, the labor-abundant MOS (LA-MOS)\ntypically requires a large collection of opinion scores from multiple\nannotators for each image, which significantly increases the learning cost. In\nthis paper, we aim to learn robust IQA models from low-cost MOS (LC-MOS), which\nonly requires very few opinion scores or even a single opinion score for each\nimage. More specifically, we consider the LC-MOS as the noisy observation of\nLA-MOS and enforce the IQA model learned from LC-MOS to approach the unbiased\nestimation of LA-MOS. In this way, we represent the subjective bias between\nLC-MOS and LA-MOS, and the model bias between IQA predictions learned from\nLC-MOS and LA-MOS (i.e., dual-bias) as two latent variables with unknown\nparameters. By means of the expectation-maximization based alternating\noptimization, we can jointly estimate the parameters of the dual-bias, which\nsuppresses the misleading of LC-MOS via a gated dual-bias calibration (GDBC)\nmodule. To the best of our knowledge, this is the first exploration of robust\nIQA model learning from noisy low-cost labels. Theoretical analysis and\nextensive experiments on four popular IQA datasets show that the proposed\nmethod is robust toward different bias rates and annotation numbers and\nsignificantly outperforms the other learning based IQA models when only LC-MOS\nis available. Furthermore, we also achieve comparable performance with respect\nto the other models learned with LA-MOS.\n","authors":["Lei Wang","Qingbo Wu","Desen Yuan","King Ngi Ngan","Hongliang Li","Fanman Meng","Linfeng Xu"],"pdf_url":"https://arxiv.org/pdf/2311.15846v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15841v1","updated":"2023-11-27T14:07:13Z","published":"2023-11-27T14:07:13Z","title":"Learning Disentangled Identifiers for Action-Customized Text-to-Image\n Generation","summary":" This study focuses on a novel task in text-to-image (T2I) generation, namely\naction customization. 
The objective of this task is to learn the co-existing\naction from limited data and generalize it to unseen humans or even animals.\nExperimental results show that existing subject-driven customization methods\nfail to learn the representative characteristics of actions and struggle in\ndecoupling actions from context features, including appearance. To overcome the\npreference for low-level features and the entanglement of high-level features,\nwe propose an inversion-based method Action-Disentangled Identifier (ADI) to\nlearn action-specific identifiers from the exemplar images. ADI first expands\nthe semantic conditioning space by introducing layer-wise identifier tokens,\nthereby increasing the representational richness while distributing the\ninversion across different features. Then, to block the inversion of\naction-agnostic features, ADI extracts the gradient invariance from the\nconstructed sample triples and masks the updates of irrelevant channels. To\ncomprehensively evaluate the task, we present an ActionBench that includes a\nvariety of actions, each accompanied by meticulously selected samples. Both\nquantitative and qualitative results show that our ADI outperforms existing\nbaselines in action-customized T2I generation.\n","authors":["Siteng Huang","Biao Gong","Yutong Feng","Xi Chen","Yuqian Fu","Yu Liu","Donglin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15841v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15836v1","updated":"2023-11-27T13:59:53Z","published":"2023-11-27T13:59:53Z","title":"Syn3DWound: A Synthetic Dataset for 3D Wound Bed Analysis","summary":" Wound management poses a significant challenge, particularly for bedridden\npatients and the elderly. Accurate diagnostic and healing monitoring can\nsignificantly benefit from modern image analysis, providing accurate and\nprecise measurements of wounds. Despite several existing techniques, the\nshortage of expansive and diverse training datasets remains a significant\nobstacle to constructing machine learning-based frameworks. This paper\nintroduces Syn3DWound, an open-source dataset of high-fidelity simulated wounds\nwith 2D and 3D annotations. We propose baseline methods and a benchmarking\nframework for automated 3D morphometry analysis and 2D/3D wound segmentation.\n","authors":["Léo Lebrat","Rodrigo Santa Cruz","Remi Chierchia","Yulia Arzhaeva","Mohammad Ali Armin","Joshua Goldsmith","Jeremy Oorloff","Prithvi Reddy","Chuong Nguyen","Lars Petersson","Michelle Barakat-Johnson","Georgina Luscombe","Clinton Fookes","Olivier Salvado","David Ahmedt-Aristizabal"],"pdf_url":"https://arxiv.org/pdf/2311.15836v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15830v1","updated":"2023-11-27T13:53:53Z","published":"2023-11-27T13:53:53Z","title":"A-JEPA: Joint-Embedding Predictive Architecture Can Listen","summary":" This paper presents that the masked-modeling principle driving the success of\nlarge foundational vision models can be effectively applied to audio by making\npredictions in a latent space. We introduce Audio-based Joint-Embedding\nPredictive Architecture (A-JEPA), a simple extension method for self-supervised\nlearning from the audio spectrum. Following the design of I-JPEA, our A-JEPA\nencodes visible audio spectrogram patches with a curriculum masking strategy\nvia context encoder, and predicts the representations of regions sampled at\nwell-designed locations. 
The target representations of those regions are\nextracted by the exponential moving average of context encoder, \\emph{i.e.},\ntarget encoder, on the whole spectrogram. We find it beneficial to transfer\nrandom block masking into time-frequency aware masking in a curriculum manner,\nconsidering the complexity of highly correlated in local time and frequency in\naudio spectrograms. To enhance contextual semantic understanding and\nrobustness, we fine-tune the encoder with a regularized masking on target\ndatasets, instead of input dropping or zero. Empirically, when built with\nVision Transformers structure, we find A-JEPA to be highly scalable and sets\nnew state-of-the-art performance on multiple audio and speech classification\ntasks, outperforming other recent models that use externally supervised\npre-training.\n","authors":["Zhengcong Fei","Mingyuan Fan","Junshi Huang"],"pdf_url":"https://arxiv.org/pdf/2311.15830v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14084v2","updated":"2023-11-27T13:43:19Z","published":"2023-11-23T16:22:58Z","title":"AI-Generated Images Introduce Invisible Relevance Bias to Text-Image\n Retrieval","summary":" With the advancement of generation models, AI-generated content (AIGC) is\nbecoming more realistic, flooding the Internet. A recent study suggests that\nthis phenomenon has elevated the issue of source bias in text retrieval for web\nsearches. Specifically, neural retrieval models tend to rank generated texts\nhigher than human-written texts. In this paper, we extend the study of this\nbias to cross-modal retrieval. Firstly, we successfully construct a suitable\nbenchmark to explore the existence of the bias. Subsequent extensive\nexperiments on this benchmark reveal that AI-generated images introduce an\ninvisible relevance bias to text-image retrieval models. Specifically, our\nexperiments show that text-image retrieval models tend to rank the AI-generated\nimages higher than the real images, even though the AI-generated images do not\nexhibit more visually relevant features to the query than real images. This\ninvisible relevance bias is prevalent across retrieval models with varying\ntraining data and architectures. Furthermore, our subsequent exploration\nreveals that the inclusion of AI-generated images in the training data of the\nretrieval models exacerbates the invisible relevance bias. The above phenomenon\ntriggers a vicious cycle, which makes the invisible relevance bias become more\nand more serious. To elucidate the potential causes of invisible relevance and\naddress the aforementioned issues, we introduce an effective training method\naimed at alleviating the invisible relevance bias. Subsequently, we apply our\nproposed debiasing method to retroactively identify the causes of invisible\nrelevance, revealing that the AI-generated images induce the image encoder to\nembed additional information into their representation. 
This information\nexhibits a certain consistency across generated images with different semantics\nand can make the retriever estimate a higher relevance score.\n","authors":["Shicheng Xu","Danyang Hou","Liang Pang","Jingcheng Deng","Jun Xu","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2311.14084v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2311.15813v1","updated":"2023-11-27T13:39:44Z","published":"2023-11-27T13:39:44Z","title":"FlowZero: Zero-Shot Text-to-Video Synthesis with LLM-Driven Dynamic\n Scene Syntax","summary":" Text-to-video (T2V) generation is a rapidly growing research area that aims\nto translate the scenes, objects, and actions within complex video text into a\nsequence of coherent visual frames. We present FlowZero, a novel framework that\ncombines Large Language Models (LLMs) with image diffusion models to generate\ntemporally-coherent videos. FlowZero uses LLMs to understand complex\nspatio-temporal dynamics from text, where LLMs can generate a comprehensive\ndynamic scene syntax (DSS) containing scene descriptions, object layouts, and\nbackground motion patterns. These elements in DSS are then used to guide the\nimage diffusion model for video generation with smooth object motions and\nframe-to-frame coherence. Moreover, FlowZero incorporates an iterative\nself-refinement process, enhancing the alignment between the spatio-temporal\nlayouts and the textual prompts for the videos. To enhance global coherence, we\npropose enriching the initial noise of each frame with motion dynamics to\ncontrol the background movement and camera motion adaptively. By using\nspatio-temporal syntaxes to guide the diffusion process, FlowZero achieves\nimprovement in zero-shot video synthesis, generating coherent videos with vivid\nmotion.\n","authors":["Yu Lu","Linchao Zhu","Hehe Fan","Yi Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15813v1.pdf","comment":"Project page: https://flowzero-video.github.io"},{"id":"http://arxiv.org/abs/2310.12190v2","updated":"2023-11-27T13:36:04Z","published":"2023-10-18T14:42:16Z","title":"DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors","summary":" Animating a still image offers an engaging visual experience. Traditional\nimage animation techniques mainly focus on animating natural scenes with\nstochastic dynamics (e.g. clouds and fluid) or domain-specific motions (e.g.\nhuman hair or body motions), and thus limits their applicability to more\ngeneral visual content. To overcome this limitation, we explore the synthesis\nof dynamic content for open-domain images, converting them into animated\nvideos. The key idea is to utilize the motion prior of text-to-video diffusion\nmodels by incorporating the image into the generative process as guidance.\nGiven an image, we first project it into a text-aligned rich context\nrepresentation space using a query transformer, which facilitates the video\nmodel to digest the image content in a compatible fashion. However, some visual\ndetails still struggle to be preserved in the resultant videos. To supplement\nwith more precise image information, we further feed the full image to the\ndiffusion model by concatenating it with the initial noises. 
Experimental\nresults show that our proposed method can produce visually convincing and more\nlogical & natural motions, as well as higher conformity to the input image.\nComparative evaluation demonstrates the notable superiority of our approach\nover existing competitors.\n","authors":["Jinbo Xing","Menghan Xia","Yong Zhang","Haoxin Chen","Wangbo Yu","Hanyuan Liu","Xintao Wang","Tien-Tsin Wong","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2310.12190v2.pdf","comment":"Project page: https://doubiiu.github.io/projects/DynamiCrafter"},{"id":"http://arxiv.org/abs/2311.15812v1","updated":"2023-11-27T13:35:20Z","published":"2023-11-27T13:35:20Z","title":"C-SAW: Self-Supervised Prompt Learning for Image Generalization in\n Remote Sensing","summary":" We focus on domain and class generalization problems in analyzing optical\nremote sensing images, using the large-scale pre-trained vision-language model\n(VLM), CLIP. While contrastively trained VLMs show impressive zero-shot\ngeneralization performance, their effectiveness is limited when dealing with\ndiverse domains during training and testing. Existing prompt learning\ntechniques overlook the importance of incorporating domain and content\ninformation into the prompts, which results in a drop in performance while\ndealing with such multi-domain data. To address these challenges, we propose a\nsolution that ensures domain-invariant prompt learning while enhancing the\nexpressiveness of visual features. We observe that CLIP's vision encoder\nstruggles to identify contextual image information, particularly when image\npatches are jumbled up. This issue is especially severe in optical remote\nsensing images, where land-cover classes exhibit well-defined contextual\nappearances. To this end, we introduce C-SAW, a method that complements CLIP\nwith a self-supervised loss in the visual space and a novel prompt learning\ntechnique that emphasizes both visual domain and content-specific features. We\nkeep the CLIP backbone frozen and introduce a small set of projectors for both\nthe CLIP encoders to train C-SAW contrastively. Experimental results\ndemonstrate the superiority of C-SAW across multiple remote sensing benchmarks\nand different generalization tasks.\n","authors":["Avigyan Bhattacharya","Mainak Singha","Ankit Jha","Biplab Banerjee"],"pdf_url":"https://arxiv.org/pdf/2311.15812v1.pdf","comment":"Accepted in ACM ICVGIP 2023"},{"id":"http://arxiv.org/abs/2311.15806v1","updated":"2023-11-27T13:29:34Z","published":"2023-11-27T13:29:34Z","title":"PIPE : Parallelized Inference Through Post-Training Quantization\n Ensembling of Residual Expansions","summary":" Deep neural networks (DNNs) are ubiquitous in computer vision and natural\nlanguage processing, but suffer from high inference cost. This problem can be\naddressed by quantization, which consists in converting floating point\noperations into a lower bit-width format. With the growing concerns on privacy\nrights, we focus our efforts on data-free methods. However, such techniques\nsuffer from their lack of adaptability to the target devices, as hardware\ntypically only supports specific bit widths. Thus, to adapt to a variety of\ndevices, a quantization method shall be flexible enough to find good accuracy\nv.s. speed trade-offs for every bit width and target device. To achieve this,\nwe propose PIPE, a quantization method that leverages residual error expansion,\nalong with group sparsity and an ensemble approximation for better\nparallelization. 
PIPE is backed by strong theoretical guarantees and\nachieves superior performance on every benchmarked application (from vision to\nNLP tasks), architecture (ConvNets, transformers) and bit-width (from int8 to\nternary quantization).\n","authors":["Edouard Yvinec","Arnaud Dapogny","Kevin Bailly"],"pdf_url":"https://arxiv.org/pdf/2311.15806v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2203.14645"},{"id":"http://arxiv.org/abs/2311.15803v1","updated":"2023-11-27T13:25:47Z","published":"2023-11-27T13:25:47Z","title":"SOAC: Spatio-Temporal Overlap-Aware Multi-Sensor Calibration using\n Neural Radiance Fields","summary":" In rapidly-evolving domains such as autonomous driving, the use of multiple\nsensors with different modalities is crucial to ensure high operational\nprecision and stability. To correctly exploit the provided information by each\nsensor in a single common frame, it is essential for these sensors to be\naccurately calibrated. In this paper, we leverage the ability of Neural\nRadiance Fields (NeRF) to represent different sensor modalities in a common\nvolumetric representation to achieve robust and accurate spatio-temporal sensor\ncalibration. By designing a partitioning approach based on the visible part of\nthe scene for each sensor, we formulate the calibration problem using only the\noverlapping areas. This strategy results in a more robust and accurate\ncalibration that is less prone to failure. We demonstrate that our approach\nworks on outdoor urban scenes by validating it on multiple established driving\ndatasets. Results show that our method is able to get better accuracy and\nrobustness compared to existing methods.\n","authors":["Quentin Herau","Nathan Piasco","Moussab Bennehar","Luis Roldão","Dzmitry Tsishkou","Cyrille Migniot","Pascal Vasseur","Cédric Demonceaux"],"pdf_url":"https://arxiv.org/pdf/2311.15803v1.pdf","comment":"Paper + Supplementary, under review"},{"id":"http://arxiv.org/abs/2209.07042v5","updated":"2023-11-27T13:18:28Z","published":"2022-09-15T04:51:17Z","title":"Efficient Perception, Planning, and Control Algorithms for Vision-Based\n Automated Vehicles","summary":" Autonomous vehicles have limited computational resources; hence, their\ncontrol systems must be efficient. The cost and size of sensors have limited\nthe development of self-driving cars. To overcome these restrictions, this\nstudy proposes an efficient framework for the operation of vision-based\nautomatic vehicles; the framework requires only a monocular camera and a few\ninexpensive radars. The proposed algorithm comprises a multi-task UNet (MTUNet)\nnetwork for extracting image features and constrained iterative linear\nquadratic regulator (CILQR) and vision predictive control (VPC) modules for\nrapid motion planning and control. MTUNet is designed to simultaneously solve\nlane line segmentation, the ego vehicle's heading angle regression, road type\nclassification, and traffic object detection tasks at approximately 40 FPS\n(frames per second) for 228 x 228 pixel RGB input images. The CILQR controllers\nthen use the MTUNet outputs and radar data as inputs to produce driving\ncommands for lateral and longitudinal vehicle guidance within only 1 ms. In\nparticular, the VPC algorithm is included to reduce steering command latency to\nbelow actuator latency to prevent self-driving vehicle performance degradation\nduring tight turns. 
The VPC algorithm uses road curvature data from MTUNet to\nestimate the correction of the current steering angle at a look-ahead point to\nadjust the turning amount. Including the VPC algorithm in a VPC-CILQR\ncontroller on curvy roads leads to higher performance than CILQR alone. Our\nexperiments demonstrate that the proposed autonomous driving system, which does\nnot require high-definition maps, could be applied in current autonomous\nvehicles.\n","authors":["Der-Hau Lee"],"pdf_url":"https://arxiv.org/pdf/2209.07042v5.pdf","comment":"10 figures, 13 pages"},{"id":"http://arxiv.org/abs/2310.03335v2","updated":"2023-11-27T13:18:11Z","published":"2023-10-05T06:35:21Z","title":"Continual Test-time Domain Adaptation via Dynamic Sample Selection","summary":" The objective of Continual Test-time Domain Adaptation (CTDA) is to gradually\nadapt a pre-trained model to a sequence of target domains without accessing the\nsource data. This paper proposes a Dynamic Sample Selection (DSS) method for\nCTDA. DSS consists of dynamic thresholding, positive learning, and negative\nlearning processes. Traditionally, models learn from unlabeled unknown\nenvironment data and equally rely on all samples' pseudo-labels to update their\nparameters through self-training. However, noisy predictions exist in these\npseudo-labels, so all samples are not equally trustworthy. Therefore, in our\nmethod, a dynamic thresholding module is first designed to select suspected\nlow-quality from high-quality samples. The selected low-quality samples are\nmore likely to be wrongly predicted. Therefore, we apply joint positive and\nnegative learning on both high- and low-quality samples to reduce the risk of\nusing wrong information. We conduct extensive experiments that demonstrate the\neffectiveness of our proposed method for CTDA in the image domain,\noutperforming the state-of-the-art results. Furthermore, our approach is also\nevaluated in the 3D point cloud domain, showcasing its versatility and\npotential for broader applicability.\n","authors":["Yanshuo Wang","Jie Hong","Ali Cheraghian","Shafin Rahman","David Ahmedt-Aristizabal","Lars Petersson","Mehrtash Harandi"],"pdf_url":"https://arxiv.org/pdf/2310.03335v2.pdf","comment":"2024 IEEE/CVF Winter Conference on Applications of Computer Vision"},{"id":"http://arxiv.org/abs/2304.02970v4","updated":"2023-11-27T13:11:20Z","published":"2023-04-06T09:54:06Z","title":"A Closer Look at Audio-Visual Segmentation","summary":" Audio-visual segmentation (AVS) is a complex task that involves accurately\nsegmenting the corresponding sounding object based on audio-visual queries.\nSuccessful audio-visual learning requires two essential components: 1) an\nunbiased dataset with high-quality pixel-level multi-class labels, and 2) a\nmodel capable of effectively linking audio information with its corresponding\nvisual object. However, these two requirements are only partially addressed by\ncurrent methods, with training sets containing biased audio-visual data, and\nmodels that generalise poorly beyond this biased training set. In this work, we\npropose a new strategy to build cost-effective and relatively unbiased\naudio-visual semantic segmentation benchmarks. Our strategy, called Visual\nPost-production (VPO), explores the observation that it is not necessary to\nhave explicit audio-visual pairs extracted from single video sources to build\nsuch benchmarks. 
We also refine the previously proposed AVSBench to transform\nit into the audio-visual semantic segmentation benchmark AVSBench-Single+.\nFurthermore, this paper introduces a new pixel-wise audio-visual contrastive\nlearning method to enable a better generalisation of the model beyond the\ntraining set. We verify the validity of the VPO strategy by showing that\nstate-of-the-art (SOTA) models trained with datasets built by matching audio\nand visual data from different sources or with datasets containing audio and\nvisual data from the same video source produce almost the same accuracy. Then,\nusing the proposed VPO benchmarks and AVSBench-Single+, we show that our method\nproduces more accurate audio-visual semantic segmentation than SOTA models.\nCode and dataset will be available.\n","authors":["Yuanhong Chen","Yuyuan Liu","Hu Wang","Fengbei Liu","Chong Wang","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2304.02970v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10594v2","updated":"2023-11-27T13:02:06Z","published":"2023-03-19T07:53:31Z","title":"AdaptGuard: Defending Against Universal Attacks for Model Adaptation","summary":" Model adaptation aims at solving the domain transfer problem under the\nconstraint of only accessing the pretrained source models. With the increasing\nconsiderations of data privacy and transmission efficiency, this paradigm has\nbeen gaining recent popularity. This paper studies the vulnerability to\nuniversal attacks transferred from the source domain during model adaptation\nalgorithms due to the existence of malicious providers. We explore both\nuniversal adversarial perturbations and backdoor attacks as loopholes on the\nsource side and discover that they still survive in the target models after\nadaptation. To address this issue, we propose a model preprocessing framework,\nnamed AdaptGuard, to improve the security of model adaptation algorithms.\nAdaptGuard avoids direct use of the risky source parameters through knowledge\ndistillation and utilizes the pseudo adversarial samples under adjusted radius\nto enhance the robustness. AdaptGuard is a plug-and-play module that requires\nneither robust pretrained models nor any changes for the following model\nadaptation algorithms. Extensive results on three commonly used datasets and\ntwo popular adaptation methods validate that AdaptGuard can effectively defend\nagainst universal attacks and maintain clean accuracy in the target domain\nsimultaneously. We hope this research will shed light on the safety and\nrobustness of transfer learning. Code is available at\nhttps://github.com/TomSheng21/AdaptGuard.\n","authors":["Lijun Sheng","Jian Liang","Ran He","Zilei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2303.10594v2.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2311.15782v1","updated":"2023-11-27T12:55:39Z","published":"2023-11-27T12:55:39Z","title":"Relationship between Model Compression and Adversarial Robustness: A\n Review of Current Evidence","summary":" Increasing the model capacity is a known approach to enhance the adversarial\nrobustness of deep learning networks. On the other hand, various model\ncompression techniques, including pruning and quantization, can reduce the size\nof the network while preserving its accuracy. Several recent studies have\naddressed the relationship between model compression and adversarial\nrobustness, while some experiments have reported contradictory results. 
This\nwork summarizes available evidence and discusses possible explanations for the\nobserved effects.\n","authors":["Svetlana Pavlitska","Hannes Grolig","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2311.15782v1.pdf","comment":"Accepted for publication at SSCI 2023"},{"id":"http://arxiv.org/abs/2311.15776v1","updated":"2023-11-27T12:51:42Z","published":"2023-11-27T12:51:42Z","title":"Stable Segment Anything Model","summary":" The Segment Anything Model (SAM) achieves remarkable promptable segmentation\ngiven high-quality prompts which, however, often require good skills to\nspecify. To make SAM robust to casual prompts, this paper presents the first\ncomprehensive analysis on SAM's segmentation stability across a diverse\nspectrum of prompt qualities, notably imprecise bounding boxes and insufficient\npoints. Our key finding reveals that given such low-quality prompts, SAM's mask\ndecoder tends to activate image features that are biased towards the background\nor confined to specific object parts. To mitigate this issue, our key idea\nconsists of adjusting the sampling locations of image feature using learnable\ndeformable offsets, while the original SAM model architecture and weights\nremain unchanged. Consequently, our deformable sampling plugin (DSP) enables\nSAM to adaptively shift attention to the prompted target regions in a\ndata-driven manner, facilitated by our effective robust training strategy\n(RTS). During inference, dynamic routing plugin (DRP) is proposed that toggles\nSAM between the deformable and regular grid sampling modes, conditioned on the\ninput prompt quality. Thus, our solution, termed Stable-SAM, is one of its kind\nfocusing on solely adjusting feature sampling locations, which offers several\nadvantages: 1) improved SAM's segmentation stability across a wide range of\nprompt qualities, while 2) retaining SAM's powerful promptable segmentation\nefficiency and generality, with 3) minimal learnable parameters (0.08 M) and\nfast adaptation (by 1 training epoch). Extensive experiments across multiple\ndatasets validate the effectiveness and advantages of our approach,\nunderscoring Stable-SAM as a more robust solution for segmenting anything.\nCodes will be released upon acceptance.\n","authors":["Qi Fan","Xin Tao","Lei Ke","Mingqiao Ye","Yuan Zhang","Pengfei Wan","Zhongyuan Wang","Yu-Wing Tai","Chi-Keung Tang"],"pdf_url":"https://arxiv.org/pdf/2311.15776v1.pdf","comment":"Codes will be released upon acceptance"},{"id":"http://arxiv.org/abs/2305.19599v3","updated":"2023-11-27T12:50:09Z","published":"2023-05-31T06:59:21Z","title":"RealignDiff: Boosting Text-to-Image Diffusion Model with Coarse-to-fine\n Semantic Re-alignment","summary":" Recent advances in text-to-image diffusion models have achieved remarkable\nsuccess in generating high-quality, realistic images from textual descriptions.\nHowever, these approaches have faced challenges in precisely aligning the\ngenerated visual content with the textual concepts described in the prompts. In\nthis paper, we propose a two-stage coarse-to-fine semantic re-alignment method,\nnamed RealignDiff, aimed at improving the alignment between text and images in\ntext-to-image diffusion models. In the coarse semantic re-alignment phase, a\nnovel caption reward, leveraging the BLIP-2 model, is proposed to evaluate the\nsemantic discrepancy between the generated image caption and the given text\nprompt. 
Subsequently, the fine semantic re-alignment stage employs a local\ndense caption generation module and a re-weighting attention modulation module\nto refine the previously generated images from a local semantic view.\nExperimental results on the MS-COCO benchmark demonstrate that the proposed\ntwo-stage coarse-to-fine semantic re-alignment method outperforms other\nbaseline re-alignment techniques by a substantial margin in both visual quality\nand semantic similarity with the input prompt.\n","authors":["Guian Fang","Zutao Jiang","Jianhua Han","Guansong Lu","Hang Xu","Shengcai Liao","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2305.19599v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15773v1","updated":"2023-11-27T12:48:33Z","published":"2023-11-27T12:48:33Z","title":"Check, Locate, Rectify: A Training-Free Layout Calibration System for\n Text-to-Image Generation","summary":" Diffusion models have recently achieved remarkable progress in generating\nrealistic images. However, challenges remain in accurately understanding and\nsynthesizing the layout requirements in the textual prompts. To align the\ngenerated image with layout instructions, we present a training-free layout\ncalibration system SimM that intervenes in the generative process on the fly\nduring inference time. Specifically, following a \"check-locate-rectify\"\npipeline, the system first analyses the prompt to generate the target layout\nand compares it with the intermediate outputs to automatically detect errors.\nThen, by moving the located activations and making intra- and inter-map\nadjustments, the rectification process can be performed with negligible\ncomputational overhead. To evaluate SimM over a range of layout requirements,\nwe present a benchmark SimMBench that compensates for the lack of superlative\nspatial relations in existing datasets. And both quantitative and qualitative\nresults demonstrate the effectiveness of the proposed SimM in calibrating the\nlayout inconsistencies.\n","authors":["Biao Gong","Siteng Huang","Yutong Feng","Shiwei Zhang","Yuyuan Li","Yu Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15773v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14456v4","updated":"2023-11-27T12:45:22Z","published":"2022-11-26T02:15:35Z","title":"TetraSphere: A Neural Descriptor for O(3)-Invariant Point Cloud Analysis","summary":" In many practical applications, 3D point cloud analysis requires rotation\ninvariance. In this paper, we present a learnable descriptor invariant under 3D\nrotations and reflections, i.e., the O(3) actions, utilizing the recently\nintroduced steerable 3D spherical neurons and vector neurons. Specifically, we\npropose an embedding of the 3D spherical neurons into 4D vector neurons, which\nleverages end-to-end training of the model. In our approach, we perform\nTetraTransform--an equivariant embedding of the 3D input into 4D, constructed\nfrom the steerable neurons--and extract deeper O(3)-equivariant features using\nvector neurons. This integration of the TetraTransform into the VN-DGCNN\nframework, termed TetraSphere, negligibly increases the number of parameters by\nless than 0.0002%. TetraSphere sets a new state-of-the-art performance\nclassifying randomly rotated real-world object scans of the challenging subsets\nof ScanObjectNN. Additionally, TetraSphere outperforms all equivariant methods\non randomly rotated synthetic data: classifying objects from ModelNet40 and\nsegmenting parts of the ShapeNet shapes. 
Thus, our results reveal the practical\nvalue of steerable 3D spherical neurons for learning in 3D Euclidean space.\n","authors":["Pavlo Melnyk","Andreas Robinson","Michael Felsberg","Mårten Wadenbäck"],"pdf_url":"https://arxiv.org/pdf/2211.14456v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15769v1","updated":"2023-11-27T12:39:42Z","published":"2023-11-27T12:39:42Z","title":"Side4Video: Spatial-Temporal Side Network for Memory-Efficient\n Image-to-Video Transfer Learning","summary":" Large pre-trained vision models achieve impressive success in computer\nvision. However, fully fine-tuning large models for downstream tasks,\nparticularly in video understanding, can be prohibitively computationally\nexpensive. Recent studies turn their focus towards efficient image-to-video\ntransfer learning. Nevertheless, existing efficient fine-tuning methods lack\nattention to training memory usage and exploration of transferring a larger\nmodel to the video domain. In this paper, we present a novel Spatial-Temporal\nSide Network for memory-efficient fine-tuning large image models to video\nunderstanding, named Side4Video. Specifically, we introduce a lightweight\nspatial-temporal side network attached to the frozen vision model, which avoids\nthe backpropagation through the heavy pre-trained model and utilizes\nmulti-level spatial features from the original image model. Extremely\nmemory-efficient architecture enables our method to reduce 75% memory usage\nthan previous adapter-based methods. In this way, we can transfer a huge ViT-E\n(4.4B) for video understanding tasks which is 14x larger than ViT-L (304M). Our\napproach achieves remarkable performance on various video datasets across\nunimodal and cross-modal tasks (i.e., action recognition and text-video\nretrieval), especially in Something-Something V1&V2 (67.3% & 74.6%),\nKinetics-400 (88.6%), MSR-VTT (52.3%), MSVD (56.1%) and VATEX (68.8%). We\nrelease our code at https://github.com/HJYao00/Side4Video.\n","authors":["Huanjin Yao","Wenhao Wu","Zhiheng Li"],"pdf_url":"https://arxiv.org/pdf/2311.15769v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2311.15759v1","updated":"2023-11-27T12:29:20Z","published":"2023-11-27T12:29:20Z","title":"Towards Vision Enhancing LLMs: Empowering Multimodal Knowledge Storage\n and Sharing in LLMs","summary":" Recent advancements in multimodal large language models (MLLMs) have achieved\nsignificant multimodal generation capabilities, akin to GPT-4. These models\npredominantly map visual information into language representation space,\nleveraging the vast knowledge and powerful text generation abilities of LLMs to\nproduce multimodal instruction-following responses. We could term this method\nas LLMs for Vision because of its employing LLMs for visual-language\nunderstanding, yet observe that these MLLMs neglect the potential of harnessing\nvisual knowledge to enhance overall capabilities of LLMs, which could be\nregarded as Vision Enhancing LLMs. In this paper, we propose an approach called\nMKS2, aimed at enhancing LLMs through empowering Multimodal Knowledge Storage\nand Sharing in LLMs. Specifically, we introduce the Modular Visual Memory, a\ncomponent integrated into the internal blocks of LLMs, designed to store\nopen-world visual information efficiently. Additionally, we present a soft\nMixtures-of-Multimodal Experts architecture in LLMs to invoke multimodal\nknowledge collaboration during generation. 
Our comprehensive experiments\ndemonstrate that MKS2 substantially augments the reasoning capabilities of LLMs\nin contexts necessitating physical or commonsense knowledge. It also delivers\ncompetitive results on multimodal benchmarks.\n","authors":["Yunxin Li","Baotian Hu","Wei Wang","Xiaochun Cao","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15759v1.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2304.02833v2","updated":"2023-11-27T12:10:09Z","published":"2023-04-06T02:45:39Z","title":"DoUnseen: Tuning-Free Class-Adaptive Object Detection of Unseen Objects\n for Robotic Grasping","summary":" How can we segment varying numbers of objects where each specific object\nrepresents its own separate class? To make the problem even more realistic, how\ncan we add and delete classes on the fly without retraining or fine-tuning?\nThis is the case of robotic applications where no datasets of the objects exist\nor application that includes thousands of objects (E.g., in logistics) where it\nis impossible to train a single model to learn all of the objects. Most current\nresearch on object segmentation for robotic grasping focuses on class-level\nobject segmentation (E.g., box, cup, bottle), closed sets (specific objects of\na dataset; for example, YCB dataset), or deep learning-based template matching.\nIn this work, we are interested in open sets where the number of classes is\nunknown, varying, and without pre-knowledge about the objects' types. We\nconsider each specific object as its own separate class. Our goal is to develop\nan object detector that requires no fine-tuning and can add any object as a\nclass just by capturing a few images of the object. Our main idea is to break\nthe segmentation pipelines into two steps by combining unseen object\nsegmentation networks cascaded by class-adaptive classifiers. We evaluate our\nclass-adaptive object detector on unseen datasets and compare it to a trained\nMask R-CNN on those datasets. The results show that the performance varies from\npractical to unsuitable depending on the environment setup and the objects\nbeing handled. The code is available in our DoUnseen library repository.\n","authors":["Anas Gouda","Moritz Roidl"],"pdf_url":"https://arxiv.org/pdf/2304.02833v2.pdf","comment":"presented at RSS 2023 Workshop on Perception and Manipulation\n Challenges for Warehouse Automation"},{"id":"http://arxiv.org/abs/2311.15751v1","updated":"2023-11-27T12:08:46Z","published":"2023-11-27T12:08:46Z","title":"PyNanospacing: TEM image processing tool for strain analysis and\n visualization","summary":" The diverse spectrum of material characteristics including band gap,\nmechanical moduli, color, phonon and electronic density of states, along with\ncatalytic and surface properties are intricately intertwined with the atomic\nstructure and the corresponding interatomic bond-lengths. This interconnection\nextends to the manifestation of interplanar spacings within a crystalline\nlattice. Analysis of these interplanar spacings and the comprehension of any\ndeviations, whether it be lattice compression or expansion, commonly referred\nto as strain, hold paramount significance in unraveling various unknowns within\nthe field. Transmission Electron Microscopy (TEM) is widely used to capture\natomic-scale ordering, facilitating direct investigation of interplanar\nspacings. However, creating critical contour maps for visualizing and\ninterpreting lattice stresses in TEM images remains a challenging task. 
Here we\ndeveloped a Python code for TEM image processing that can handle a wide range\nof materials including nanoparticles, 2D materials, pure crystals and solid\nsolutions. This algorithm converts local differences in interplanar spacings\ninto contour maps allowing for a visual representation of lattice expansion and\ncompression. The tool is very generic and can significantly aid in analyzing\nmaterial properties using TEM images, allowing for a more in-depth exploration\nof the underlying science behind strain engineering via strain contour maps at\nthe atomic level.\n","authors":["Mehmet Ali Sarsil","Mubashir Mansoor","Mert Saracoglu","Servet Timur","Mustafa Urgen","Onur Ergen"],"pdf_url":"https://arxiv.org/pdf/2311.15751v1.pdf","comment":"Preprint, 13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2311.15744v1","updated":"2023-11-27T12:02:42Z","published":"2023-11-27T12:02:42Z","title":"One More Step: A Versatile Plug-and-Play Module for Rectifying Diffusion\n Schedule Flaws and Enhancing Low-Frequency Controls","summary":" It is well known that many open-released foundational diffusion models have\ndifficulty in generating images that substantially depart from average\nbrightness, despite such images being present in the training data. This is due\nto an inconsistency: while denoising starts from pure Gaussian noise during\ninference, the training noise schedule retains residual data even in the final\ntimestep distribution, due to difficulties in numerical conditioning in\nmainstream formulation, leading to unintended bias during inference. To\nmitigate this issue, certain $\\epsilon$-prediction models are combined with an\nad-hoc offset-noise methodology. In parallel, some contemporary models have\nadopted zero-terminal SNR noise schedules together with\n$\\mathbf{v}$-prediction, which necessitate major alterations to pre-trained\nmodels. However, such changes risk destabilizing a large multitude of\ncommunity-driven applications anchored on these pre-trained models. In light of\nthis, our investigation revisits the fundamental causes, leading to our\nproposal of an innovative and principled remedy, called One More Step (OMS). By\nintegrating a compact network and incorporating an additional simple yet\neffective step during inference, OMS elevates image fidelity and harmonizes the\ndichotomy between training and inference, while preserving original model\nparameters. Once trained, various pre-trained diffusion models with the same\nlatent domain can share the same OMS module.\n","authors":["Minghui Hu","Jianbin Zheng","Chuanxia Zheng","Chaoyue Wang","Dacheng Tao","Tat-Jen Cham"],"pdf_url":"https://arxiv.org/pdf/2311.15744v1.pdf","comment":"Project Page: https://jabir-zheng.github.io/OneMoreStep/, Demo Page:\n https://huggingface.co/spaces/h1t/oms_sdxl_lcm"},{"id":"http://arxiv.org/abs/2311.15741v1","updated":"2023-11-27T11:46:30Z","published":"2023-11-27T11:46:30Z","title":"Machine Learning-Based Jamun Leaf Disease Detection: A Comprehensive\n Review","summary":" Jamun leaf diseases pose a significant threat to agricultural productivity,\nnegatively impacting both yield and quality in the jamun industry. The advent\nof machine learning has opened up new avenues for tackling these diseases\neffectively. Early detection and diagnosis are essential for successful crop\nmanagement. 
While no automated systems have yet been developed specifically for\njamun leaf disease detection, various automated systems have been implemented\nfor similar types of disease detection using image processing techniques. This\npaper presents a comprehensive review of machine learning methodologies\nemployed for diagnosing plant leaf diseases through image classification, which\ncan be adapted for jamun leaf disease detection. It meticulously assesses the\nstrengths and limitations of various Vision Transformer models, including\nTransfer learning model and vision transformer (TLMViT), SLViT, SE-ViT,\nIterationViT, Tiny-LeViT, IEM-ViT, GreenViT, and PMViT. Additionally, the paper\nreviews models such as Dense Convolutional Network (DenseNet), Residual Neural\nNetwork (ResNet)-50V2, EfficientNet, Ensemble model, Convolutional Neural\nNetwork (CNN), and Locally Reversible Transformer. These machine-learning\nmodels have been evaluated on various datasets, demonstrating their real-world\napplicability. This review not only sheds light on current advancements in the\nfield but also provides valuable insights for future research directions in\nmachine learning-based jamun leaf disease detection and classification.\n","authors":["Auvick Chandra Bhowmik","Dr. Md. Taimur Ahad","Yousuf Rayhan Emon"],"pdf_url":"https://arxiv.org/pdf/2311.15741v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15740v1","updated":"2023-11-27T11:44:46Z","published":"2023-11-27T11:44:46Z","title":"Optimization of Image Processing Algorithms for Character Recognition in\n Cultural Typewritten Documents","summary":" Linked Data is used in various fields as a new way of structuring and\nconnecting data. Cultural heritage institutions have been using linked data to\nimprove archival descriptions and facilitate the discovery of information. Most\narchival records have digital representations of physical artifacts in the form\nof scanned images that are non-machine-readable. Optical Character Recognition\n(OCR) recognizes text in images and translates it into machine-encoded text.\nThis paper evaluates the impact of image processing methods and parameter\ntuning in OCR applied to typewritten cultural heritage documents. The approach\nuses a multi-objective problem formulation to minimize Levenshtein edit\ndistance and maximize the number of words correctly identified with a\nnon-dominated sorting genetic algorithm (NSGA-II) to tune the methods'\nparameters. Evaluation results show that parameterization by digital\nrepresentation typology benefits the performance of image pre-processing\nalgorithms in OCR. 
Furthermore, our findings suggest that employing image\npre-processing algorithms in OCR might be more suitable for typologies where\nthe text recognition task without pre-processing does not produce good results.\nIn particular, Adaptive Thresholding, Bilateral Filter, and Opening are the\nbest-performing algorithms for the theatre plays' covers, letters, and overall\ndataset, respectively, and should be applied before OCR to improve its\nperformance.\n","authors":["Mariana Dias","Carla Teixeira Lopes"],"pdf_url":"https://arxiv.org/pdf/2311.15740v1.pdf","comment":"25 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.03017v3","updated":"2023-11-27T11:38:39Z","published":"2023-07-06T14:31:01Z","title":"RealLiFe: Real-Time Light Field Reconstruction via Hierarchical Sparse\n Gradient Descent","summary":" With the rise of Extended Reality (XR) technology, there is a growing need\nfor real-time light field generation from sparse view inputs. Existing methods\ncan be classified into offline techniques, which can generate high-quality\nnovel views but at the cost of long inference/training time, and online\nmethods, which either lack generalizability or produce unsatisfactory results.\nHowever, we have observed that the intrinsic sparse manifold of Multi-plane\nImages (MPI) enables a significant acceleration of light field generation while\nmaintaining rendering quality. Based on this insight, we introduce EffLiFe, a\nnovel light field optimization method, which leverages the proposed\nHierarchical Sparse Gradient Descent (HSGD) to produce high-quality light\nfields from sparse view images in real time. Technically, the coarse MPI of a\nscene is first generated using a 3D CNN, and it is further sparsely optimized\nby focusing only on important MPI gradients in a few iterations. Nevertheless,\nrelying solely on optimization can lead to artifacts at occlusion boundaries.\nTherefore, we propose an occlusion-aware iterative refinement module that\nremoves visual artifacts in occluded regions by iteratively filtering the\ninput. Extensive experiments demonstrate that our method achieves comparable\nvisual quality while being 100x faster on average than state-of-the-art offline\nmethods and delivering better performance (about 2 dB higher in PSNR) compared\nto other online approaches.\n","authors":["Yijie Deng","Lei Han","Tianpeng Lin","Lin Li","Jinzhi Zhang","Lu Fang"],"pdf_url":"https://arxiv.org/pdf/2307.03017v3.pdf","comment":"Submitted to IEEE TPAMI"},{"id":"http://arxiv.org/abs/2311.15732v1","updated":"2023-11-27T11:29:10Z","published":"2023-11-27T11:29:10Z","title":"GPT4Vis: What Can GPT-4 Do for Zero-shot Visual Recognition?","summary":" This paper does not present a novel method. Instead, it delves into an\nessential, yet must-know baseline in light of the latest advancements in\nGenerative Artificial Intelligence (GenAI): the utilization of GPT-4 for visual\nunderstanding. Our study centers on the evaluation of GPT-4's linguistic and\nvisual capabilities in zero-shot visual recognition tasks. Specifically, we\nexplore the potential of its generated rich textual descriptions across various\ncategories to enhance recognition performance without any training.\nAdditionally, we evaluate its visual proficiency in directly recognizing\ndiverse visual content. To achieve this, we conduct an extensive series of\nexperiments, systematically quantifying the performance of GPT-4 across three\nmodalities: images, videos, and point clouds. 
This comprehensive evaluation\nencompasses a total of 16 widely recognized benchmark datasets, providing top-1\nand top-5 accuracy metrics. Our study reveals that leveraging GPT-4's advanced\nlinguistic knowledge to generate rich descriptions markedly improves zero-shot\nrecognition. In terms of visual proficiency, GPT-4V's average performance\nacross 16 datasets sits roughly between the capabilities of OpenAI-CLIP's ViT-L\nand EVA-CLIP's ViT-E. We hope that this research will contribute valuable data\npoints and experience for future studies. We release our code at\nhttps://github.com/whwu95/GPT4Vis.\n","authors":["Wenhao Wu","Huanjin Yao","Mengxi Zhang","Yuxin Song","Wanli Ouyang","Jingdong Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15732v1.pdf","comment":"Technical report. Work in progress"},{"id":"http://arxiv.org/abs/2311.15728v1","updated":"2023-11-27T11:26:41Z","published":"2023-11-27T11:26:41Z","title":"Adinkra Symbol Recognition using Classical Machine Learning and Deep\n Learning","summary":" Artificial intelligence (AI) has emerged as a transformative influence,\nengendering paradigm shifts in global societies, spanning academia and\nindustry. However, in light of these rapid advances, addressing the\nunderrepresentation of black communities and African countries in AI is\ncrucial. Boosting enthusiasm for AI can be effectively accomplished by\nshowcasing straightforward applications around tasks like identifying and\ncategorizing traditional symbols, such as Adinkra symbols, or familiar objects\nwithin the community. In this research endeavor, we dived into classical\nmachine learning and harnessed the power of deep learning models to tackle the\nintricate task of classifying and recognizing Adinkra symbols. The idea led to\na newly constructed ADINKRA dataset comprising 174,338 images meticulously\norganized into 62 distinct classes, each representing a singular and emblematic\nsymbol. We constructed a CNN model for classification and recognition using six\nconvolutional layers, three fully connected (FC) layers, and optional dropout\nregularization. The model is a simpler and smaller version of VGG, with fewer\nlayers, smaller channel sizes, and a fixed kernel size. Additionally, we tap\ninto the transfer learning capabilities provided by pre-trained models like VGG\nand ResNet. These models assist us in both classifying images and extracting\nfeatures that can be used with classical machine learning models. We assess the\nmodel's performance by measuring its accuracy and convergence rate and\nvisualizing the areas that significantly influence its predictions. These\nevaluations serve as a foundational benchmark for future assessments of the\nADINKRA dataset. We hope this application exemplar inspires ideas on the\nvarious uses of AI in organizing our traditional and modern lives.\n","authors":["Michael Adjeisah","Kwame Omono Asamoah","Martha Asamoah Yeboah","Raji Rafiu King","Godwin Ferguson Achaab","Kingsley Adjei"],"pdf_url":"https://arxiv.org/pdf/2311.15728v1.pdf","comment":"15 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2311.15727v1","updated":"2023-11-27T11:24:25Z","published":"2023-11-27T11:24:25Z","title":"MARIS: Referring Image Segmentation via Mutual-Aware Attention Features","summary":" Referring image segmentation (RIS) aims to segment a particular region based\non a language expression prompt. Existing methods incorporate linguistic\nfeatures into visual features and obtain multi-modal features for mask\ndecoding. 
However, these methods may segment the visually salient entity\ninstead of the correct referring region, as the multi-modal features are\ndominated by the abundant visual context. In this paper, we propose MARIS, a\nreferring image segmentation method that leverages the Segment Anything Model\n(SAM) and introduces a mutual-aware attention mechanism to enhance the\ncross-modal fusion via two parallel branches. Specifically, our mutual-aware\nattention mechanism consists of Vision-Guided Attention and Language-Guided\nAttention, which bidirectionally model the relationship between visual and\nlinguistic features. Correspondingly, we design a Mask Decoder to enable\nexplicit linguistic guidance for more consistent segmentation with the language\nexpression. To this end, a multi-modal query token is proposed to integrate\nlinguistic information and interact with visual information simultaneously.\nExtensive experiments on three benchmark datasets show that our method\noutperforms the state-of-the-art RIS methods. Our code will be publicly\navailable.\n","authors":["Mengxi Zhang","Yiming Liu","Xiangjun Yin","Huanjing Yue","Jingyu Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15722v1","updated":"2023-11-27T11:17:20Z","published":"2023-11-27T11:17:20Z","title":"GLIME: General, Stable and Local LIME Explanation","summary":" As black-box machine learning models grow in complexity and find applications\nin high-stakes scenarios, it is imperative to provide explanations for their\npredictions. Although Local Interpretable Model-agnostic Explanations (LIME)\n[22] is a widely adopted method for understanding model behaviors, it is\nunstable with respect to random seeds [35,24,3] and exhibits low local fidelity\n(i.e., how well the explanation approximates the model's local behaviors)\n[21,16]. Our study shows that this instability problem stems from small sample\nweights, leading to the dominance of regularization and slow convergence.\nAdditionally, LIME's sampling neighborhood is non-local and biased towards the\nreference, resulting in poor local fidelity and sensitivity to reference\nchoice. To tackle these challenges, we introduce GLIME, an enhanced framework\nextending LIME and unifying several prior methods. Within the GLIME framework,\nwe derive an equivalent formulation of LIME that achieves significantly faster\nconvergence and improved stability. By employing a local and unbiased sampling\ndistribution, GLIME generates explanations with higher local fidelity compared\nto LIME. GLIME explanations are independent of reference choice. Moreover,\nGLIME offers users the flexibility to choose a sampling distribution based on\ntheir specific scenarios.\n","authors":["Zeren Tan","Yang Tian","Jian Li"],"pdf_url":"https://arxiv.org/pdf/2311.15722v1.pdf","comment":"Accepted by NeurIPS 2023 as a Spotlight paper"},{"id":"http://arxiv.org/abs/2311.15719v1","updated":"2023-11-27T11:12:33Z","published":"2023-11-27T11:12:33Z","title":"Variational Autoencoders for Feature Exploration and Malignancy\n Prediction of Lung Lesions","summary":" Lung cancer is responsible for 21% of cancer deaths in the UK and five-year\nsurvival rates are heavily influenced by the stage the cancer was identified\nat. Recent studies have demonstrated the capability of AI methods for accurate\nand early diagnosis of lung cancer from routine scans. However, this evidence\nhas not translated into clinical practice with one barrier being a lack of\ninterpretable models. 
This study investigates the application of Variational\nAutoencoders (VAEs), a type of generative AI model, to lung cancer lesions.\nProposed models were trained on lesions extracted from 3D CT scans in the\nLIDC-IDRI public dataset. Latent vector representations of 2D slices produced\nby the VAEs were explored through clustering to justify their quality and used\nin an MLP classifier model for lung cancer diagnosis, the best model achieved\nstate-of-the-art metrics of AUC 0.98 and 93.1% accuracy. Cluster analysis shows\nthe VAE latent space separates the dataset of malignant and benign lesions\nbased on meaningful feature components including tumour size, shape, patient\nand malignancy class. We also include a comparative analysis of the standard\nGaussian VAE (GVAE) and the more recent Dirichlet VAE (DirVAE), which replaces\nthe prior with a Dirichlet distribution to encourage a more explainable latent\nspace with disentangled feature representation. Finally, we demonstrate the\npotential for latent space traversals corresponding to clinically meaningful\nfeature changes.\n","authors":["Benjamin Keel","Aaron Quyn","David Jayne","Samuel D. Relton"],"pdf_url":"https://arxiv.org/pdf/2311.15719v1.pdf","comment":"10 pages (main paper), 5 pages (references), 5 figures, 2 tables,\n work accepted for BMVC 2023"},{"id":"http://arxiv.org/abs/2309.12303v3","updated":"2023-11-27T11:04:07Z","published":"2023-09-21T17:59:02Z","title":"PanoVOS: Bridging Non-panoramic and Panoramic Views with Transformer for\n Video Segmentation","summary":" Panoramic videos contain richer spatial information and have attracted\ntremendous amounts of attention due to their exceptional experience in some\nfields such as autonomous driving and virtual reality. However, existing\ndatasets for video segmentation only focus on conventional planar images. To\naddress the challenge, in this paper, we present a panoramic video dataset,\nPanoVOS. The dataset provides 150 videos with high video resolutions and\ndiverse motions. To quantify the domain gap between 2D planar videos and\npanoramic videos, we evaluate 15 off-the-shelf video object segmentation (VOS)\nmodels on PanoVOS. Through error analysis, we found that all of them fail to\ntackle pixel-level content discontinuities of panoramic videos. Thus, we present a\nPanoramic Space Consistency Transformer (PSCFormer), which can effectively\nutilize the semantic boundary information of the previous frame for pixel-level\nmatching with the current frame. Extensive experiments demonstrate that\ncompared with the previous SOTA models, our PSCFormer network exhibits a great\nadvantage in terms of segmentation results under the panoramic setting. Our\ndataset poses new challenges in panoramic VOS and we hope that our PanoVOS can\nadvance the development of panoramic segmentation/tracking.\n","authors":["Shilin Yan","Xiaohao Xu","Renrui Zhang","Lingyi Hong","Wenchao Chen","Wenqiang Zhang","Wei Zhang"],"pdf_url":"https://arxiv.org/pdf/2309.12303v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15707v1","updated":"2023-11-27T10:50:47Z","published":"2023-11-27T10:50:47Z","title":"SAM-6D: Segment Anything Model Meets Zero-Shot 6D Object Pose Estimation","summary":" Zero-shot 6D object pose estimation involves the detection of novel objects\nwith their 6D poses in cluttered scenes, presenting significant challenges for\nmodel generalizability. 
Fortunately, the recent Segment Anything Model (SAM)\nhas showcased remarkable zero-shot transfer performance, which provides a\npromising solution to tackle this task. Motivated by this, we introduce SAM-6D,\na novel framework designed to realize the task through two steps, including\ninstance segmentation and pose estimation. Given the target objects, SAM-6D\nemploys two dedicated sub-networks, namely Instance Segmentation Model (ISM)\nand Pose Estimation Model (PEM), to perform these steps on cluttered RGB-D\nimages. ISM takes SAM as an advanced starting point to generate all possible\nobject proposals and selectively preserves valid ones through meticulously\ncrafted object matching scores in terms of semantics, appearance and geometry.\nBy treating pose estimation as a partial-to-partial point matching problem, PEM\nperforms a two-stage point matching process featuring a novel design of\nbackground tokens to construct dense 3D-3D correspondence, ultimately yielding\nthe pose estimates. Without bells and whistles, SAM-6D outperforms the existing\nmethods on the seven core datasets of the BOP Benchmark for both instance\nsegmentation and pose estimation of novel objects.\n","authors":["Jiehong Lin","Lihua Liu","Dekun Lu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2311.15707v1.pdf","comment":"Github Page: https://github.com/JiehongLin/SAM-6D"},{"id":"http://arxiv.org/abs/2309.17389v4","updated":"2023-11-27T10:46:03Z","published":"2023-09-29T16:50:38Z","title":"Prompt-based test-time real image dehazing: a novel pipeline","summary":" Existing methods attempt to improve models' generalization ability on\nreal-world hazy images by exploring well-designed training schemes (e.g.,\nCycleGAN, prior loss). However, most of them need very complicated training\nprocedures to achieve satisfactory results. In this work, we present a totally\nnovel testing pipeline called Prompt-based Test-Time Dehazing (PTTD) to help\ngenerate visually pleasing results of real-captured hazy images during the\ninference phase. We experimentally find that given a dehazing model trained on\nsynthetic data, by fine-tuning the statistics (i.e., mean and standard\ndeviation) of encoding features, PTTD is able to narrow the domain gap,\nboosting the performance of real image dehazing. Accordingly, we first apply a\nprompt generation module (PGM) to generate a visual prompt, which is the source\nof appropriate statistical perturbations for mean and standard deviation. And\nthen, we employ the feature adaptation module (FAM) into the existing dehazing\nmodels for adjusting the original statistics with the guidance of the generated\nprompt. Note that, PTTD is model-agnostic and can be equipped with various\nstate-of-the-art dehazing models trained on synthetic hazy-clean pairs.\nExtensive experimental results demonstrate that our PTTD is flexible meanwhile\nachieves superior performance against state-of-the-art dehazing methods in\nreal-world scenarios. 
The source code of our PTTD will be made available at\nhttps://github.com/cecret3350/PTTD-Dehazing.\n","authors":["Zixuan Chen","Zewei He","Ziqian Lu","Xuecheng Sun","Zhe-Ming Lu"],"pdf_url":"https://arxiv.org/pdf/2309.17389v4.pdf","comment":"update github link (https://github.com/cecret3350/PTTD-Dehazing)"},{"id":"http://arxiv.org/abs/2311.13372v2","updated":"2023-11-27T10:42:46Z","published":"2023-11-22T13:13:19Z","title":"MRGazer: Decoding Eye Gaze Points from Functional Magnetic Resonance\n Imaging in Individual Space","summary":" Eye-tracking research has proven valuable in understanding numerous cognitive\nfunctions. Recently, Frey et al. provided an exciting deep learning method for\nlearning eye movements from fMRI data. However, it needed to co-register fMRI\ninto standard space to obtain eyeballs masks, and thus required additional\ntemplates and was time consuming. To resolve this issue, in this paper, we\npropose a framework named MRGazer for predicting eye gaze points from fMRI in\nindividual space. The MRGazer consisted of eyeballs extraction module and a\nresidual network-based eye gaze prediction. Compared to the previous method,\nthe proposed framework skips the fMRI co-registration step, simplifies the\nprocessing protocol and achieves end-to-end eye gaze regression. The proposed\nmethod achieved superior performance in a variety of eye movement tasks than\nthe co-registration-based method, and delivered objective results within a\nshorter time (~ 0.02 Seconds for each volume) than prior method (~0.3 Seconds\nfor each volume).\n","authors":["Xiuwen Wu","Rongjie Hu","Jie Liang","Yanming Wang","Bensheng Qiu","Xiaoxiao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01825v2","updated":"2023-11-27T10:39:13Z","published":"2023-10-03T06:42:28Z","title":"Empirical Study of PEFT techniques for Winter Wheat Segmentation","summary":" Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced\nsignificant growth and have been extensively employed to adapt large vision and\nlanguage models to various domains, enabling satisfactory model performance\nwith minimal computational needs. Despite these advances, more research has yet\nto delve into potential PEFT applications in real-life scenarios, particularly\nin the critical domains of remote sensing and crop monitoring. The diversity of\nclimates across different regions and the need for comprehensive large-scale\ndatasets have posed significant obstacles to accurately identify crop types\nacross varying geographic locations and changing growing seasons. This study\nseeks to bridge this gap by comprehensively exploring the feasibility of\ncross-area and cross-year out-of-distribution generalization using the\nState-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to\nexplore PEFT approaches for crop monitoring. Specifically, we focus on adapting\nthe SOTA TSViT model to address winter wheat field segmentation, a critical\ntask for crop monitoring and food security. This adaptation process involves\nintegrating different PEFT techniques, including BigFit, LoRA, Adaptformer, and\nprompt tuning. Using PEFT techniques, we achieved notable results comparable to\nthose achieved using full fine-tuning methods while training only a mere 0.7%\nparameters of the whole TSViT architecture. 
The in-house labeled data-set,\nreferred to as the Beqaa-Lebanon dataset, comprises high-quality annotated\npolygons for wheat and non-wheat classes with a total surface of 170 kmsq, over\nfive consecutive years. Using Sentinel-2 images, our model achieved a 84%\nF1-score. We intend to publicly release the Lebanese winter wheat data set,\ncode repository, and model weights.\n","authors":["Mohamad Hasan Zahweh","Hasan Nasrallah","Mustafa Shukor","Ghaleb Faour","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06226v2","updated":"2023-11-27T10:31:09Z","published":"2023-03-10T22:21:30Z","title":"NeRFlame: FLAME-based conditioning of NeRF for 3D face rendering","summary":" Traditional 3D face models are based on mesh representations with texture.\nOne of the most important models is FLAME (Faces Learned with an Articulated\nModel and Expressions), which produces meshes of human faces that are fully\ncontrollable. Unfortunately, such models have problems with capturing geometric\nand appearance details. In contrast to mesh representation, the neural radiance\nfield (NeRF) produces extremely sharp renders. However, implicit methods are\nhard to animate and do not generalize well to unseen expressions. It is not\ntrivial to effectively control NeRF models to obtain face manipulation.\n The present paper proposes a novel approach, named NeRFlame, which combines\nthe strengths of both NeRF and FLAME methods. Our method enables high-quality\nrendering capabilities of NeRF while also offering complete control over the\nvisual appearance, similar to FLAME. In contrast to traditional NeRF-based\nstructures that use neural networks for RGB color and volume density modeling,\nour approach utilizes the FLAME mesh as a distinct density volume.\nConsequently, color values exist only in the vicinity of the FLAME mesh. This\nFLAME framework is seamlessly incorporated into the NeRF architecture for\npredicting RGB colors, enabling our model to explicitly represent volume\ndensity and implicitly capture RGB colors.\n","authors":["Wojciech Zając","Joanna Waczyńska","Piotr Borycki","Jacek Tabor","Maciej Zięba","Przemysław Spurek"],"pdf_url":"https://arxiv.org/pdf/2303.06226v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15679v1","updated":"2023-11-27T10:10:25Z","published":"2023-11-27T10:10:25Z","title":"Model-agnostic Body Part Relevance Assessment for Pedestrian Detection","summary":" Model-agnostic explanation methods for deep learning models are flexible\nregarding usability and availability. However, due to the fact that they can\nonly manipulate input to see changes in output, they suffer from weak\nperformance when used with complex model architectures. For models with large\ninputs as, for instance, in object detection, sampling-based methods like\nKernelSHAP are inefficient due to many computation-heavy forward passes through\nthe model. In this work, we present a framework for using sampling-based\nexplanation models in a computer vision context by body part relevance\nassessment for pedestrian detection. 
Furthermore, we introduce a novel\nsampling-based method similar to KernelSHAP that shows more robustness for\nlower sampling sizes and, thus, is more efficient for explainability analyses\non large-scale datasets.\n","authors":["Maurice Günder","Sneha Banerjee","Rafet Sifa","Christian Bauckhage"],"pdf_url":"https://arxiv.org/pdf/2311.15679v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15672v1","updated":"2023-11-27T10:01:31Z","published":"2023-11-27T10:01:31Z","title":"HAVE-FUN: Human Avatar Reconstruction from Few-Shot Unconstrained Images","summary":" As for human avatar reconstruction, contemporary techniques commonly\nnecessitate the acquisition of costly data and struggle to achieve satisfactory\nresults from a small number of casual images. In this paper, we investigate\nthis task from a few-shot unconstrained photo album. The reconstruction of\nhuman avatars from such data sources is challenging because of limited data\namount and dynamic articulated poses. For handling dynamic data, we integrate a\nskinning mechanism with deep marching tetrahedra (DMTet) to form a drivable\ntetrahedral representation, which drives arbitrary mesh topologies generated by\nthe DMTet for the adaptation of unconstrained images. To effectively mine\ninstructive information from few-shot data, we devise a two-phase optimization\nmethod with few-shot reference and few-shot guidance. The former focuses on\naligning avatar identity with reference images, while the latter aims to\ngenerate plausible appearances for unseen regions. Overall, our framework,\ncalled HaveFun, can undertake avatar reconstruction, rendering, and animation.\nExtensive experiments on our developed benchmarks demonstrate that HaveFun\nexhibits substantially superior performance in reconstructing the human body\nand hand. Project website: https://seanchenxy.github.io/HaveFunWeb/.\n","authors":["Xihe Yang","Xingyu Chen","Shaohui Wang","Daiheng Gao","Xiaoguang Han","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15668v1","updated":"2023-11-27T09:55:55Z","published":"2023-11-27T09:55:55Z","title":"Deformation-Guided Unsupervised Non-Rigid Shape Matching","summary":" We present an unsupervised data-driven approach for non-rigid shape matching.\nShape matching identifies correspondences between two shapes and is a\nfundamental step in many computer vision and graphics applications. Our\napproach is designed to be particularly robust when matching shapes digitized\nusing 3D scanners that contain fine geometric detail and suffer from different\ntypes of noise including topological noise caused by the coalescence of\nspatially close surface regions. We build on two strategies. First, using a\nhierarchical patch based shape representation we match shapes consistently in a\ncoarse to fine manner, allowing for robustness to noise. This multi-scale\nrepresentation drastically reduces the dimensionality of the problem when\nmatching at the coarsest scale, rendering unsupervised learning feasible.\nSecond, we constrain this hierarchical matching to be reflected in 3D by\nfitting a patch-wise near-rigid deformation model. Using this constraint, we\nleverage spatial continuity at different scales to capture global shape\nproperties, resulting in matchings that generalize well to data with different\ndeformations and noise characteristics. 
Experiments demonstrate that our\napproach obtains significantly better results on raw 3D scans than\nstate-of-the-art methods, while performing on-par on standard test scenarios.\n","authors":["Aymen Merrouche","Joao Regateiro","Stefanie Wuhrer","Edmond Boyer"],"pdf_url":"https://arxiv.org/pdf/2311.15668v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13959v2","updated":"2023-11-27T09:47:10Z","published":"2023-11-23T12:17:45Z","title":"RankFeat&RankWeight: Rank-1 Feature/Weight Removal for\n Out-of-distribution Detection","summary":" The task of out-of-distribution (OOD) detection is crucial for deploying\nmachine learning models in real-world settings. In this paper, we observe that\nthe singular value distributions of the in-distribution (ID) and OOD features\nare quite different: the OOD feature matrix tends to have a larger dominant\nsingular value than the ID feature, and the class predictions of OOD samples\nare largely determined by it. This observation motivates us to propose\n\\texttt{RankFeat}, a simple yet effective \\emph{post hoc} approach for OOD\ndetection by removing the rank-1 matrix composed of the largest singular value\nand the associated singular vectors from the high-level feature.\n\\texttt{RankFeat} achieves \\emph{state-of-the-art} performance and reduces the\naverage false positive rate (FPR95) by 17.90\\% compared with the previous best\nmethod. The success of \\texttt{RankFeat} motivates us to investigate whether a\nsimilar phenomenon would exist in the parameter matrices of neural networks. We\nthus propose \\texttt{RankWeight} which removes the rank-1 weight from the\nparameter matrices of a single deep layer. Our \\texttt{RankWeight}is also\n\\emph{post hoc} and only requires computing the rank-1 matrix once. As a\nstandalone approach, \\texttt{RankWeight} has very competitive performance\nagainst other methods across various backbones. Moreover, \\texttt{RankWeight}\nenjoys flexible compatibility with a wide range of OOD detection methods. The\ncombination of \\texttt{RankWeight} and \\texttt{RankFeat} refreshes the new\n\\emph{state-of-the-art} performance, achieving the FPR95 as low as 16.13\\% on\nthe ImageNet-1k benchmark. Extensive ablation studies and comprehensive\ntheoretical analyses are presented to support the empirical results.\n","authors":["Yue Song","Nicu Sebe","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13959v2.pdf","comment":"submitted to T-PAMI. arXiv admin note: substantial text overlap with\n arXiv:2209.08590"},{"id":"http://arxiv.org/abs/2311.15660v1","updated":"2023-11-27T09:40:53Z","published":"2023-11-27T09:40:53Z","title":"Technical Report for Argoverse Challenges on 4D Occupancy Forecasting","summary":" This report presents our Le3DE2E_Occ solution for 4D Occupancy Forecasting in\nArgoverse Challenges at CVPR 2023 Workshop on Autonomous Driving (WAD). Our\nsolution consists of a strong LiDAR-based Bird's Eye View (BEV) encoder with\ntemporal fusion and a two-stage decoder, which combines a DETR head and a UNet\ndecoder. The solution was tested on the Argoverse 2 sensor dataset to evaluate\nthe occupancy state 3 seconds in the future. 
Our solution achieved 18% lower L1\nError (3.57) than the baseline and got 1st place on the 4D Occupancy\nForecasting task in Argoverse Challenges at CVPR 2023.\n","authors":["Pengfei Zheng","Kanokphan Lertniphonphan","Feng Chen","Siwei Chen","Bingchuan Sun","Jun Xie","Zhepeng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15658v1","updated":"2023-11-27T09:40:14Z","published":"2023-11-27T09:40:14Z","title":"Regularization by Texts for Latent Diffusion Inverse Solvers","summary":" The recent advent of diffusion models has led to significant progress in\nsolving inverse problems, leveraging these models as effective generative\npriors. Nonetheless, challenges related to the ill-posed nature of such\nproblems remain, often due to inherent ambiguities in measurements. Drawing\ninspiration from the human ability to resolve visual ambiguities through\nperceptual biases, here we introduce a novel latent diffusion inverse solver by\nincorporating regularization by texts (TReg). Specifically, TReg applies the\ntextual description of the preconception of the solution during the reverse\nsampling phase, the description of which is dynamically reinforced through\nnull-text optimization for adaptive negation. Our comprehensive experimental\nresults demonstrate that TReg successfully mitigates ambiguity in latent\ndiffusion inverse solvers, enhancing their effectiveness and accuracy.\n","authors":["Jeongsol Kim","Geon Yeong Park","Hyungjin Chung","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.15658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15657v1","updated":"2023-11-27T09:39:45Z","published":"2023-11-27T09:39:45Z","title":"Enhancing Diffusion Models with Text-Encoder Reinforcement Learning","summary":" Text-to-image diffusion models are typically trained to optimize the\nlog-likelihood objective, which presents challenges in meeting specific\nrequirements for downstream tasks, such as image aesthetics and image-text\nalignment. Recent research addresses this issue by refining the diffusion U-Net\nusing human rewards through reinforcement learning or direct backpropagation.\nHowever, many of them overlook the importance of the text encoder, which is\ntypically pretrained and fixed during training. In this paper, we demonstrate\nthat by finetuning the text encoder through reinforcement learning, we can\nenhance the text-image alignment of the results, thereby improving the visual\nquality. Our primary motivation comes from the observation that the current\ntext encoder is suboptimal, often requiring careful prompt adjustment. While\nfine-tuning the U-Net can partially improve performance, it still suffers\nfrom the suboptimal text encoder. Therefore, we propose to use reinforcement\nlearning with low-rank adaptation to finetune the text encoder based on\ntask-specific rewards, referred to as \textbf{TexForce}.
We first show that\nfinetuning the text encoder can improve the performance of diffusion models.\nThen, we illustrate that TexForce can be simply combined with existing U-Net\nfinetuned models to get much better results without additional training.\nFinally, we showcase the adaptability of our method in diverse applications,\nincluding the generation of high-quality face and hand images.\n","authors":["Chaofeng Chen","Annan Wang","Haoning Wu","Liang Liao","Wenxiu Sun","Qiong Yan","Weisi Lin"],"pdf_url":"https://arxiv.org/pdf/2311.15657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15648v1","updated":"2023-11-27T09:20:12Z","published":"2023-11-27T09:20:12Z","title":"Reinforcement Learning from Diffusion Feedback: Q* for Image Search","summary":" Large vision-language models are steadily gaining personalization\ncapabilities at the cost of fine-tuning or data augmentation. We present two\nmodels for image generation using model-agnostic learning that align semantic\npriors with generative capabilities. RLDF, or Reinforcement Learning from\nDiffusion Feedback, is a singular approach for visual imitation through\nprior-preserving reward function guidance. This employs Q-learning (with\nstandard Q*) for generation and follows a semantic-rewarded trajectory for\nimage search through finite encoding-tailored actions. The second proposed\nmethod, noisy diffusion gradient, is optimization driven. At the root of both\nmethods is a special CFG encoding that we propose for continual semantic\nguidance. Using only a single input image and no text input, RLDF generates\nhigh-quality images over varied domains including retail, sports and\nagriculture showcasing class-consistency and strong visual diversity. Project\nwebsite is available at https://infernolia.github.io/RLDF.\n","authors":["Aboli Marathe"],"pdf_url":"https://arxiv.org/pdf/2311.15648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14969v2","updated":"2023-11-27T09:06:55Z","published":"2023-08-29T01:47:49Z","title":"Uncovering the Hidden Cost of Model Compression","summary":" In the era of resource-intensive foundation models, efficient adaptation in\ndownstream tasks has become paramount. Visual Prompting (VP), inspired by\nprompting in Large Language Models (LLMs), has emerged as a key transfer\nlearning method in computer vision. Aligned with the growing significance of\nefficiency, research in model compression has become pivotal to alleviate the\ncomputational burden in both training and deploying over-parameterized neural\nnetworks. A key goal in model compression is the development of sparse models\ncapable of matching or surpassing the performance of their over-parameterized,\ndense counterparts. While prior research has explored the impact of model\nsparsity on transfer learning, its effects on visual prompting-based transfer\nremain unclear. This study addresses this gap, revealing that model sparsity\nadversely affects the performance of visual prompting-based transfer,\nparticularly in low-data-volume scenarios. Furthermore, our findings highlight\nthe negative influence of sparsity on the calibration of downstream\nvisual-prompted models. This empirical exploration calls for a nuanced\nunderstanding beyond accuracy in sparse settings, opening avenues for further\nresearch in Visual Prompting for sparse models. 
Code and logs can be accessed\nat https://github.com/landskape-ai/Reprogram_LT .\n","authors":["Diganta Misra","Agam Goyal","Bharat Runwal","Pin Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14969v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2311.15637v1","updated":"2023-11-27T09:02:21Z","published":"2023-11-27T09:02:21Z","title":"PaintNeSF: Artistic Creation of Stylized Scenes with Vectorized 3D\n Strokes","summary":" We present Paint Neural Stroke Field (PaintNeSF), a novel technique to\ngenerate stylized images of a 3D scene at arbitrary novel views from multi-view\n2D images. Different from existing methods which apply stylization to trained\nneural radiance fields at the voxel level, our approach draws inspiration from\nimage-to-painting methods, simulating the progressive painting process of human\nartwork with vector strokes. We develop a palette of stylized 3D strokes from\nbasic primitives and splines, and consider the 3D scene stylization task as a\nmulti-view reconstruction process based on these 3D stroke primitives. Instead\nof directly searching for the parameters of these 3D strokes, which would be\ntoo costly, we introduce a differentiable renderer that allows optimizing\nstroke parameters using gradient descent, and propose a training scheme to\nalleviate the vanishing gradient issue. The extensive evaluation demonstrates\nthat our approach effectively synthesizes 3D scenes with significant geometric\nand aesthetic stylization while maintaining a consistent appearance across\ndifferent views. Our method can be further integrated with style loss and\nimage-text contrastive models to extend its applications, including color\ntransfer and text-driven 3D scene drawing.\n","authors":["Hao-Bin Duan","Miao Wang","Yan-Xun Li","Yong-Liang Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15637v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00397v2","updated":"2023-11-27T09:02:06Z","published":"2023-11-01T09:46:59Z","title":"Towards Omni-supervised Referring Expression Segmentation","summary":" Referring Expression Segmentation (RES) is an emerging task in computer\nvision, which segments the target instances in images based on text\ndescriptions. However, its development is plagued by the expensive segmentation\nlabels. To address this issue, we propose a new learning task for RES called\nOmni-supervised Referring Expression Segmentation (Omni-RES), which aims to\nmake full use of unlabeled, fully labeled and weakly labeled data, e.g.,\nreferring points or grounding boxes, for efficient RES training. To accomplish\nthis task, we also propose a novel yet strong baseline method for Omni-RES\nbased on the recently popular teacher-student learning, where the weak labels\nare not directly transformed into supervision signals but used as a yardstick\nto select and refine high-quality pseudo-masks for teacher-student learning. To\nvalidate the proposed Omni-RES method, we apply it to a set of state-of-the-art\nRES models and conduct extensive experiments on a bunch of RES datasets. The\nexperimental results yield the obvious merits of Omni-RES than the\nfully-supervised and semi-supervised training schemes. For instance, with only\n10% fully labeled data, Omni-RES can help the base model achieve 100% fully\nsupervised performance, and it also outperform the semi-supervised alternative\nby a large margin, e.g., +14.93% on RefCOCO and +14.95% on RefCOCO+,\nrespectively. 
More importantly, Omni-RES also enables the use of large-scale\nvision-language datasets like Visual Genome to facilitate low-cost RES training, and\nachieves new SOTA performance on RES, e.g., 80.66 on RefCOCO.\n","authors":["Minglang Huang","Yiyi Zhou","Gen Luo","Guannan Jiang","Weilin Zhuang","Xiaoshuai Sun"],"pdf_url":"https://arxiv.org/pdf/2311.00397v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15625v1","updated":"2023-11-27T08:44:00Z","published":"2023-11-27T08:44:00Z","title":"Only Positive Cases: 5-fold High-order Attention Interaction Model for\n Skin Segmentation Derived Classification","summary":" Computer-aided diagnosis of skin diseases is an important tool. However, the\ninterpretability of computer-aided diagnosis is currently poor. Dermatologists\nand patients cannot intuitively understand the learning and prediction process\nof neural networks, which will lead to a decrease in the credibility of\ncomputer-aided diagnosis. In addition, traditional methods need to be trained\nusing negative samples in order to predict the presence or absence of a lesion,\nbut medical data is often in short supply. In this paper, we propose a multiple\nhigh-order attention interaction model (MHA-UNet) for use in a highly\nexplainable skin lesion segmentation task. MHA-UNet is able to infer the\npresence or absence of a lesion by explainable reasoning without the need for\ntraining on negative samples. Specifically, we propose a high-order attention\ninteraction mechanism that introduces squeeze attention to a higher level for\nfeature attention. In addition, a multiple high-order attention interaction\n(MHAblock) module is proposed by combining the different features of different\norders. For classifying the presence or absence of lesions, we conducted\nclassification experiments on several publicly available datasets in the\nabsence of negative samples, based on explainable reasoning about the\ninteraction of 5 attention orders of MHAblock. The highest positive detection\nrate obtained from the experiments was 81.0% and the highest negative detection\nrate was 83.5%. For segmentation experiments, comparison experiments of the\nproposed method with 13 medical segmentation models and external validation\nexperiments with 8 state-of-the-art models on three public datasets and our\nclinical dataset demonstrate the state-of-the-art performance of our model. The\ncode is available from https://github.com/wurenkai/MHA-UNet.\n","authors":["Renkai Wu","Yinghao Liu","Pengchen Liang","Qing Chang"],"pdf_url":"https://arxiv.org/pdf/2311.15625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08872v5","updated":"2023-11-27T08:42:07Z","published":"2023-10-13T05:48:42Z","title":"R&B: Region and Boundary Aware Zero-shot Grounded Text-to-image\n Generation","summary":" Recent text-to-image (T2I) diffusion models have achieved remarkable progress\nin generating high-quality images given text-prompts as input. However, these\nmodels fail to convey appropriate spatial composition specified by a layout\ninstruction.
In this work, we probe into zero-shot grounded T2I generation with\ndiffusion models, that is, generating images corresponding to the input layout\ninformation without training auxiliary modules or finetuning diffusion models.\nWe propose a Region and Boundary (R&B) aware cross-attention guidance approach\nthat gradually modulates the attention maps of diffusion model during\ngenerative process, and assists the model to synthesize images (1) with high\nfidelity, (2) highly compatible with textual input, and (3) interpreting layout\ninstructions accurately. Specifically, we leverage the discrete sampling to\nbridge the gap between consecutive attention maps and discrete layout\nconstraints, and design a region-aware loss to refine the generative layout\nduring diffusion process. We further propose a boundary-aware loss to\nstrengthen object discriminability within the corresponding regions.\nExperimental results show that our method outperforms existing state-of-the-art\nzero-shot grounded T2I generation methods by a large margin both qualitatively\nand quantitatively on several benchmarks.\n","authors":["Jiayu Xiao","Henglei Lv","Liang Li","Shuhui Wang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2310.08872v5.pdf","comment":"Preprint. Under review. Project page:\n https://sagileo.github.io/Region-and-Boundary"},{"id":"http://arxiv.org/abs/2311.15619v1","updated":"2023-11-27T08:32:28Z","published":"2023-11-27T08:32:28Z","title":"Align before Adapt: Leveraging Entity-to-Region Alignments for\n Generalizable Video Action Recognition","summary":" Large-scale visual-language pre-trained models have achieved significant\nsuccess in various video tasks. However, most existing methods follow an \"adapt\nthen align\" paradigm, which adapts pre-trained image encoders to model\nvideo-level representations and utilizes one-hot or text embedding of the\naction labels for supervision. This paradigm overlooks the challenge of mapping\nfrom static images to complicated activity concepts. In this paper, we propose\na novel \"Align before Adapt\" (ALT) paradigm. Prior to adapting to video\nrepresentation learning, we exploit the entity-to-region alignments for each\nframe. The alignments are fulfilled by matching the region-aware image\nembeddings to an offline-constructed text corpus. With the aligned entities, we\nfeed their text embeddings to a transformer-based video adapter as the queries,\nwhich can help extract the semantics of the most important entities from a\nvideo to a vector. This paradigm reuses the visual-language alignment of VLP\nduring adaptation and tries to explain an action by the underlying entities.\nThis helps understand actions by bridging the gap with complex activity\nsemantics, particularly when facing unfamiliar or unseen categories. ALT\nachieves competitive performance and superior generalizability while requiring\nsignificantly low computational costs. In fully supervised scenarios, it\nachieves 88.1% top-1 accuracy on Kinetics-400 with only 4947 GFLOPs. 
In 2-shot\nexperiments, ALT outperforms the previous state-of-the-art by 7.1% and 9.2% on\nHMDB-51 and UCF-101, respectively.\n","authors":["Yifei Chen","Dapeng Chen","Ruijin Liu","Sai Zhou","Wenyuan Xue","Wei Peng"],"pdf_url":"https://arxiv.org/pdf/2311.15619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12092v2","updated":"2023-11-27T08:29:54Z","published":"2023-11-20T18:59:01Z","title":"Concept Sliders: LoRA Adaptors for Precise Control in Diffusion Models","summary":" We present a method to create interpretable concept sliders that enable\nprecise control over attributes in image generations from diffusion models. Our\napproach identifies a low-rank parameter direction corresponding to one concept\nwhile minimizing interference with other attributes. A slider is created using\na small set of prompts or sample images; thus slider directions can be created\nfor either textual or visual concepts. Concept Sliders are plug-and-play: they\ncan be composed efficiently and continuously modulated, enabling precise\ncontrol over image generation. In quantitative experiments comparing to\nprevious editing techniques, our sliders exhibit stronger targeted edits with\nlower interference. We showcase sliders for weather, age, styles, and\nexpressions, as well as slider compositions. We show how sliders can transfer\nlatents from StyleGAN for intuitive editing of visual concepts for which\ntextual description is difficult. We also find that our method can help address\npersistent quality issues in Stable Diffusion XL including repair of object\ndeformations and fixing distorted hands. Our code, data, and trained sliders\nare available at https://sliders.baulab.info/\n","authors":["Rohit Gandikota","Joanna Materzynska","Tingrui Zhou","Antonio Torralba","David Bau"],"pdf_url":"https://arxiv.org/pdf/2311.12092v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15615v1","updated":"2023-11-27T08:25:23Z","published":"2023-11-27T08:25:23Z","title":"Technical Report for Argoverse Challenges on Unified Sensor-based\n Detection, Tracking, and Forecasting","summary":" This report presents our Le3DE2E solution for unified sensor-based detection,\ntracking, and forecasting in Argoverse Challenges at CVPR 2023 Workshop on\nAutonomous Driving (WAD). We propose a unified network that incorporates three\ntasks, including detection, tracking, and forecasting. This solution adopts a\nstrong Bird's Eye View (BEV) encoder with spatial and temporal fusion and\ngenerates unified representations for multi-tasks. The solution was tested in\nthe Argoverse 2 sensor dataset to evaluate the detection, tracking, and\nforecasting of 26 object categories. We achieved 1st place in Detection,\nTracking, and Forecasting on the E2E Forecasting track in Argoverse Challenges\nat CVPR 2023 WAD.\n","authors":["Zhepeng Wang","Feng Chen","Kanokphan Lertniphonphan","Siwei Chen","Jinyao Bao","Pengfei Zheng","Jinbao Zhang","Kaer Huang","Tao Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15615v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.05646v2","updated":"2023-11-27T08:12:14Z","published":"2023-04-12T06:49:56Z","title":"Breaking Modality Disparity: Harmonized Representation for Infrared and\n Visible Image Registration","summary":" Since the differences in viewing range, resolution and relative position, the\nmulti-modality sensing module composed of infrared and visible cameras needs to\nbe registered so as to have more accurate scene perception. 
In practice, manual\ncalibration-based registration is the most widely used process, and it is\nregularly calibrated to maintain accuracy, which is time-consuming and\nlabor-intensive. To cope with these problems, we propose a scene-adaptive\ninfrared and visible image registration method. Specifically, with regard to the\ndiscrepancy between multi-modality images, an invertible translation process is\ndeveloped to establish a modality-invariant domain, which comprehensively\nembraces the feature intensity and distribution of both infrared and visible\nmodalities. We employ homography to simulate the deformation between different\nplanes and develop a hierarchical framework to rectify the deformation inferred\nfrom the proposed latent representation in a coarse-to-fine manner. For that,\nthe advanced perception ability coupled with the residual estimation is conducive\nto the regression of sparse offsets, and the alternate correlation search\nfacilitates more accurate correspondence matching. Moreover, we propose the\nfirst misaligned infrared and visible image dataset with available ground truth,\ninvolving three synthetic sets and one real-world set. Extensive experiments\nvalidate the effectiveness of the proposed method against the\nstate-of-the-art methods, advancing the subsequent applications.\n","authors":["Zhiying Jiang","Zengxi Zhang","Jinyuan Liu","Xin Fan","Risheng Liu"],"pdf_url":"https://arxiv.org/pdf/2304.05646v2.pdf","comment":"10 pages, 11 figures"},{"id":"http://arxiv.org/abs/2311.15609v1","updated":"2023-11-27T08:06:56Z","published":"2023-11-27T08:06:56Z","title":"A manometric feature descriptor with linear-SVM to distinguish\n esophageal contraction vigor","summary":" In clinical practice, if a patient presents with nonmechanical obstructive dysphagia,\nesophageal chest pain, and gastroesophageal reflux symptoms, the physician\nwill usually assess the esophageal dynamic function. High-resolution manometry\n(HRM) is a commonly used clinical technique for comprehensive and objective detection of\nesophageal dynamic function. However, after the results of\nHRM are obtained, doctors still need to evaluate them using a variety of parameters.\nThis work is burdensome, and the process is complex. We conducted image\nprocessing of HRM to predict the esophageal contraction vigor for assisting the\nevaluation of esophageal dynamic function. Firstly, we used Feature-Extraction\nand Histogram of Gradients (FE-HOG) to analyze features of the proposal of swallow\n(PoS) and further extract higher-order features. Then we classify the\nesophageal contraction vigor as normal, weak or failed by using\nlinear-SVM according to these features. Our data set includes 3000 training\nsamples, 500 validation samples and 411 test samples. After verification, our accuracy\nreaches 86.83%, which is higher than other common machine learning methods.\n","authors":["Jialin Liu","Lu Yan","Xiaowei Liu","Yuzhuo Dai","Fanggen Lu","Yuanting Ma","Muzhou Hou","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15607v1","updated":"2023-11-27T08:00:53Z","published":"2023-11-27T08:00:53Z","title":"Spatially Covariant Image Registration with Text Prompts","summary":" Medical images are often characterized by their structured anatomical\nrepresentations and spatially inhomogeneous contrasts. Leveraging anatomical\npriors in neural networks can greatly enhance their utility in\nresource-constrained clinical settings.
Prior research has harnessed such\ninformation for image segmentation, yet progress in deformable image\nregistration has been modest. Our work introduces textSCF, a novel method that\nintegrates spatially covariant filters and textual anatomical prompts encoded\nby visual-language models, to fill this gap. This approach optimizes an\nimplicit function that correlates text embeddings of anatomical regions to\nfilter weights, relaxing the typical translation-invariance constraint of\nconvolutional operations. TextSCF not only boosts computational efficiency but\ncan also retain or improve registration accuracy. By capturing the contextual\ninterplay between anatomical regions, it offers impressive inter-regional\ntransferability and the ability to preserve structural discontinuities during\nregistration. TextSCF's performance has been rigorously tested on inter-subject\nbrain MRI and abdominal CT registration tasks, outperforming existing\nstate-of-the-art models in the MICCAI Learn2Reg 2021 challenge and leading the\nleaderboard. In abdominal registrations, textSCF's larger model variant\nimproved the Dice score by 11.3% over the second-best model, while its smaller\nvariant maintained similar accuracy but with an 89.13% reduction in network\nparameters and a 98.34\\% decrease in computational operations.\n","authors":["Hang Zhang","Xiang Chen","Rongguang Wang","Renjiu Hu","Dongdong Liu","Gaolei Li"],"pdf_url":"https://arxiv.org/pdf/2311.15607v1.pdf","comment":"15 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2311.15605v1","updated":"2023-11-27T07:57:29Z","published":"2023-11-27T07:57:29Z","title":"2D Feature Distillation for Weakly- and Semi-Supervised 3D Semantic\n Segmentation","summary":" As 3D perception problems grow in popularity and the need for large-scale\nlabeled datasets for LiDAR semantic segmentation increase, new methods arise\nthat aim to reduce the necessity for dense annotations by employing\nweakly-supervised training. However these methods continue to show weak\nboundary estimation and high false negative rates for small objects and distant\nsparse regions. We argue that such weaknesses can be compensated by using RGB\nimages which provide a denser representation of the scene. We propose an\nimage-guidance network (IGNet) which builds upon the idea of distilling high\nlevel feature information from a domain adapted synthetically trained 2D\nsemantic segmentation network. We further utilize a one-way contrastive\nlearning scheme alongside a novel mixing strategy called FOVMix, to combat the\nhorizontal field-of-view mismatch between the two sensors and enhance the\neffects of image guidance. IGNet achieves state-of-the-art results for\nweakly-supervised LiDAR semantic segmentation on ScribbleKITTI, boasting up to\n98% relative performance to fully supervised training with only 8% labeled\npoints, while introducing no additional annotation burden or\ncomputational/memory cost during inference. 
Furthermore, we show that our\ncontributions also prove effective for semi-supervised training, where IGNet\nclaims state-of-the-art results on both ScribbleKITTI and SemanticKITTI.\n","authors":["Ozan Unal","Dengxin Dai","Lukas Hoyer","Yigit Baran Can","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2311.15605v1.pdf","comment":"Accepted at WACV 2024"},{"id":"http://arxiv.org/abs/2311.15599v1","updated":"2023-11-27T07:48:50Z","published":"2023-11-27T07:48:50Z","title":"UniRepLKNet: A Universal Perception Large-Kernel ConvNet for Audio,\n Video, Point Cloud, Time-Series and Image Recognition","summary":" Large-kernel convolutional neural networks (ConvNets) have recently received\nextensive research attention, but there are two unresolved and critical issues\nthat demand further investigation. 1) The architectures of existing\nlarge-kernel ConvNets largely follow the design principles of conventional\nConvNets or transformers, while the architectural design for large-kernel\nConvNets remains under-addressed. 2) As transformers have dominated multiple\nmodalities, it remains to be investigated whether ConvNets also have a strong\nuniversal perception ability in domains beyond vision. In this paper, we\ncontribute from two aspects. 1) We propose four architectural guidelines for\ndesigning large-kernel ConvNets, the core of which is to exploit the essential\ncharacteristics of large kernels that distinguish them from small kernels -\nthey can see wide without going deep. Following such guidelines, our proposed\nlarge-kernel ConvNet shows leading performance in image recognition. For\nexample, our models achieve an ImageNet accuracy of 88.0%, ADE20K mIoU of\n55.6%, and COCO box AP of 56.4%, demonstrating better performance and higher\nspeed than a number of recently proposed powerful competitors. 2) We discover\nthat large kernels are the key to unlocking the exceptional performance of\nConvNets in domains where they were originally not proficient. With certain\nmodality-related preprocessing approaches, the proposed model achieves\nstate-of-the-art performance on time-series forecasting and audio recognition\ntasks even without modality-specific customization to the architecture. Code\nand all the models at https://github.com/AILab-CVC/UniRepLKNet.\n","authors":["Xiaohan Ding","Yiyuan Zhang","Yixiao Ge","Sijie Zhao","Lin Song","Xiangyu Yue","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2311.15599v1.pdf","comment":"Code, all the models and reproducible training scripts at\n https://github.com/AILab-CVC/UniRepLKNet"},{"id":"http://arxiv.org/abs/2311.15596v1","updated":"2023-11-27T07:44:25Z","published":"2023-11-27T07:44:25Z","title":"Can Vision-Language Models Think from a First-Person Perspective?","summary":" Vision-language models (VLMs) have recently shown promising results in\ntraditional downstream tasks. Evaluation studies have emerged to assess their\nabilities, with the majority focusing on the third-person perspective, and only\na few addressing specific tasks from the first-person perspective. However, the\ncapability of VLMs to \"think\" from a first-person perspective, a crucial\nattribute for advancing autonomous agents and robotics, remains largely\nunexplored. To bridge this research gap, we introduce EgoThink, a novel visual\nquestion-answering benchmark that encompasses six core capabilities with twelve\ndetailed dimensions. 
The benchmark is constructed using selected clips from\negocentric videos, with manually annotated question-answer pairs containing\nfirst-person information. To comprehensively assess VLMs, we evaluate eighteen\npopular VLMs on EgoThink. Moreover, given the open-ended format of the answers,\nwe use GPT-4 as the automatic judge to compute single-answer grading.\nExperimental results indicate that although GPT-4V leads in numerous\ndimensions, all evaluated VLMs still possess considerable potential for\nimprovement in first-person perspective tasks. Meanwhile, enlarging the number\nof trainable parameters has the most significant impact on model performance on\nEgoThink. In conclusion, EgoThink serves as a valuable addition to existing\nevaluation benchmarks for VLMs, providing an indispensable resource for future\nresearch in the realm of embodied artificial intelligence and robotics.\n","authors":["Sijie Cheng","Zhicheng Guo","Jingwen Wu","Kechen Fang","Peng Li","Huaping Liu","Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15586v1","updated":"2023-11-27T07:24:50Z","published":"2023-11-27T07:24:50Z","title":"An Ensemble of 2.5D ResUnet Based Models for Segmentation for Kidney and\n Masses","summary":" The automatic segmentation of kidney, kidney tumor and kidney cyst on\nComputed Tomography (CT) scans is a challenging task due to the indistinct\nlesion boundaries and fuzzy texture. Considering the large range and unbalanced\ndistribution of CT scans' thickness, a 2.5D ResUnet is adopted to build an\nefficient coarse-to-fine semantic segmentation framework in this work. A set of\n489 CT scans is used for training and validation, and an independent,\nnever-before-used set of CT scans is used for testing. Finally, we demonstrate the\neffectiveness of our proposed method. The dice values on the test set are 0.954,\n0.792 and 0.691, and the surface dice values are 0.897, 0.591 and 0.541 for kidney, tumor\nand cyst, respectively. The average inference time of each CT scan is 20.65s\nand the max GPU memory is 3525MB. The results suggest a better trade-off\nbetween model performance and efficiency.\n","authors":["Cancan Chen"," RongguoZhang"],"pdf_url":"https://arxiv.org/pdf/2311.15586v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2311.15584v1","updated":"2023-11-27T07:19:41Z","published":"2023-11-27T07:19:41Z","title":"A deep learning approach for marine snow synthesis and removal","summary":" Marine snow, the floating particles in underwater images, severely degrades\nthe visibility and performance of human and machine vision systems. This paper\nproposes a novel method to reduce the marine snow interference using deep\nlearning techniques. We first synthesize realistic marine snow samples by\ntraining a Generative Adversarial Network (GAN) model and combine them with\nnatural underwater images to create a paired dataset. We then train a U-Net\nmodel to perform marine snow removal as an image-to-image translation task. Our\nexperiments show that the U-Net model can effectively remove both synthetic and\nnatural marine snow with high accuracy, outperforming state-of-the-art methods\nsuch as the Median filter and its adaptive variant. We also demonstrate the\nrobustness of our method by testing it on the MSRB dataset, which contains\nsynthetic artifacts that our model has not seen during training.
Our method is\na practical and efficient solution for enhancing underwater images affected by\nmarine snow.\n","authors":["Fernando Galetto","Guang Deng"],"pdf_url":"https://arxiv.org/pdf/2311.15584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15581v1","updated":"2023-11-27T07:19:10Z","published":"2023-11-27T07:19:10Z","title":"Real Time GAZED: Online Shot Selection and Editing of Virtual Cameras\n from Wide-Angle Monocular Video Recordings","summary":" Eliminating time-consuming post-production processes and delivering\nhigh-quality videos in today's fast-paced digital landscape are the key\nadvantages of real-time approaches. To address these needs, we present Real\nTime GAZED: a real-time adaptation of the GAZED framework integrated with\nCineFilter, a novel real-time camera trajectory stabilization approach. It\nenables users to create professionally edited videos in real-time. Comparative\nevaluations against baseline methods, including the non-real-time GAZED,\ndemonstrate that Real Time GAZED achieves similar editing results, ensuring\nhigh-quality video output. Furthermore, a user study confirms the aesthetic\nquality of the video edits produced by the Real Time GAZED approach. With these\nadvancements in real-time camera trajectory optimization and video editing\npresented, the demand for immediate and dynamic content creation in industries\nsuch as live broadcasting, sports coverage, news reporting, and social media\ncontent creation can be met more efficiently.\n","authors":["Sudheer Achary","Rohit Girmaji","Adhiraj Anil Deshmukh","Vineet Gandhi"],"pdf_url":"https://arxiv.org/pdf/2311.15581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15573v1","updated":"2023-11-27T06:55:53Z","published":"2023-11-27T06:55:53Z","title":"EucliDreamer: Fast and High-Quality Texturing for 3D Models with Stable\n Diffusion Depth","summary":" This paper presents a novel method to generate textures for 3D models given\ntext prompts and 3D meshes. Additional depth information is taken into account\nto perform the Score Distillation Sampling (SDS) process [28] with depth\nconditional Stable Diffusion [34]. We ran our model over the open-source\ndataset Objaverse [7] and conducted a user study to compare the results with\nthose of various 3D texturing methods. We have shown that our model can\ngenerate more satisfactory results and produce various art styles for the same\nobject. In addition, we achieved faster time when generating textures of\ncomparable quality. We also conduct thorough ablation studies of how different\nfactors may affect generation quality, including sampling steps, guidance\nscale, negative prompts, data augmentation, elevation range, and alternatives\nto SDS.\n","authors":["Cindy Le","Congrui Hetang","Ang Cao","Yihui He"],"pdf_url":"https://arxiv.org/pdf/2311.15573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11317v3","updated":"2023-11-27T06:52:44Z","published":"2023-11-19T13:07:06Z","title":"Discrete approximations of Gaussian smoothing and Gaussian derivatives","summary":" This paper develops an in-depth treatment concerning the problem of\napproximating the Gaussian smoothing and Gaussian derivative computations in\nscale-space theory for application on discrete data. 
With close connections to\nprevious axiomatic treatments of continuous and discrete scale-space theory, we\nconsider three main ways discretizing these scale-space operations in terms of\nexplicit discrete convolutions, based on either (i) sampling the Gaussian\nkernels and the Gaussian derivative kernels, (ii) locally integrating the\nGaussian kernels and the Gaussian derivative kernels over each pixel support\nregion and (iii) basing the scale-space analysis on the discrete analogue of\nthe Gaussian kernel, and then computing derivative approximations by applying\nsmall-support central difference operators to the spatially smoothed image\ndata.\n We study the properties of these three main discretization methods both\ntheoretically and experimentally, and characterize their performance by\nquantitative measures, including the results they give rise to with respect to\nthe task of scale selection, investigated for four different use cases, and\nwith emphasis on the behaviour at fine scales. The results show that the\nsampled Gaussian kernels and derivatives as well as the integrated Gaussian\nkernels and derivatives perform very poorly at very fine scales. At very fine\nscales, the discrete analogue of the Gaussian kernel with its corresponding\ndiscrete derivative approximations performs substantially better. The sampled\nGaussian kernel and the sampled Gaussian derivatives do, on the other hand,\nlead to numerically very good approximations of the corresponding continuous\nresults, when the scale parameter is sufficiently large, in the experiments\npresented in the paper, when the scale parameter is greater than a value of\nabout 1, in units of the grid spacing.\n","authors":["Tony Lindeberg"],"pdf_url":"https://arxiv.org/pdf/2311.11317v3.pdf","comment":"38 pages, 34 figures"},{"id":"http://arxiv.org/abs/2311.15571v1","updated":"2023-11-27T06:45:22Z","published":"2023-11-27T06:45:22Z","title":"Video-based Visible-Infrared Person Re-Identification with Auxiliary\n Samples","summary":" Visible-infrared person re-identification (VI-ReID) aims to match persons\ncaptured by visible and infrared cameras, allowing person retrieval and\ntracking in 24-hour surveillance systems. Previous methods focus on learning\nfrom cross-modality person images in different cameras. However, temporal\ninformation and single-camera samples tend to be neglected. To crack this nut,\nin this paper, we first contribute a large-scale VI-ReID dataset named\nBUPTCampus. Different from most existing VI-ReID datasets, it 1) collects\ntracklets instead of images to introduce rich temporal information, 2) contains\npixel-aligned cross-modality sample pairs for better modality-invariant\nlearning, 3) provides one auxiliary set to help enhance the optimization, in\nwhich each identity only appears in a single camera. Based on our constructed\ndataset, we present a two-stream framework as baseline and apply Generative\nAdversarial Network (GAN) to narrow the gap between the two modalities. To\nexploit the advantages introduced by the auxiliary set, we propose a curriculum\nlearning based strategy to jointly learn from both primary and auxiliary sets.\nMoreover, we design a novel temporal k-reciprocal re-ranking method to refine\nthe ranking list with fine-grained temporal correlation cues. Experimental\nresults demonstrate the effectiveness of the proposed methods. We also\nreproduce 9 state-of-the-art image-based and video-based VI-ReID methods on\nBUPTCampus and our methods show substantial superiority to them. 
The codes and\ndataset are available at: https://github.com/dyhBUPT/BUPTCampus.\n","authors":["Yunhao Du","Cheng Lei","Zhicheng Zhao","Yuan Dong","Fei Su"],"pdf_url":"https://arxiv.org/pdf/2311.15571v1.pdf","comment":"Accepted by Transactions on Information Forensics & Security 2023"},{"id":"http://arxiv.org/abs/2311.15570v1","updated":"2023-11-27T06:38:07Z","published":"2023-11-27T06:38:07Z","title":"UFDA: Universal Federated Domain Adaptation with Practical Assumptions","summary":" Conventional Federated Domain Adaptation (FDA) approaches usually demand an\nabundance of assumptions, such as label set consistency, which makes them\nsignificantly less feasible for real-world situations and introduces security\nhazards. In this work, we propose a more practical scenario named Universal\nFederated Domain Adaptation (UFDA). It only requires the black-box model and\nthe label set information of each source domain, while the label sets of\ndifferent source domains could be inconsistent and the target-domain label set\nis totally blind. This relaxes the assumptions made by FDA, which are often\nchallenging to meet in real-world cases and diminish model security. To address\nthe UFDA scenario, we propose a corresponding framework called Hot-Learning\nwith Contrastive Label Disambiguation (HCLD), which tackles UFDA's domain\nshifts and category gaps problem by using one-hot outputs from the black-box\nmodels of various source domains. Moreover, to better distinguish the shared\nand unknown classes, we further present a cluster-level strategy named\nMutual-Voting Decision (MVD) to extract robust consensus knowledge across peer\nclasses from both source and target domains. The extensive experiments on three\nbenchmarks demonstrate that our HCLD achieves comparable performance for our\nUFDA scenario with much fewer assumptions, compared to the previous\nmethodologies with many additional assumptions.\n","authors":["Xinhui Liu","Zhenghao Chen","Luping Zhou","Dong Xu","Wei Xi","Gairui Bai","Yihan Zhao","Jizhong Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.15570v1.pdf","comment":"Submitted to AAAI2024"},{"id":"http://arxiv.org/abs/2311.15569v1","updated":"2023-11-27T06:37:05Z","published":"2023-11-27T06:37:05Z","title":"Improving Adaptability and Generalizability of Efficient Transfer\n Learning for Vision-Language Models","summary":" Vision-Language Models (VLMs) like CLIP have demonstrated remarkable\napplicability across a variety of downstream tasks, including zero-shot image\nclassification. Recently, the use of prompts or adapters for efficient transfer\nlearning has gained significant attention for effectively adapting to\ndownstream tasks. However, the roles of vision and text prompts, as well as\nadapters in terms of generalization and transfer difficulty, have been\noverlooked, limiting performance on unseen tasks. In this paper, we empirically\nanalyze how VLMs behave when using vision and text prompts, adapters, and a\ncombination of these components, marking a novel exploration by our study. Our\nobservations find that utilizing vision prompts for class separability and text\nadapters for task adaptation is crucial for adaptability and generalizability.\nMoreover, to improve generalization across every domain, we propose an adaptive\nensemble method that effectively combines the general knowledge of VLMs with\ntask-specific knowledge according to transfer difficulty. 
Upon experimenting\nwith extensive benchmarks, our method consistently outperforms all baselines,\nparticularly on unseen tasks, demonstrating the effectiveness of our proposed\napproach.\n","authors":["Yongjin Yang","Jongwoo Ko","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2311.15569v1.pdf","comment":"11 pages (19 pages including supplementary), 10 figures (12 figures\n including supplementary), 6 tables (17 tables including supplementary)"},{"id":"http://arxiv.org/abs/2310.17294v3","updated":"2023-11-27T06:21:03Z","published":"2023-10-26T10:18:51Z","title":"Scale-Adaptive Feature Aggregation for Efficient Space-Time Video\n Super-Resolution","summary":" The Space-Time Video Super-Resolution (STVSR) task aims to enhance the visual\nquality of videos, by simultaneously performing video frame interpolation (VFI)\nand video super-resolution (VSR). However, facing the challenge of the\nadditional temporal dimension and scale inconsistency, most existing STVSR\nmethods are complex and inflexible in dynamically modeling different motion\namplitudes. In this work, we find that choosing an appropriate processing scale\nachieves remarkable benefits in flow-based feature propagation. We propose a\nnovel Scale-Adaptive Feature Aggregation (SAFA) network that adaptively selects\nsub-networks with different processing scales for individual samples.\nExperiments on four public STVSR benchmarks demonstrate that SAFA achieves\nstate-of-the-art performance. Our SAFA network outperforms recent\nstate-of-the-art methods such as TMNet and VideoINR by an average improvement\nof over 0.5dB on PSNR, while requiring less than half the number of parameters\nand only 1/3 the computational cost.\n","authors":["Zhewei Huang","Ailin Huang","Xiaotao Hu","Chen Hu","Jun Xu","Shuchang Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.17294v3.pdf","comment":"WACV2024, 16 pages"},{"id":"http://arxiv.org/abs/2311.15562v1","updated":"2023-11-27T06:19:00Z","published":"2023-11-27T06:19:00Z","title":"Fully Authentic Visual Question Answering Dataset from Online\n Communities","summary":" Visual Question Answering (VQA) entails answering questions about images. We\nintroduce the first VQA dataset in which all contents originate from an\nauthentic use case. Sourced from online question answering community forums, we\ncall it VQAonline. We then characterize our dataset and how it relates to eight\nother VQA datasets. Observing that answers in our dataset tend to be much\nlonger (e.g., with a mean of 173 words) and thus incompatible with standard VQA\nevaluation metrics, we next analyze which of the six popular metrics for longer\ntext evaluation align best with human judgments. We then use the best-suited\nmetrics to evaluate six state-of-the-art vision and language foundation models\non VQAonline and reveal where they struggle most. We will release the dataset\nsoon to facilitate future extensions.\n","authors":["Chongyan Chen","Mengchen Liu","Noel Codella","Yunsheng Li","Lu Yuan","Danna Gurari"],"pdf_url":"https://arxiv.org/pdf/2311.15562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15561v1","updated":"2023-11-27T06:14:23Z","published":"2023-11-27T06:14:23Z","title":"ET3D: Efficient Text-to-3D Generation via Multi-View Distillation","summary":" Recent breakthroughs in text-to-image generation have shown encouraging\nresults via large generative models. Due to the scarcity of 3D assets, it is\nhard to transfer the success of text-to-image generation to that of\ntext-to-3D generation.
Existing text-to-3D generation methods usually adopt the\nparadigm of DreamFusion, which conducts per-asset optimization by distilling a\npretrained text-to-image diffusion model. The generation speed usually ranges\nfrom several minutes to tens of minutes per 3D asset, which degrades the user\nexperience and also imposes a burden to the service providers due to the high\ncomputational budget.\n In this work, we present an efficient text-to-3D generation method, which\nrequires only around 8 $ms$ to generate a 3D asset given the text prompt on a\nconsumer graphic card. The main insight is that we exploit the images generated\nby a large pre-trained text-to-image diffusion model, to supervise the training\nof a text conditioned 3D generative adversarial network. Once the network is\ntrained, we are able to efficiently generate a 3D asset via a single forward\npass. Our method requires no 3D training data and provides an alternative\napproach for efficient text-to-3D generation by distilling pre-trained image\ndiffusion models.\n","authors":["Yiming Chen","Zhiqi Li","Peidong Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12386v2","updated":"2023-11-27T05:58:45Z","published":"2023-11-21T06:55:21Z","title":"Point, Segment and Count: A Generalized Framework for Object Counting","summary":" Class-agnostic object counting aims to count all objects in an image with\nrespect to example boxes or class names, \\emph{a.k.a} few-shot and zero-shot\ncounting. Current state-of-the-art methods highly rely on density maps to\npredict object counts, which lacks model interpretability. In this paper, we\npropose a generalized framework for both few-shot and zero-shot object counting\nbased on detection. Our framework combines the superior advantages of two\nfoundation models without compromising their zero-shot capability: (\\textbf{i})\nSAM to segment all possible objects as mask proposals, and (\\textbf{ii}) CLIP\nto classify proposals to obtain accurate object counts. However, this strategy\nmeets the obstacles of efficiency overhead and the small crowded objects that\ncannot be localized and distinguished. To address these issues, our framework,\ntermed PseCo, follows three steps: point, segment, and count. Specifically, we\nfirst propose a class-agnostic object localization to provide accurate but\nleast point prompts for SAM, which consequently not only reduces computation\ncosts but also avoids missing small objects. Furthermore, we propose a\ngeneralized object classification that leverages CLIP image/text embeddings as\nthe classifier, following a hierarchical knowledge distillation to obtain\ndiscriminative classifications among hierarchical mask proposals. Extensive\nexperimental results on FSC-147 dataset demonstrate that PseCo achieves\nstate-of-the-art performance in both few-shot/zero-shot object\ncounting/detection, with additional results on large-scale COCO and LVIS\ndatasets. 
The source code is available at\n\\url{https://github.com/Hzzone/PseCo}.\n","authors":["Zhizhong Huang","Mingliang Dai","Yi Zhang","Junping Zhang","Hongming Shan"],"pdf_url":"https://arxiv.org/pdf/2311.12386v2.pdf","comment":"Fix typos"},{"id":"http://arxiv.org/abs/2311.15556v1","updated":"2023-11-27T05:53:03Z","published":"2023-11-27T05:53:03Z","title":"PKU-I2IQA: An Image-to-Image Quality Assessment Database for AI\n Generated Images","summary":" With the development of image generation technology, AI-based image\ngeneration has been applied in various fields. However, the development of AIGC\nimage generative models also brings new problems and challenges. A significant\nchallenge is that AI-generated images (AIGI) compared to natural images may\nhave some unique distortions, and not all generated images meet the\nrequirements of the real world, so it is of great significance to evaluate\nAI-generated images more comprehensively. Although previous work has\nestablished some human perception-based AIGC image quality assessment databases\nfor text-generated images, the AI image generation technology includes\nscenarios like text-to-image and image-to-image, and assessing only the images\ngenerated by text-to-image models is insufficient. To address this issue, we\nhave established a human perception-based image-to-image AIGC image quality\nassessment database, named PKU-I2IQA. We conducted a comprehensive analysis of\nthe PKU-I2IQA database. Furthermore, we introduced two benchmark models:\nNR-AIGCIQA based on no-reference image quality assessment and FR-AIGCIQA based\non full-reference image quality assessment.Finally, leveraging this database,\nwe conducted benchmark experiments and compared the performance of the proposed\nbenchmark models. The PKU-I2IQA database and benchmarks will be released to\nfacilitate future research on https://github.com/jiquan123/I2IQA.\n Keywords: AIGC, image-to-image generation, image quality assessment,\nNR-AIGCIQA, FR-AIGCIQA\n","authors":["Jiquan Yuan","Xinyan Cao","Changjin Li","Fanyi Yang","Jinlong Lin","Xixin Cao"],"pdf_url":"https://arxiv.org/pdf/2311.15556v1.pdf","comment":"18 pages"},{"id":"http://arxiv.org/abs/2311.15551v1","updated":"2023-11-27T05:35:49Z","published":"2023-11-27T05:35:49Z","title":"Instruct2Attack: Language-Guided Semantic Adversarial Attacks","summary":" We propose Instruct2Attack (I2A), a language-guided semantic attack that\ngenerates semantically meaningful perturbations according to free-form language\ninstructions. We make use of state-of-the-art latent diffusion models, where we\nadversarially guide the reverse diffusion process to search for an adversarial\nlatent code conditioned on the input image and text instruction. Compared to\nexisting noise-based and semantic attacks, I2A generates more natural and\ndiverse adversarial examples while providing better controllability and\ninterpretability. We further automate the attack process with GPT-4 to generate\ndiverse image-specific text instructions. 
We show that I2A can successfully\nbreak state-of-the-art deep neural networks even under strong adversarial\ndefenses, and demonstrate great transferability among a variety of network\narchitectures.\n","authors":["Jiang Liu","Chen Wei","Yuxiang Guo","Heng Yu","Alan Yuille","Soheil Feizi","Chun Pong Lau","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2311.15551v1.pdf","comment":"under submission, code coming soon"},{"id":"http://arxiv.org/abs/2311.15547v1","updated":"2023-11-27T05:23:01Z","published":"2023-11-27T05:23:01Z","title":"Dataset Distillation in Latent Space","summary":" Dataset distillation (DD) is a newly emerging research area aiming at\nalleviating the heavy computational load in training models on large datasets.\nIt tries to distill a large dataset into a small and condensed one so that\nmodels trained on the distilled dataset can perform comparably with those\ntrained on the full dataset when performing downstream tasks. Among the\nprevious works in this area, there are three key problems that hinder the\nperformance and availability of the existing DD methods: high time complexity,\nhigh space complexity, and low info-compactness. In this work, we\nsimultaneously attempt to settle these three problems by moving the DD\nprocesses from conventionally used pixel space to latent space. Encoded by a\npretrained generic autoencoder, latent codes in the latent space are naturally\ninfo-compact representations of the original images in much smaller sizes.\nAfter transferring three mainstream DD algorithms to latent space, we\nsignificantly reduce time and space consumption while achieving similar\nperformance, allowing us to distill high-resolution datasets or target at\ngreater data ratio that previous methods have failed. Besides, within the same\nstorage budget, we can also quantitatively deliver more latent codes than\npixel-level images, which further boosts the performance of our methods.\n","authors":["Yuxuan Duan","Jianfu Zhang","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15547v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2311.15543v1","updated":"2023-11-27T05:20:11Z","published":"2023-11-27T05:20:11Z","title":"Beyond Pixels: Exploring Human-Readable SVG Generation for Simple Images\n with Vision Language Models","summary":" In the field of computer graphics, the use of vector graphics, particularly\nScalable Vector Graphics (SVG), represents a notable development from\ntraditional pixel-based imagery. SVGs, with their XML-based format, are\ndistinct in their ability to directly and explicitly represent visual elements\nsuch as shape, color, and path. This direct representation facilitates a more\naccurate and logical depiction of graphical elements, enhancing reasoning and\ninterpretability. Recognizing the potential of SVGs, the machine learning\ncommunity has introduced multiple methods for image vectorization. However,\ntransforming images into SVG format while retaining the relational properties\nand context of the original scene remains a key challenge. Most vectorization\nmethods often yield SVGs that are overly complex and not easily interpretable.\nIn response to this challenge, we introduce our method, Simple-SVG-Generation\n(S\\textsuperscript{2}VG\\textsuperscript{2}). Our method focuses on producing\nSVGs that are both accurate and simple, aligning with human readability and\nunderstanding. 
With simple images, we evaluate our method with reasoning tasks\ntogether with advanced language models, the results show a clear improvement\nover previous SVG generation methods. We also conducted surveys for human\nevaluation on the readability of our generated SVGs, the results also favor our\nmethods.\n","authors":["Tong Zhang","Haoyang Liu","Peiyan Zhang","Yuxuan Cheng","Haohan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15543v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.15540v1","updated":"2023-11-27T05:10:15Z","published":"2023-11-27T05:10:15Z","title":"EAFP-Med: An Efficient Adaptive Feature Processing Module Based on\n Prompts for Medical Image Detection","summary":" In the face of rapid advances in medical imaging, cross-domain adaptive\nmedical image detection is challenging due to the differences in lesion\nrepresentations across various medical imaging technologies. To address this\nissue, we draw inspiration from large language models to propose EAFP-Med, an\nefficient adaptive feature processing module based on prompts for medical image\ndetection. EAFP-Med can efficiently extract lesion features of different scales\nfrom a diverse range of medical images based on prompts while being flexible\nand not limited by specific imaging techniques. Furthermore, it serves as a\nfeature preprocessing module that can be connected to any model front-end to\nenhance the lesion features in input images. Moreover, we propose a novel\nadaptive disease detection model named EAFP-Med ST, which utilizes the Swin\nTransformer V2 - Tiny (SwinV2-T) as its backbone and connects it to EAFP-Med.\nWe have compared our method to nine state-of-the-art methods. Experimental\nresults demonstrate that EAFP-Med ST achieves the best performance on all three\ndatasets (chest X-ray images, cranial magnetic resonance imaging images, and\nskin images). EAFP-Med can efficiently extract lesion features from various\nmedical images based on prompts, enhancing the model's performance. This holds\nsignificant potential for improving medical image analysis and diagnosis.\n","authors":["Xiang Li","Long Lan","Husam Lahza","Shaowu Yang","Shuihua Wang","Wenjing Yang","Hengzhu Liu","Yudong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.09777v2","updated":"2023-11-27T05:09:29Z","published":"2023-09-18T13:58:42Z","title":"DriveDreamer: Towards Real-world-driven World Models for Autonomous\n Driving","summary":" World models, especially in autonomous driving, are trending and drawing\nextensive attention due to their capacity for comprehending driving\nenvironments. The established world model holds immense potential for the\ngeneration of high-quality driving videos, and driving policies for safe\nmaneuvering. However, a critical limitation in relevant research lies in its\npredominant focus on gaming environments or simulated settings, thereby lacking\nthe representation of real-world driving scenarios. Therefore, we introduce\nDriveDreamer, a pioneering world model entirely derived from real-world driving\nscenarios. Regarding that modeling the world in intricate driving scenes\nentails an overwhelming search space, we propose harnessing the powerful\ndiffusion model to construct a comprehensive representation of the complex\nenvironment. Furthermore, we introduce a two-stage training pipeline. 
In the\ninitial phase, DriveDreamer acquires a deep understanding of structured traffic\nconstraints, while the subsequent stage equips it with the ability to\nanticipate future states. The proposed DriveDreamer is the first world model\nestablished from real-world driving scenarios. We instantiate DriveDreamer on\nthe challenging nuScenes benchmark, and extensive experiments verify that\nDriveDreamer empowers precise, controllable video generation that faithfully\ncaptures the structural constraints of real-world traffic scenarios.\nAdditionally, DriveDreamer enables the generation of realistic and reasonable\ndriving policies, opening avenues for interaction and practical applications.\n","authors":["Xiaofeng Wang","Zheng Zhu","Guan Huang","Xinze Chen","Jiagang Zhu","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2309.09777v2.pdf","comment":"Project Page: https://drivedreamer.github.io"},{"id":"http://arxiv.org/abs/2311.15537v1","updated":"2023-11-27T05:00:38Z","published":"2023-11-27T05:00:38Z","title":"SED: A Simple Encoder-Decoder for Open-Vocabulary Semantic Segmentation","summary":" Open-vocabulary semantic segmentation strives to distinguish pixels into\ndifferent semantic groups from an open set of categories. Most existing methods\nexplore utilizing pre-trained vision-language models, in which the key is to\nadopt the image-level model for pixel-level segmentation task. In this paper,\nwe propose a simple encoder-decoder, named SED, for open-vocabulary semantic\nsegmentation, which comprises a hierarchical encoder-based cost map generation\nand a gradual fusion decoder with category early rejection. The hierarchical\nencoder-based cost map generation employs hierarchical backbone, instead of\nplain transformer, to predict pixel-level image-text cost map. Compared to\nplain transformer, hierarchical backbone better captures local spatial\ninformation and has linear computational complexity with respect to input size.\nOur gradual fusion decoder employs a top-down structure to combine cost map and\nthe feature maps of different backbone levels for segmentation. To accelerate\ninference speed, we introduce a category early rejection scheme in the decoder\nthat rejects many no-existing categories at the early layer of decoder,\nresulting in at most 4.7 times acceleration without accuracy degradation.\nExperiments are performed on multiple open-vocabulary semantic segmentation\ndatasets, which demonstrates the efficacy of our SED method. When using\nConvNeXt-B, our SED method achieves mIoU score of 31.6\\% on ADE20K with 150\ncategories at 82 millisecond ($ms$) per image on a single A6000. We will\nrelease it at \\url{https://github.com/xb534/SED.git}.\n","authors":["Bin Xie","Jiale Cao","Jin Xie","Fahad Shahbaz Khan","Yanwei Pang"],"pdf_url":"https://arxiv.org/pdf/2311.15537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15536v1","updated":"2023-11-27T04:49:24Z","published":"2023-11-27T04:49:24Z","title":"SVRDA: A Web-based Dataset Annotation Tool for Slice-to-Volume\n Registration","summary":" Background and Objective: The lack of benchmark datasets has impeded the\ndevelopment of slice-to-volume registration algorithms. Such datasets are\ndifficult to annotate, primarily due to the dimensional difference within data\nand the dearth of task-specific software. 
We aim to develop a user-friendly\ntool to streamline dataset annotation for slice-to-volume registration.\n Methods: The proposed tool, named SVRDA, is an installation-free web\napplication for platform-agnostic collaborative dataset annotation. It enables\nefficient transformation manipulation via keyboard shortcuts and smooth case\ntransitions with auto-saving. SVRDA supports configuration-based data loading\nand adheres to the separation of concerns, offering great flexibility and\nextensibility for future research. Various supplementary features have been\nimplemented to facilitate slice-to-volume registration.\n Results: We validated the effectiveness of SVRDA by indirectly evaluating the\npost-registration segmentation quality on UK Biobank data, observing a dramatic\noverall improvement (24.02% in the Dice Similarity Coefficient and 48.93% in\nthe 95th percentile Hausdorff distance, respectively) supported by highly\nstatistically significant evidence ($p<0.001$).We further showcased the\nclinical usage of SVRDA by integrating it into test-retest T1 quantification on\nin-house magnetic resonance images, leading to more consistent results after\nregistration.\n Conclusions: SVRDA can facilitate collaborative annotation of benchmark\ndatasets while being potentially applicable to other pipelines incorporating\nslice-to-volume registration. Full source code and documentation are available\nat https://github.com/Roldbach/SVRDA\n","authors":["Weixun Luo","Alexandre Triay Bagur","Paul Aljabar","George Ralli","Sir Michael Brady"],"pdf_url":"https://arxiv.org/pdf/2311.15536v1.pdf","comment":"18 pages, 11 figures, In submission to Computer Methods and Programs\n in Biomedicine"},{"id":"http://arxiv.org/abs/2310.01852v6","updated":"2023-11-27T04:28:58Z","published":"2023-10-03T07:33:27Z","title":"LanguageBind: Extending Video-Language Pretraining to N-modality by\n Language-based Semantic Alignment","summary":" The video-language (VL) pretraining has achieved remarkable improvement in\nmultiple downstream tasks. However, the current VL pretraining framework is\nhard to extend to multiple modalities (N modalities, N>=3) beyond vision and\nlanguage. We thus propose LanguageBind, taking the language as the bind across\ndifferent modalities because the language modality is well-explored and\ncontains rich semantics. Specifically, we freeze the language encoder acquired\nby VL pretraining, then train encoders for other modalities with contrastive\nlearning. As a result, all modalities are mapped to a shared feature space,\nimplementing multi-modal semantic alignment. While LanguageBind ensures that we\ncan extend VL modalities to N modalities, we also need a high-quality dataset\nwith alignment data pairs centered on language. We thus propose VIDAL-10M with\nVideo, Infrared, Depth, Audio and their corresponding Language, naming as\nVIDAL-10M. In our VIDAL-10M, all videos are from short video platforms with\ncomplete semantics rather than truncated segments from long videos, and all the\nvideo, depth, infrared, and audio modalities are aligned to their textual\ndescriptions. After pretraining on VIDAL-10M, we outperform ImageBind by 5.8%\nR@1 on the MSR-VTT dataset with only 15% of the parameters in the zero-shot\nvideo-text retrieval task. Beyond this, our LanguageBind has greatly improved\nin the zero-shot video, audio, depth, and infrared understanding tasks. For\ninstance, LanguageBind surpassing InterVideo by 1.9% on MSR-VTT, 8.8% on MSVD,\n6.3% on DiDeMo, and 4.4% on ActivityNet. 
On the LLVIP and NYU-D datasets,\nLanguageBind outperforms ImageBind with 23.8% and 11.1% top-1 accuracy. Code\naddress: https://github.com/PKU-YuanGroup/LanguageBind.\n","authors":["Bin Zhu","Bin Lin","Munan Ning","Yang Yan","Jiaxi Cui","HongFa Wang","Yatian Pang","Wenhao Jiang","Junwu Zhang","Zongwei Li","Wancai Zhang","Zhifeng Li","Wei Liu","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2310.01852v6.pdf","comment":"Under review as a conference paper at ICLR 2024"},{"id":"http://arxiv.org/abs/2310.18332v2","updated":"2023-11-27T04:22:54Z","published":"2023-10-20T12:44:44Z","title":"WordArt Designer: User-Driven Artistic Typography Synthesis using Large\n Language Models","summary":" This paper introduces WordArt Designer, a user-driven framework for artistic\ntypography synthesis, relying on the Large Language Model (LLM). The system\nincorporates four key modules: the LLM Engine, SemTypo, StyTypo, and TexTypo\nmodules. 1) The LLM Engine, empowered by the LLM (e.g., GPT-3.5), interprets\nuser inputs and generates actionable prompts for the other modules, thereby\ntransforming abstract concepts into tangible designs. 2) The SemTypo module\noptimizes font designs using semantic concepts, striking a balance between\nartistic transformation and readability. 3) Building on the semantic layout\nprovided by the SemTypo module, the StyTypo module creates smooth, refined\nimages. 4) The TexTypo module further enhances the design's aesthetics through\ntexture rendering, enabling the generation of inventive textured fonts.\nNotably, WordArt Designer highlights the fusion of generative AI with artistic\ntypography. Experience its capabilities on ModelScope:\nhttps://www.modelscope.cn/studios/WordArt/WordArt.\n","authors":["Jun-Yan He","Zhi-Qi Cheng","Chenyang Li","Jingdong Sun","Wangmeng Xiang","Xianhui Lin","Xiaoyang Kang","Zengke Jin","Yusen Hu","Bin Luo","Yifeng Geng","Xuansong Xie","Jingren Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.18332v2.pdf","comment":"Accepted by EMNLP 2023, 10 pages, 11 figures, 1 table, the system is\n at https://www.modelscope.cn/studios/WordArt/WordArt"},{"id":"http://arxiv.org/abs/2311.15529v1","updated":"2023-11-27T04:22:48Z","published":"2023-11-27T04:22:48Z","title":"Efficient Dataset Distillation via Minimax Diffusion","summary":" Dataset distillation reduces the storage and computational consumption of\ntraining a network by generating a small surrogate dataset that encapsulates\nrich information of the original large-scale one. However, previous\ndistillation methods heavily rely on the sample-wise iterative optimization\nscheme. As the images-per-class (IPC) setting or image resolution grows larger,\nthe necessary computation will demand overwhelming time and resources. In this\nwork, we intend to incorporate generative diffusion techniques for computing\nthe surrogate dataset. Observing that key factors for constructing an effective\nsurrogate dataset are representativeness and diversity, we design additional\nminimax criteria in the generative training to enhance these facets for the\ngenerated images of diffusion models. We present a theoretical model of the\nprocess as hierarchical diffusion control demonstrating the flexibility of the\ndiffusion process to target these criteria without jeopardizing the\nfaithfulness of the sample to the desired distribution. The proposed method\nachieves state-of-the-art validation performance while demanding much less\ncomputational resources. 
Under the 100-IPC setting on ImageWoof, our method\nrequires less than one-twentieth the distillation time of previous methods, yet\nyields even better performance. Source code available in\nhttps://github.com/vimar-gu/MinimaxDiffusion.\n","authors":["Jianyang Gu","Saeed Vahidian","Vyacheslav Kungurtsev","Haonan Wang","Wei Jiang","Yang You","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2311.15529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.12532v2","updated":"2023-11-27T03:33:37Z","published":"2023-08-24T03:43:02Z","title":"FedSoL: Bridging Global Alignment and Local Generality in Federated\n Learning","summary":" Federated Learning (FL) aggregates locally trained models from individual\nclients to construct a global model. While FL enables learning a model with\ndata privacy, it often suffers from significant performance degradation when\nclient data distributions are heterogeneous. Many previous FL algorithms have\naddressed this issue by introducing various proximal restrictions. These\nrestrictions aim to encourage global alignment by constraining the deviation of\nlocal learning from the global objective. However, they inherently limit local\nlearning by interfering with the original local objectives. Recently, an\nalternative approach has emerged to improve local learning generality. By\nobtaining local models within a smooth loss landscape, this approach mitigates\nconflicts among different local objectives of the clients. Yet, it does not\nensure stable global alignment, as local learning does not take the global\nobjective into account. In this study, we propose Federated Stability on\nLearning (FedSoL), which combines both the concepts of global alignment and\nlocal generality. In FedSoL, the local learning seeks a parameter region robust\nagainst proximal perturbations. This strategy introduces an implicit proximal\nrestriction effect in local learning while maintaining the original local\nobjective for parameter update. Our experiments show that FedSoL consistently\nachieves state-of-the-art performance on various setups.\n","authors":["Gihun Lee","Minchan Jeong","Sangmook Kim","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2308.12532v2.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2310.07206v2","updated":"2023-11-27T03:32:21Z","published":"2023-10-11T05:34:36Z","title":"DeepSimHO: Stable Pose Estimation for Hand-Object Interaction via\n Physics Simulation","summary":" This paper addresses the task of 3D pose estimation for a hand interacting\nwith an object from a single image observation. When modeling hand-object\ninteraction, previous works mainly exploit proximity cues, while overlooking\nthe dynamical nature that the hand must stably grasp the object to counteract\ngravity and thus preventing the object from slipping or falling. These works\nfail to leverage dynamical constraints in the estimation and consequently often\nproduce unstable results. Meanwhile, refining unstable configurations with\nphysics-based reasoning remains challenging, both by the complexity of contact\ndynamics and by the lack of effective and efficient physics inference in the\ndata-driven learning framework. To address both issues, we present DeepSimHO: a\nnovel deep-learning pipeline that combines forward physics simulation and\nbackward gradient approximation with a neural network. Specifically, for an\ninitial hand-object pose estimated by a base network, we forward it to a\nphysics simulator to evaluate its stability. 
However, due to non-smooth contact\ngeometry and penetration, existing differentiable simulators can not provide\nreliable state gradient. To remedy this, we further introduce a deep network to\nlearn the stability evaluation process from the simulator, while smoothly\napproximating its gradient and thus enabling effective back-propagation.\nExtensive experiments show that our method noticeably improves the stability of\nthe estimation and achieves superior efficiency over test-time optimization.\nThe code is available at https://github.com/rongakowang/DeepSimHO.\n","authors":["Rong Wang","Wei Mao","Hongdong Li"],"pdf_url":"https://arxiv.org/pdf/2310.07206v2.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.15512v1","updated":"2023-11-27T03:15:48Z","published":"2023-11-27T03:15:48Z","title":"Sparse Pedestrian Character Learning for Trajectory Prediction","summary":" Pedestrian trajectory prediction in a first-person view has recently\nattracted much attention due to its importance in autonomous driving. Recent\nwork utilizes pedestrian character information, \\textit{i.e.}, action and\nappearance, to improve the learned trajectory embedding and achieves\nstate-of-the-art performance. However, it neglects the invalid and negative\npedestrian character information, which is harmful to trajectory representation\nand thus leads to performance degradation. To address this issue, we present a\ntwo-stream sparse-character-based network~(TSNet) for pedestrian trajectory\nprediction. Specifically, TSNet learns the negative-removed characters in the\nsparse character representation stream to improve the trajectory embedding\nobtained in the trajectory representation stream. Moreover, to model the\nnegative-removed characters, we propose a novel sparse character graph,\nincluding the sparse category and sparse temporal character graphs, to learn\nthe different effects of various characters in category and temporal\ndimensions, respectively. Extensive experiments on two first-person view\ndatasets, PIE and JAAD, show that our method outperforms existing\nstate-of-the-art methods. In addition, ablation studies demonstrate different\neffects of various characters and prove that TSNet outperforms approaches\nwithout eliminating negative characters.\n","authors":["Yonghao Dong","Le Wang","Sanpin Zhou","Gang Hua","Changyin Sun"],"pdf_url":"https://arxiv.org/pdf/2311.15512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15510v1","updated":"2023-11-27T03:09:58Z","published":"2023-11-27T03:09:58Z","title":"CaesarNeRF: Calibrated Semantic Representation for Few-shot\n Generalizable Neural Rendering","summary":" Generalizability and few-shot learning are key challenges in Neural Radiance\nFields (NeRF), often due to the lack of a holistic understanding in pixel-level\nrendering. We introduce CaesarNeRF, an end-to-end approach that leverages\nscene-level CAlibratEd SemAntic Representation along with pixel-level\nrepresentations to advance few-shot, generalizable neural rendering,\nfacilitating a holistic understanding without compromising high-quality\ndetails. CaesarNeRF explicitly models pose differences of reference views to\ncombine scene-level semantic representations, providing a calibrated holistic\nunderstanding. This calibration process aligns various viewpoints with precise\nlocation and is further enhanced by sequential refinement to capture varying\ndetails. 
Extensive experiments on public datasets, including LLFF, Shiny,\nmip-NeRF 360, and MVImgNet, show that CaesarNeRF delivers state-of-the-art\nperformance across varying numbers of reference views, proving effective even\nwith a single reference image. The project page of this work can be found at\nhttps://haidongz-usc.github.io/project/caesarnerf.\n","authors":["Haidong Zhu","Tianyu Ding","Tianyi Chen","Ilya Zharkov","Ram Nevatia","Luming Liang"],"pdf_url":"https://arxiv.org/pdf/2311.15510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11810v2","updated":"2023-11-27T02:53:33Z","published":"2023-11-20T14:42:25Z","title":"DocPedia: Unleashing the Power of Large Multimodal Model in the\n Frequency Domain for Versatile Document Understanding","summary":" This work presents DocPedia, a novel large multimodal model (LMM) for\nversatile OCR-free document understanding, capable of parsing images up to\n2,560$\\times$2,560 resolution. Unlike existing work either struggle with\nhigh-resolution documents or give up the large language model thus vision or\nlanguage ability constrained, our DocPedia directly processes visual input in\nthe frequency domain rather than the pixel space. The unique characteristic\nenables DocPedia to capture a greater amount of visual and textual information\nusing a limited number of visual tokens. To consistently enhance both\nperception and comprehension abilities of our model, we develop a dual-stage\ntraining strategy and enrich instructions/annotations of all training tasks\ncovering multiple document types. Extensive quantitative and qualitative\nexperiments conducted on various publicly available benchmarks confirm the\nmutual benefits of jointly learning perception and comprehension tasks. The\nresults provide further evidence of the effectiveness and superior performance\nof our DocPedia over other methods.\n","authors":["Hao Feng","Qi Liu","Hao Liu","Wengang Zhou","Houqiang Li","Can Huang"],"pdf_url":"https://arxiv.org/pdf/2311.11810v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15497v1","updated":"2023-11-27T02:48:06Z","published":"2023-11-27T02:48:06Z","title":"Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning\n and Optimization Functions for Enhanced Precision","summary":" Image registration has traditionally been done using two distinct approaches:\nlearning based methods, relying on robust deep neural networks, and\noptimization-based methods, applying complex mathematical transformations to\nwarp images accordingly. Of course, both paradigms offer advantages and\ndisadvantages, and, in this work, we seek to combine their respective strengths\ninto a single streamlined framework, using the outputs of the learning based\nmethod as initial parameters for optimization while prioritizing computational\npower for the image pairs that offer the greatest loss. 
Our investigations\nshowed that an improvement of 0.3\\% in testing when utilizing the best\nperforming state-of-the-art model as the backbone of the framework, while\nmaintaining the same inference time and with only a 0.8\\% loss in deformation\nfield smoothness.\n","authors":["Gabriel De Araujo","Shanlin Sun","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2311.15497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13404v2","updated":"2023-11-27T02:33:36Z","published":"2023-11-22T14:00:23Z","title":"Animatable 3D Gaussians for High-fidelity Synthesis of Human Motions","summary":" We present a novel animatable 3D Gaussian model for rendering high-fidelity\nfree-view human motions in real time. Compared to existing NeRF-based methods,\nthe model owns better capability in synthesizing high-frequency details without\nthe jittering problem across video frames. The core of our model is a novel\naugmented 3D Gaussian representation, which attaches each Gaussian with a\nlearnable code. The learnable code serves as a pose-dependent appearance\nembedding for refining the erroneous appearance caused by geometric\ntransformation of Gaussians, based on which an appearance refinement model is\nlearned to produce residual Gaussian properties to match the appearance in\ntarget pose. To force the Gaussians to learn the foreground human only without\nbackground interference, we further design a novel alpha loss to explicitly\nconstrain the Gaussians within the human body. We also propose to jointly\noptimize the human joint parameters to improve the appearance accuracy. The\nanimatable 3D Gaussian model can be learned with shallow MLPs, so new human\nmotions can be synthesized in real time (66 fps on avarage). Experiments show\nthat our model has superior performance over NeRF-based methods.\n","authors":["Keyang Ye","Tianjia Shao","Kun Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.13404v2.pdf","comment":"Some experiment data is wrong. The expression of the paper in\n introduction and abstract is incorrect. Some graphs have inappropriate\n descriptions"},{"id":"http://arxiv.org/abs/2311.15478v1","updated":"2023-11-27T01:41:25Z","published":"2023-11-27T01:41:25Z","title":"AerialBooth: Mutual Information Guidance for Text Controlled Aerial View\n Synthesis from a Single Image","summary":" We present a novel method, AerialBooth, for synthesizing the aerial view from\na single input image using its text description. We leverage the pretrained\ntext-to-2D image stable diffusion model as prior knowledge of the 3D world. The\nmodel is finetuned in two steps to optimize for the text embedding and the UNet\nthat reconstruct the input image and its inverse perspective mapping\nrespectively. The inverse perspective mapping creates variance within the\ntext-image space of the diffusion model, while providing weak guidance for\naerial view synthesis. At inference, we steer the contents of the generated\nimage towards the input image using novel mutual information guidance that\nmaximizes the information content between the probability distributions of the\ntwo images. We evaluate our approach on a wide spectrum of real and synthetic\ndata, including natural scenes, indoor scenes, human action, etc. Through\nextensive experiments and ablation studies, we demonstrate the effectiveness of\nAerialBooth and also its generalizability to other text-controlled views. 
We\nalso show that AerialBooth achieves the best viewpoint-fidelity trade-off\nthough quantitative evaluation on 7 metrics analyzing viewpoint and fidelity\nw.r.t. input image. Code and data is available at\nhttps://github.com/divyakraman/AerialBooth2023.\n","authors":["Divya Kothandaraman","Tianyi Zhou","Ming Lin","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2311.15478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15477v1","updated":"2023-11-27T01:24:31Z","published":"2023-11-27T01:24:31Z","title":"DreamCreature: Crafting Photorealistic Virtual Creatures from\n Imagination","summary":" Recent text-to-image (T2I) generative models allow for high-quality synthesis\nfollowing either text instructions or visual examples. Despite their\ncapabilities, these models face limitations in creating new, detailed creatures\nwithin specific categories (e.g., virtual dog or bird species), which are\nvaluable in digital asset creation and biodiversity analysis. To bridge this\ngap, we introduce a novel task, Virtual Creatures Generation: Given a set of\nunlabeled images of the target concepts (e.g., 200 bird species), we aim to\ntrain a T2I model capable of creating new, hybrid concepts within diverse\nbackgrounds and contexts. We propose a new method called DreamCreature, which\nidentifies and extracts the underlying sub-concepts (e.g., body parts of a\nspecific species) in an unsupervised manner. The T2I thus adapts to generate\nnovel concepts (e.g., new bird species) with faithful structures and\nphotorealistic appearance by seamlessly and flexibly composing learned\nsub-concepts. To enhance sub-concept fidelity and disentanglement, we extend\nthe textual inversion technique by incorporating an additional projector and\ntailored attention loss regularization. Extensive experiments on two\nfine-grained image benchmarks demonstrate the superiority of DreamCreature over\nprior methods in both qualitative and quantitative evaluation. Ultimately, the\nlearned sub-concepts facilitate diverse creative applications, including\ninnovative consumer product designs and nuanced property modifications.\n","authors":["Kam Woh Ng","Xiatian Zhu","Yi-Zhe Song","Tao Xiang"],"pdf_url":"https://arxiv.org/pdf/2311.15477v1.pdf","comment":"Website: https://github.com/kamwoh/dreamcreature"},{"id":"http://arxiv.org/abs/2311.15475v1","updated":"2023-11-27T01:20:11Z","published":"2023-11-27T01:20:11Z","title":"MeshGPT: Generating Triangle Meshes with Decoder-Only Transformers","summary":" We introduce MeshGPT, a new approach for generating triangle meshes that\nreflects the compactness typical of artist-created meshes, in contrast to dense\ntriangle meshes extracted by iso-surfacing methods from neural fields. Inspired\nby recent advances in powerful large language models, we adopt a sequence-based\napproach to autoregressively generate triangle meshes as sequences of\ntriangles. We first learn a vocabulary of latent quantized embeddings, using\ngraph convolutions, which inform these embeddings of the local mesh geometry\nand topology. These embeddings are sequenced and decoded into triangles by a\ndecoder, ensuring that they can effectively reconstruct the mesh. A transformer\nis then trained on this learned vocabulary to predict the index of the next\nembedding given previous embeddings. 
Once trained, our model can be\nautoregressively sampled to generate new triangle meshes, directly generating\ncompact meshes with sharp edges, more closely imitating the efficient\ntriangulation patterns of human-crafted meshes. MeshGPT demonstrates a notable\nimprovement over state of the art mesh generation methods, with a 9% increase\nin shape coverage and a 30-point enhancement in FID scores across various\ncategories.\n","authors":["Yawar Siddiqui","Antonio Alliegro","Alexey Artemov","Tatiana Tommasi","Daniele Sirigatti","Vladislav Rosov","Angela Dai","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2311.15475v1.pdf","comment":"Project Page: https://nihalsid.github.io/mesh-gpt/, Video:\n https://youtu.be/UV90O1_69_o"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2311.16075v1","updated":"2023-11-27T18:46:17Z","published":"2023-11-27T18:46:17Z","title":"BioLORD-2023: Semantic Textual Representations Fusing LLM and Clinical\n Knowledge Graph Insights","summary":" In this study, we investigate the potential of Large Language Models to\ncomplement biomedical knowledge graphs in the training of semantic models for\nthe biomedical and clinical domains. Drawing on the wealth of the UMLS\nknowledge graph and harnessing cutting-edge Large Language Models, we propose a\nnew state-of-the-art approach for obtaining high-fidelity representations of\nbiomedical concepts and sentences, consisting of three steps: an improved\ncontrastive learning phase, a novel self-distillation phase, and a weight\naveraging phase. Through rigorous evaluations via the extensive BioLORD testing\nsuite and diverse downstream tasks, we demonstrate consistent and substantial\nperformance improvements over the previous state of the art (e.g. +2pts on\nMedSTS, +2.5pts on MedNLI-S, +6.1pts on EHR-Rel-B). Besides our new\nstate-of-the-art biomedical model for English, we also distill and release a\nmultilingual model compatible with 50+ languages and finetuned on 7 European\nlanguages. Many clinical pipelines can benefit from our latest models. Our new\nmultilingual model enables a range of languages to benefit from our\nadvancements in biomedical semantic representation learning, opening a new\navenue for bioinformatics researchers around the world. As a result, we hope to\nsee BioLORD-2023 becoming a precious tool for future biomedical applications.\n","authors":["François Remy","Kris Demuynck","Thomas Demeester"],"pdf_url":"https://arxiv.org/pdf/2311.16075v1.pdf","comment":"Preprint of upcoming journal article"},{"id":"http://arxiv.org/abs/2310.13540v3","updated":"2023-11-27T15:33:04Z","published":"2023-10-20T14:36:09Z","title":"Thoroughly Modeling Multi-domain Pre-trained Recommendation as Language","summary":" With the thriving of pre-trained language model (PLM) widely verified in\nvarious of NLP tasks, pioneer efforts attempt to explore the possible\ncooperation of the general textual information in PLM with the personalized\nbehavioral information in user historical behavior sequences to enhance\nsequential recommendation (SR). However, despite the commonalities of input\nformat and task goal, there are huge gaps between the behavioral and textual\ninformation, which obstruct thoroughly modeling SR as language modeling via\nPLM. To bridge the gap, we propose a novel Unified pre-trained language model\nenhanced sequential recommendation (UPSR), aiming to build a unified\npre-trained recommendation model for multi-domain recommendation tasks. 
We\nformally design five key indicators, namely naturalness, domain consistency,\ninformativeness, noise & ambiguity, and text length, to guide the text-item\nadaptation and behavior sequence-text sequence adaptation differently for\npre-training and fine-tuning stages, which are essential but under-explored by\nprevious works. In experiments, we conduct extensive evaluations on seven\ndatasets with both tuning and zero-shot settings and achieve the overall best\nperformance. Comprehensive model analyses also provide valuable insights for\nbehavior modeling via PLM, shedding light on large pre-trained recommendation\nmodels. The source codes will be released in the future.\n","authors":["Zekai Qu","Ruobing Xie","Chaojun Xiao","Yuan Yao","Zhiyuan Liu","Fengzong Lian","Zhanhui Kang","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2310.13540v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15923v1","updated":"2023-11-27T15:32:52Z","published":"2023-11-27T15:32:52Z","title":"SEINE: SEgment-based Indexing for NEural information retrieval","summary":" Many early neural Information Retrieval (NeurIR) methods are re-rankers that\nrely on a traditional first-stage retriever due to expensive query time\ncomputations. Recently, representation-based retrievers have gained much\nattention, which learns query representation and document representation\nseparately, making it possible to pre-compute document representations offline\nand reduce the workload at query time. Both dense and sparse\nrepresentation-based retrievers have been explored. However, these methods\nfocus on finding the representation that best represents a text (aka metric\nlearning) and the actual retrieval function that is responsible for similarity\nmatching between query and document is kept at a minimum by using dot product.\nOne drawback is that unlike traditional term-level inverted index, the index\nformed by these embeddings cannot be easily re-used by another retrieval\nmethod. Another drawback is that keeping the interaction at minimum hurts\nretrieval effectiveness. On the contrary, interaction-based retrievers are\nknown for their better retrieval effectiveness. In this paper, we propose a\nnovel SEgment-based Neural Indexing method, SEINE, which provides a general\nindexing framework that can flexibly support a variety of interaction-based\nneural retrieval methods. We emphasize on a careful decomposition of common\ncomponents in existing neural retrieval methods and propose to use\nsegment-level inverted index to store the atomic query-document interaction\nvalues. Experiments on LETOR MQ2007 and MQ2008 datasets show that our indexing\nmethod can accelerate multiple neural retrieval methods up to 28-times faster\nwithout sacrificing much effectiveness.\n","authors":["Sibo Dong","Justin Goldstein","Grace Hui Yang"],"pdf_url":"https://arxiv.org/pdf/2311.15923v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14084v2","updated":"2023-11-27T13:43:19Z","published":"2023-11-23T16:22:58Z","title":"AI-Generated Images Introduce Invisible Relevance Bias to Text-Image\n Retrieval","summary":" With the advancement of generation models, AI-generated content (AIGC) is\nbecoming more realistic, flooding the Internet. A recent study suggests that\nthis phenomenon has elevated the issue of source bias in text retrieval for web\nsearches. Specifically, neural retrieval models tend to rank generated texts\nhigher than human-written texts. In this paper, we extend the study of this\nbias to cross-modal retrieval. 
Firstly, we successfully construct a suitable\nbenchmark to explore the existence of the bias. Subsequent extensive\nexperiments on this benchmark reveal that AI-generated images introduce an\ninvisible relevance bias to text-image retrieval models. Specifically, our\nexperiments show that text-image retrieval models tend to rank the AI-generated\nimages higher than the real images, even though the AI-generated images do not\nexhibit more visually relevant features to the query than real images. This\ninvisible relevance bias is prevalent across retrieval models with varying\ntraining data and architectures. Furthermore, our subsequent exploration\nreveals that the inclusion of AI-generated images in the training data of the\nretrieval models exacerbates the invisible relevance bias. The above phenomenon\ntriggers a vicious cycle, which makes the invisible relevance bias become more\nand more serious. To elucidate the potential causes of invisible relevance and\naddress the aforementioned issues, we introduce an effective training method\naimed at alleviating the invisible relevance bias. Subsequently, we apply our\nproposed debiasing method to retroactively identify the causes of invisible\nrelevance, revealing that the AI-generated images induce the image encoder to\nembed additional information into their representation. This information\nexhibits a certain consistency across generated images with different semantics\nand can make the retriever estimate a higher relevance score.\n","authors":["Shicheng Xu","Danyang Hou","Liang Pang","Jingcheng Deng","Jun Xu","Huawei Shen","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2311.14084v2.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2311.15790v1","updated":"2023-11-27T13:04:33Z","published":"2023-11-27T13:04:33Z","title":"A Social-aware Gaussian Pre-trained Model for Effective Cold-start\n Recommendation","summary":" The use of pre-training is an emerging technique to enhance a neural model's\nperformance, which has been shown to be effective for many neural language\nmodels such as BERT. This technique has also been used to enhance the\nperformance of recommender systems. In such recommender systems, pre-training\nmodels are used to learn a better initialisation for both users and items.\nHowever, recent existing pre-trained recommender systems tend to only\nincorporate the user interaction data at the pre-training stage, making it\ndifficult to deliver good recommendations, especially when the interaction data\nis sparse. To alleviate this common data sparsity issue, we propose to\npre-train the recommendation model not only with the interaction data but also\nwith other available information such as the social relations among users,\nthereby providing the recommender system with a better initialisation compared\nwith solely relying on the user interaction data. We propose a novel\nrecommendation model, the Social-aware Gaussian Pre-trained model (SGP), which\nencodes the user social relations and interaction data at the pre-training\nstage in a Graph Neural Network (GNN). Afterwards, in the subsequent\nfine-tuning stage, our SGP model adopts a Gaussian Mixture Model (GMM) to\nfactorise these pre-trained embeddings for further training, thereby benefiting\nthe cold-start users from these pre-built social relations. Our extensive\nexperiments on three public datasets show that, in comparison to 16 competitive\nbaselines, our SGP model significantly outperforms the best baseline by upto\n7.7% in terms of NDCG@10. 
In addition, we show that SGP permits to effectively\nalleviate the cold-start problem, especially when users newly register to the\nsystem through their friends' suggestions.\n","authors":["Siwei Liu","Xi Wang","Craig Macdonald","Iadh Ounis"],"pdf_url":"https://arxiv.org/pdf/2311.15790v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2311.15716v1","updated":"2023-11-27T10:59:16Z","published":"2023-11-27T10:59:16Z","title":"Justifiable Artificial Intelligence: Engineering Large Language Models\n for Legal Applications","summary":" In this work, I discuss how Large Language Models can be applied in the legal\ndomain, circumventing their current drawbacks. Despite their large success and\nacceptance, their lack of explainability hinders legal experts to trust in\ntheir output, and this happens rightfully so. However, in this paper, I argue\nin favor of a new view, Justifiable Artificial Intelligence, instead of\nfocusing on Explainable Artificial Intelligence. I discuss in this paper how\ngaining evidence for and against a Large Language Model's output may make their\ngenerated texts more trustworthy - or hold them accountable for misinformation.\n","authors":["Sabine Wehnert"],"pdf_url":"https://arxiv.org/pdf/2311.15716v1.pdf","comment":"12 pages, 2 figures"},{"id":"http://arxiv.org/abs/2307.10230v3","updated":"2023-11-27T10:38:30Z","published":"2023-07-15T11:49:43Z","title":"Prompt Tuning on Graph-augmented Low-resource Text Classification","summary":" Text classification is a fundamental problem in information retrieval with\nmany real-world applications, such as predicting the topics of online articles\nand the categories of e-commerce product descriptions. However, low-resource\ntext classification, with no or few labeled samples, presents a serious concern\nfor supervised learning. Meanwhile, many text data are inherently grounded on a\nnetwork structure, such as a hyperlink/citation network for online articles,\nand a user-item purchase network for e-commerce products. These graph\nstructures capture rich semantic relationships, which can potentially augment\nlow-resource text classification. In this paper, we propose a novel model\ncalled Graph-Grounded Pre-training and Prompting (G2P2) to address low-resource\ntext classification in a two-pronged approach. During pre-training, we propose\nthree graph interaction-based contrastive strategies to jointly pre-train a\ngraph-text model; during downstream classification, we explore handcrafted\ndiscrete prompts and continuous prompt tuning for the jointly pre-trained model\nto achieve zero- and few-shot classification, respectively. Moreover, we\nexplore the possibility of employing continuous prompt tuning for zero-shot\ninference. Specifically, we aim to generalize continuous prompts to unseen\nclasses while leveraging a set of base classes. To this end, we extend G2P2\ninto G2P2$^*$, hinging on a new architecture of conditional prompt tuning.\nExtensive experiments on four real-world datasets demonstrate the strength of\nG2P2 in zero- and few-shot low-resource text classification tasks, and\nillustrate the advantage of G2P2$^*$ in dealing with unseen classes.\n","authors":["Zhihao Wen","Yuan Fang"],"pdf_url":"https://arxiv.org/pdf/2307.10230v3.pdf","comment":"14 pages, journal under review. 
arXiv admin note: substantial text\n overlap with arXiv:2305.03324"},{"id":"http://arxiv.org/abs/2311.15689v1","updated":"2023-11-27T10:28:06Z","published":"2023-11-27T10:28:06Z","title":"Two Approaches to the Identity of Processes in BFO","summary":" This paper aims to explore processes and their identity with a focus on the\nupper ontology Basic Formal Ontology (BFO). We begin with a classification\nbased on two basic classes of changes of independent continuants: changes with\nrespect to a single specifically dependent continuant thereof or with respect\nto the spatial region that its parts occupy. We accordingly distinguish two\nkinds of simple processes: specifically dependent continuant changes and\nspatial changes. Next, we investigate a compositional approach to the identity\nof processes: the identity of any process is determined by the identity of the\nsimple processes that compose them. Then, we consider a causal approach to the\nidentity of processes with recourse to a dispositional view of processes\naccording to which any process is a realization of some disposition. We also\nexamine assumptions on which these two approaches to the identity of processes\nare based.\n","authors":["Fumiaki Toyoshima","Adrien Barton"],"pdf_url":"https://arxiv.org/pdf/2311.15689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15578v1","updated":"2023-11-27T07:11:47Z","published":"2023-11-27T07:11:47Z","title":"Experimental Analysis of Large-scale Learnable Vector Storage\n Compression","summary":" Learnable embedding vector is one of the most important applications in\nmachine learning, and is widely used in various database-related domains.\nHowever, the high dimensionality of sparse data in recommendation tasks and the\nhuge volume of corpus in retrieval-related tasks lead to a large memory\nconsumption of the embedding table, which poses a great challenge to the\ntraining and deployment of models. Recent research has proposed various methods\nto compress the embeddings at the cost of a slight decrease in model quality or\nthe introduction of other overheads. Nevertheless, the relative performance of\nthese methods remains unclear. Existing experimental comparisons only cover a\nsubset of these methods and focus on limited metrics. In this paper, we perform\na comprehensive comparative analysis and experimental evaluation of embedding\ncompression. We introduce a new taxonomy that categorizes these techniques\nbased on their characteristics and methodologies, and further develop a modular\nbenchmarking framework that integrates 14 representative methods. Under a\nuniform test environment, our benchmark fairly evaluates each approach,\npresents their strengths and weaknesses under different memory budgets, and\nrecommends the best method based on the use case. In addition to providing\nuseful guidelines, our study also uncovers the limitations of current methods\nand suggests potential directions for future research.\n","authors":["Hailin Zhang","Penghao Zhao","Xupeng Miao","Yingxia Shao","Zirui Liu","Tong Yang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2311.15578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15564v1","updated":"2023-11-27T06:22:57Z","published":"2023-11-27T06:22:57Z","title":"Boot and Switch: Alternating Distillation for Zero-Shot Dense Retrieval","summary":" Neural 'dense' retrieval models are state of the art for many datasets,\nhowever these models often exhibit limited domain transfer ability. 
Existing\napproaches to adaptation are unwieldy, such as requiring explicit supervision,\ncomplex model architectures, or massive external models. We present\n$\\texttt{ABEL}$, a simple but effective unsupervised method to enhance passage\nretrieval in zero-shot settings. Our technique follows a straightforward loop:\na dense retriever learns from supervision signals provided by a reranker, and\nsubsequently, the reranker is updated based on feedback from the improved\nretriever. By iterating this loop, the two components mutually enhance one\nanother's performance. Experimental results demonstrate that our unsupervised\n$\\texttt{ABEL}$ model outperforms both leading supervised and unsupervised\nretrievers on the BEIR benchmark. Meanwhile, it exhibits strong adaptation\nabilities to tasks and domains that were unseen during training. By either\nfine-tuning $\\texttt{ABEL}$ on labelled data or integrating it with existing\nsupervised dense retrievers, we achieve state-of-the-art\nresults.\\footnote{Source code is available at\n\\url{https://github.com/Fantabulous-J/BootSwitch}.}\n","authors":["Fan Jiang","Qiongkai Xu","Tom Drummond","Trevor Cohn"],"pdf_url":"https://arxiv.org/pdf/2311.15564v1.pdf","comment":"Accepted by EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.15563v1","updated":"2023-11-27T06:19:50Z","published":"2023-11-27T06:19:50Z","title":"Noisy Self-Training with Synthetic Queries for Dense Retrieval","summary":" Although existing neural retrieval models reveal promising results when\ntraining data is abundant and the performance keeps improving as training data\nincreases, collecting high-quality annotated data is prohibitively costly. To\nthis end, we introduce a novel noisy self-training framework combined with\nsynthetic queries, showing that neural retrievers can be improved in a\nself-evolution manner with no reliance on any external models. Experimental\nresults show that our method improves consistently over existing methods on\nboth general-domain (e.g., MS-MARCO) and out-of-domain (i.e., BEIR) retrieval\nbenchmarks. Extra analysis on low-resource settings reveals that our method is\ndata efficient and outperforms competitive baselines, with as little as 30% of\nlabelled training data. Further extending the framework for reranker training\ndemonstrates that the proposed method is general and yields additional gains on\ntasks of diverse domains.\\footnote{Source code is available at\n\\url{https://github.com/Fantabulous-J/Self-Training-DPR}}\n","authors":["Fan Jiang","Tom Drummond","Trevor Cohn"],"pdf_url":"https://arxiv.org/pdf/2311.15563v1.pdf","comment":"Accepted by EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2311.13534v2","updated":"2023-11-27T02:52:46Z","published":"2023-11-22T17:14:54Z","title":"LM-Cocktail: Resilient Tuning of Language Models via Model Merging","summary":" The pre-trained language models are continually fine-tuned to better support\ndownstream applications. However, this operation may result in significant\nperformance degeneration on general tasks beyond the targeted domain. To\novercome this problem, we propose a novel method which enables the fine-tuned\nmodel to stay resilient in general perspectives. Our method is conducted in the\nform of model merging (namely LM-Cocktail), where the fine-tuned language model\nis merged with the pre-trained base model or the peer models from other domains\nthrough weighted average. 
Despite its simplicity, LM-Cocktail is surprisingly\neffective: the resulting model is able to achieve a strong empirical performance\nin the whole scope of general tasks while preserving a superior capacity in its\ntargeted domain. We conduct comprehensive experiments with LLama and BGE model\non popular benchmarks, including FLAN, MMLU, MTEB, whose results validate the\nefficacy of our proposed method. The code and checkpoints are available at\nhttps://github.com/FlagOpen/FlagEmbedding/tree/master/LM_Cocktail.\n","authors":["Shitao Xiao","Zheng Liu","Peitian Zhang","Xingrun Xing"],"pdf_url":"https://arxiv.org/pdf/2311.13534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15493v1","updated":"2023-11-27T02:30:39Z","published":"2023-11-27T02:30:39Z","title":"UFIN: Universal Feature Interaction Network for Multi-Domain\n Click-Through Rate Prediction","summary":" Click-Through Rate (CTR) prediction, which aims to estimate the probability\nof a user clicking on an item, is a key task in online advertising. Numerous\nexisting CTR models concentrate on modeling the feature interactions within a\nsolitary domain, thereby rendering them inadequate for fulfilling the\nrequisites of multi-domain recommendations in real industrial scenarios. Some\nrecent approaches propose intricate architectures to enhance knowledge sharing\nand augment model training across multiple domains. However, these approaches\nencounter difficulties when being transferred to new recommendation domains,\nowing to their reliance on the modeling of ID features (e.g., item id). To\naddress the above issue, we propose the Universal Feature Interaction Network\n(UFIN) approach for CTR prediction. UFIN exploits textual data to learn\nuniversal feature interactions that can be effectively transferred across\ndiverse domains. For learning universal feature representations, we regard the\ntext and feature as two different modalities and propose an encoder-decoder\nnetwork founded on a Large Language Model (LLM) to enforce the transfer of data\nfrom the text modality to the feature modality. Building upon the above\nfoundation, we further develop a mixture-of-experts (MoE) enhanced adaptive\nfeature interaction model to learn transferable collaborative patterns across\nmultiple domains. Furthermore, we propose a multi-domain knowledge distillation\nframework to enhance feature interaction learning. Based on the above methods,\nUFIN can effectively bridge the semantic gap to learn common knowledge across\nvarious domains, surpassing the constraints of ID-based models. Extensive\nexperiments conducted on eight datasets show the effectiveness of UFIN, in both\nmulti-domain and cross-platform settings. Our code is available at\nhttps://github.com/RUCAIBox/UFIN.\n","authors":["Zhen Tian","Changwang Zhang","Wayne Xin Zhao","Xin Zhao","Ji-Rong Wen","Zhao Cao"],"pdf_url":"https://arxiv.org/pdf/2311.15493v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2311.16102v1","updated":"2023-11-27T18:59:53Z","published":"2023-11-27T18:59:53Z","title":"Test-time Adaptation of Discriminative Models via Diffusion Generative\n Feedback","summary":" The advancements in generative modeling, particularly the advent of diffusion\nmodels, have sparked a fundamental question: how can these models be\neffectively used for discriminative tasks? 
In this work, we find that\ngenerative models can be great test-time adapters for discriminative models.\nOur method, Diffusion-TTA, adapts pre-trained discriminative models such as\nimage classifiers, segmenters and depth predictors, to each unlabelled example\nin the test set using generative feedback from a diffusion model. We achieve\nthis by modulating the conditioning of the diffusion model using the output of\nthe discriminative model. We then maximize the image likelihood objective by\nbackpropagating the gradients to discriminative model's parameters. We show\nDiffusion-TTA significantly enhances the accuracy of various large-scale\npre-trained discriminative models, such as, ImageNet classifiers, CLIP models,\nimage pixel labellers and image depth predictors. Diffusion-TTA outperforms\nexisting test-time adaptation methods, including TTT-MAE and TENT, and\nparticularly shines in online adaptation setups, where the discriminative model\nis continually adapted to each example in the test set. We provide access to\ncode, results, and visualizations on our website:\nhttps://diffusion-tta.github.io/.\n","authors":["Mihir Prabhudesai","Tsung-Wei Ke","Alexander C. Li","Deepak Pathak","Katerina Fragkiadaki"],"pdf_url":"https://arxiv.org/pdf/2311.16102v1.pdf","comment":"Accepted at NeurIPS 2023 Webpage with Code:\n https://diffusion-tta.github.io/"},{"id":"http://arxiv.org/abs/2311.16101v1","updated":"2023-11-27T18:59:42Z","published":"2023-11-27T18:59:42Z","title":"How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for\n Vision LLMs","summary":" This work focuses on the potential of Vision LLMs (VLLMs) in visual\nreasoning. Different from prior studies, we shift our focus from evaluating\nstandard performance to introducing a comprehensive safety evaluation suite,\ncovering both out-of-distribution (OOD) generalization and adversarial\nrobustness. For the OOD evaluation, we present two novel VQA datasets, each\nwith one variant, designed to test model performance under challenging\nconditions. In exploring adversarial robustness, we propose a straightforward\nattack strategy for misleading VLLMs to produce visual-unrelated responses.\nMoreover, we assess the efficacy of two jailbreaking strategies, targeting\neither the vision or language component of VLLMs. Our evaluation of 21 diverse\nmodels, ranging from open-source VLLMs to GPT-4V, yields interesting\nobservations: 1) Current VLLMs struggle with OOD texts but not images, unless\nthe visual information is limited; and 2) These VLLMs can be easily misled by\ndeceiving vision encoders only, and their vision-language training often\ncompromise safety protocols. We release this safety evaluation suite at\nhttps://github.com/UCSC-VLAA/vllm-safety-benchmark.\n","authors":["Haoqin Tu","Chenhang Cui","Zijun Wang","Yiyang Zhou","Bingchen Zhao","Junlin Han","Wangchunshu Zhou","Huaxiu Yao","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2311.16101v1.pdf","comment":"H.T., C.C., and Z.W. contribute equally. Work done during H.T. and\n Z.W.'s internship at UCSC, and C.C. and Y.Z.'s internship at UNC"},{"id":"http://arxiv.org/abs/2311.16098v1","updated":"2023-11-27T18:59:25Z","published":"2023-11-27T18:59:25Z","title":"On Bringing Robots Home","summary":" Throughout history, we have successfully integrated various machines into our\nhomes. Dishwashers, laundry machines, stand mixers, and robot vacuums are a few\nrecent examples. However, these machines excel at performing only a single task\neffectively. 
The concept of a \"generalist machine\" in homes - a domestic\nassistant that can adapt and learn from our needs, all while remaining\ncost-effective - has long been a goal in robotics that has been steadily\npursued for decades. In this work, we initiate a large-scale effort towards\nthis goal by introducing Dobb-E, an affordable yet versatile general-purpose\nsystem for learning robotic manipulation within household settings. Dobb-E can\nlearn a new task with only five minutes of a user showing it how to do it,\nthanks to a demonstration collection tool (\"The Stick\") we built out of cheap\nparts and iPhones. We use the Stick to collect 13 hours of data in 22 homes of\nNew York City, and train Home Pretrained Representations (HPR). Then, in a\nnovel home environment, with five minutes of demonstrations and fifteen minutes\nof adapting the HPR model, we show that Dobb-E can reliably solve the task on\nthe Stretch, a mobile robot readily available on the market. Across roughly 30\ndays of experimentation in homes of New York City and surrounding areas, we\ntest our system in 10 homes, with a total of 109 tasks in different\nenvironments, and finally achieve a success rate of 81%. Beyond success\npercentages, our experiments reveal a plethora of unique challenges absent or\nignored in lab robotics. These range from effects of strong shadows, to\nvariable demonstration quality by non-expert users. With the hope of\naccelerating research on home robots, and eventually seeing robot butlers in\nevery home, we open-source Dobb-E software stack and models, our data, and our\nhardware designs at https://dobb-e.com\n","authors":["Nur Muhammad Mahi Shafiullah","Anant Rai","Haritheja Etukuru","Yiqian Liu","Ishan Misra","Soumith Chintala","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2311.16098v1.pdf","comment":"Project website and videos are available at https://dobb-e.com,\n technical documentation for getting started is available at\n https://docs.dobb-e.com, and code is released at\n https://github.com/notmahi/dobb-e"},{"id":"http://arxiv.org/abs/2311.16093v1","updated":"2023-11-27T18:58:34Z","published":"2023-11-27T18:58:34Z","title":"Have we built machines that think like people?","summary":" A chief goal of artificial intelligence is to build machines that think like\npeople. Yet it has been argued that deep neural network architectures fail to\naccomplish this. Researchers have asserted these models' limitations in the\ndomains of causal reasoning, intuitive physics, and intuitive psychology. Yet\nrecent advancements, namely the rise of large language models, particularly\nthose designed for visual processing, have rekindled interest in the potential\nto emulate human-like cognitive abilities. This paper evaluates the current\nstate of vision-based large language models in the domains of intuitive\nphysics, causal reasoning, and intuitive psychology. Through a series of\ncontrolled experiments, we investigate the extent to which these modern models\ngrasp complex physical interactions, causal relationships, and intuitive\nunderstanding of others' preferences. Our findings reveal that, while these\nmodels demonstrate a notable proficiency in processing and interpreting visual\ndata, they still fall short of human capabilities in these areas. The models\nexhibit a rudimentary understanding of physical laws and causal relationships,\nbut their performance is hindered by a lack of deeper insights-a key aspect of\nhuman cognition. 
Furthermore, in tasks requiring an intuitive theory of mind,\nthe models fail altogether. Our results emphasize the need for integrating more\nrobust mechanisms for understanding causality, physical dynamics, and social\ncognition into modern-day, vision-based language models, and point out the\nimportance of cognitively-inspired benchmarks.\n","authors":["Luca M. Schulze Buschoff","Elif Akata","Matthias Bethge","Eric Schulz"],"pdf_url":"https://arxiv.org/pdf/2311.16093v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16091v1","updated":"2023-11-27T18:57:42Z","published":"2023-11-27T18:57:42Z","title":"Interactive Autonomous Navigation with Internal State Inference and\n Interactivity Estimation","summary":" Deep reinforcement learning (DRL) provides a promising way for intelligent\nagents (e.g., autonomous vehicles) to learn to navigate complex scenarios.\nHowever, DRL with neural networks as function approximators is typically\nconsidered a black box with little explainability and often suffers from\nsuboptimal performance, especially for autonomous navigation in highly\ninteractive multi-agent environments. To address these issues, we propose three\nauxiliary tasks with spatio-temporal relational reasoning and integrate them\ninto the standard DRL framework, which improves the decision making performance\nand provides explainable intermediate indicators. We propose to explicitly\ninfer the internal states (i.e., traits and intentions) of surrounding agents\n(e.g., human drivers) as well as to predict their future trajectories in the\nsituations with and without the ego agent through counterfactual reasoning.\nThese auxiliary tasks provide additional supervision signals to infer the\nbehavior patterns of other interactive agents. Multiple variants of framework\nintegration strategies are compared. We also employ a spatio-temporal graph\nneural network to encode relations between dynamic entities, which enhances\nboth internal state inference and decision making of the ego agent. Moreover,\nwe propose an interactivity estimation mechanism based on the difference\nbetween predicted trajectories in these two situations, which indicates the\ndegree of influence of the ego agent on other agents. To validate the proposed\nmethod, we design an intersection driving simulator based on the Intelligent\nIntersection Driver Model (IIDM) that simulates vehicles and pedestrians. Our\napproach achieves robust and state-of-the-art performance in terms of standard\nevaluation metrics and provides explainable intermediate indicators (i.e.,\ninternal states, and interactivity scores) for decision making.\n","authors":["Jiachen Li","David Isele","Kanghoon Lee","Jinkyoo Park","Kikuo Fujimura","Mykel J. Kochenderfer"],"pdf_url":"https://arxiv.org/pdf/2311.16091v1.pdf","comment":"18 pages, 14 figures"},{"id":"http://arxiv.org/abs/2311.16086v1","updated":"2023-11-27T18:56:03Z","published":"2023-11-27T18:56:03Z","title":"MAST: Model-Agnostic Sparsified Training","summary":" We introduce a novel optimization problem formulation that departs from the\nconventional way of minimizing machine learning model loss as a black-box\nfunction. Unlike traditional formulations, the proposed approach explicitly\nincorporates an initially pre-trained model and random sketch operators,\nallowing for sparsification of both the model and gradient during training. We\nestablish insightful properties of the proposed objective function and\nhighlight its connections to the standard formulation. 
Furthermore, we present\nseveral variants of the Stochastic Gradient Descent (SGD) method adapted to the\nnew problem formulation, including SGD with general sampling, a distributed\nversion, and SGD with variance reduction techniques. We achieve tighter\nconvergence rates and relax assumptions, bridging the gap between theoretical\nprinciples and practical applications, covering several important techniques\nsuch as Dropout and Sparse training. This work presents promising opportunities\nto enhance the theoretical understanding of model training through a\nsparsification-aware optimization approach.\n","authors":["Yury Demidovich","Grigory Malinovsky","Egor Shulgin","Peter Richtárik"],"pdf_url":"https://arxiv.org/pdf/2311.16086v1.pdf","comment":"58 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.16082v1","updated":"2023-11-27T18:52:25Z","published":"2023-11-27T18:52:25Z","title":"Transformer-QEC: Quantum Error Correction Code Decoding with\n Transferable Transformers","summary":" Quantum computing has the potential to solve problems that are intractable\nfor classical systems, yet the high error rates in contemporary quantum devices\noften exceed tolerable limits for useful algorithm execution. Quantum Error\nCorrection (QEC) mitigates this by employing redundancy, distributing quantum\ninformation across multiple data qubits and utilizing syndrome qubits to\nmonitor their states for errors. The syndromes are subsequently interpreted by\na decoding algorithm to identify and correct errors in the data qubits. This\ntask is complex due to the multiplicity of error sources affecting both data\nand syndrome qubits as well as syndrome extraction operations. Additionally,\nidentical syndromes can emanate from different error sources, necessitating a\ndecoding algorithm that evaluates syndromes collectively. Although machine\nlearning (ML) decoders such as multi-layer perceptrons (MLPs) and convolutional\nneural networks (CNNs) have been proposed, they often focus on local syndrome\nregions and require retraining when adjusting for different code distances. We\nintroduce a transformer-based QEC decoder which employs self-attention to\nachieve a global receptive field across all input syndromes. It incorporates a\nmixed loss training approach, combining both local physical error and global\nparity label losses. Moreover, the transformer architecture's inherent\nadaptability to variable-length inputs allows for efficient transfer learning,\nenabling the decoder to adapt to varying code distances without retraining.\n Evaluation on six code distances and ten different error configurations\ndemonstrates that our model consistently outperforms non-ML decoders, such as\nUnion Find (UF) and Minimum Weight Perfect Matching (MWPM), and other ML\ndecoders, thereby achieving best logical error rates. Moreover, the transfer\nlearning can save over 10x of training cost.\n","authors":["Hanrui Wang","Pengyu Liu","Kevin Shao","Dantong Li","Jiaqi Gu","David Z. 
Pan","Yongshan Ding","Song Han"],"pdf_url":"https://arxiv.org/pdf/2311.16082v1.pdf","comment":"Accepted to ICCAD 2023, FAST ML for Science Workshop; 7 pages, 8\n figures"},{"id":"http://arxiv.org/abs/2311.16080v1","updated":"2023-11-27T18:50:37Z","published":"2023-11-27T18:50:37Z","title":"XLB: Distributed Multi-GPU Lattice Boltzmann Simulation Framework for\n Differentiable Scientific Machine Learning","summary":" The lattice Boltzmann method (LBM) has emerged as a prominent technique for\nsolving fluid dynamics problems due to its algorithmic potential for\ncomputational scalability. We introduce XLB framework, a Python-based\ndifferentiable LBM library which harnesses the capabilities of the JAX\nframework. The architecture of XLB is predicated upon ensuring accessibility,\nextensibility, and computational performance, enabling scaling effectively\nacross CPU, multi-GPU, and distributed multi-GPU systems. The framework can be\nreadily augmented with novel boundary conditions, collision models, or\nsimulation capabilities. XLB offers the unique advantage of integration with\nJAX's extensive machine learning echosystem, and the ability to utilize\nautomatic differentiation for tackling physics-based machine learning,\noptimization, and inverse problems. XLB has been successfully scaled to handle\nsimulations with billions of cells, achieving giga-scale lattice updates per\nsecond. XLB is released under the permissive Apache-2.0 license and is\navailable on GitHub at https://github.com/Autodesk/XLB.\n","authors":["Mohammadmehdi Ataei","Hesam Salehipour"],"pdf_url":"https://arxiv.org/pdf/2311.16080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16079v1","updated":"2023-11-27T18:49:43Z","published":"2023-11-27T18:49:43Z","title":"MEDITRON-70B: Scaling Medical Pretraining for Large Language Models","summary":" Large language models (LLMs) can potentially democratize access to medical\nknowledge. While many efforts have been made to harness and improve LLMs'\nmedical knowledge and reasoning capacities, the resulting models are either\nclosed-source (e.g., PaLM, GPT-4) or limited in scale (<= 13B parameters),\nwhich restricts their abilities. In this work, we improve access to large-scale\nmedical LLMs by releasing MEDITRON: a suite of open-source LLMs with 7B and 70B\nparameters adapted to the medical domain. MEDITRON builds on Llama-2 (through\nour adaptation of Nvidia's Megatron-LM distributed trainer), and extends\npretraining on a comprehensively curated medical corpus, including selected\nPubMed articles, abstracts, and internationally-recognized medical guidelines.\nEvaluations using four major medical benchmarks show significant performance\ngains over several state-of-the-art baselines before and after task-specific\nfinetuning. Overall, MEDITRON achieves a 6% absolute performance gain over the\nbest public baseline in its parameter class and 3% over the strongest baseline\nwe finetuned from Llama-2. Compared to closed-source LLMs, MEDITRON-70B\noutperforms GPT-3.5 and Med-PaLM and is within 5% of GPT-4 and 10% of\nMed-PaLM-2. 
We release our code for curating the medical pretraining corpus and\nthe MEDITRON model weights to drive open-source development of more capable\nmedical LLMs.\n","authors":["Zeming Chen","Alejandro Hernández Cano","Angelika Romanou","Antoine Bonnet","Kyle Matoba","Francesco Salvi","Matteo Pagliardini","Simin Fan","Andreas Köpf","Amirkeivan Mohtashami","Alexandre Sallinen","Alireza Sakhaeirad","Vinitra Swamy","Igor Krawczuk","Deniz Bayazit","Axel Marmet","Syrielle Montariol","Mary-Anne Hartley","Martin Jaggi","Antoine Bosselut"],"pdf_url":"https://arxiv.org/pdf/2311.16079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.14309v2","updated":"2023-11-27T18:48:33Z","published":"2022-11-25T18:59:53Z","title":"FutureHuman3D: Forecasting Complex Long-Term 3D Human Behavior from\n Video Observations","summary":" We present a generative approach to forecast long-term future human behavior\nin 3D, requiring only weak supervision from readily available 2D human action\ndata. This is a fundamental task enabling many downstream applications. The\nrequired ground-truth data is hard to capture in 3D (mocap suits, expensive\nsetups) but easy to acquire in 2D (simple RGB cameras). Thus, we design our\nmethod to only require 2D RGB data while being able to generate 3D human motion\nsequences. We use a differentiable 2D projection scheme in an autoregressive\nmanner for weak supervision, and an adversarial loss for 3D regularization. Our\nmethod predicts long and complex behavior sequences (e.g. cooking, assembly)\nconsisting of multiple sub-actions. We tackle this in a semantically\nhierarchical manner, jointly predicting high-level coarse action labels\ntogether with their low-level fine-grained realizations as characteristic 3D\nhuman poses. We observe that these two action representations are coupled in\nnature, and joint prediction benefits both action and pose forecasting. Our\nexperiments demonstrate the complementary nature of joint action and 3D pose\nprediction: our joint approach outperforms each task treated individually,\nenables robust longer-term sequence prediction, and outperforms alternative\napproaches to forecast actions and characteristic 3D poses.\n","authors":["Christian Diller","Thomas Funkhouser","Angela Dai"],"pdf_url":"https://arxiv.org/pdf/2211.14309v2.pdf","comment":"Project Page: https://future-human-3d.christian-diller.de/ Video:\n https://www.youtube.com/watch?v=18du85YFXL0"},{"id":"http://arxiv.org/abs/2311.16065v1","updated":"2023-11-27T18:32:08Z","published":"2023-11-27T18:32:08Z","title":"A Survey on Vulnerability of Federated Learning: A Learning Algorithm\n Perspective","summary":" This review paper takes a comprehensive look at malicious attacks against FL,\ncategorizing them from new perspectives on attack origins and targets, and\nproviding insights into their methodology and impact. In this survey, we focus\non threat models targeting the learning process of FL systems. Based on the\nsource and target of the attack, we categorize existing threat models into four\ntypes, Data to Model (D2M), Model to Data (M2D), Model to Model (M2M) and\ncomposite attacks. For each attack type, we discuss the defense strategies\nproposed, highlighting their effectiveness, assumptions and potential areas for\nimprovement. Defense strategies have evolved from using a singular metric to\nexcluding malicious clients, to employing a multifaceted approach examining\nclient models at various phases. 
In this survey paper, our research indicates\nthat the to-learn data, the learning gradients, and the learned model at\ndifferent stages can all be manipulated to initiate malicious attacks that\nrange from undermining model performance to reconstructing private local data\nand inserting backdoors. We have also seen that these threats are becoming more\ninsidious. While earlier studies typically amplified malicious gradients,\nrecent endeavors subtly alter the least significant weights in local models to\nbypass defense measures. This literature review provides a holistic\nunderstanding of the current FL threat landscape and highlights the importance\nof developing robust, efficient, and privacy-preserving defenses to ensure the\nsafe and trusted adoption of FL in real-world applications.\n","authors":["Xianghua Xie","Chen Hu","Hanchi Ren","Jingjing Deng"],"pdf_url":"https://arxiv.org/pdf/2311.16065v1.pdf","comment":"https://github.com/Rand2AI/Awesome-Vulnerability-of-Federated-Learning"},{"id":"http://arxiv.org/abs/2311.14078v2","updated":"2023-11-27T18:31:15Z","published":"2023-11-23T16:12:00Z","title":"Machine learning-based decentralized TDMA for VLC IoT networks","summary":" In this paper, a machine learning-based decentralized time division multiple\naccess (TDMA) algorithm for visible light communication (VLC) Internet of\nThings (IoT) networks is proposed. The proposed algorithm is based on\nQ-learning, a reinforcement learning algorithm. This paper considers a\ndecentralized condition in which there is no coordinator node for sending\nsynchronization frames and assigning transmission time slots to other nodes.\nThe proposed algorithm uses a decentralized manner for synchronization, and\neach node uses the Q-learning algorithm to find the optimal transmission time\nslot for sending data without collisions. The proposed algorithm is implemented\non a VLC hardware system, which had been designed and implemented in our\nlaboratory. Average reward, convergence time, goodput, average delay, and data\npacket size are the evaluated parameters. The results show that the proposed\nalgorithm converges quickly and provides collision-free decentralized TDMA for\nthe network. The proposed algorithm is compared with the carrier-sense multiple\naccess with collision avoidance (CSMA/CA) algorithm as a potential selection\nfor decentralized VLC IoT networks. The results show that the proposed\nalgorithm provides up to 61% more goodput and up to 49% less average delay than\nCSMA/CA.\n","authors":["Armin Makvandi","Yousef Seifi Kavian"],"pdf_url":"https://arxiv.org/pdf/2311.14078v2.pdf","comment":"This work has been submitted to a journal for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2210.06462v3","updated":"2023-11-27T18:30:14Z","published":"2022-10-12T17:57:58Z","title":"Self-Guided Diffusion Models","summary":" Diffusion models have demonstrated remarkable progress in image generation\nquality, especially when guidance is used to control the generative process.\nHowever, guidance requires a large number of image-annotation pairs for\ntraining and is thus dependent on their availability, correctness and\nunbiasedness. In this paper, we eliminate the need for such annotation by\ninstead leveraging the flexibility of self-supervision signals to design a\nframework for self-guided diffusion models. 
By leveraging a feature extraction\nfunction and a self-annotation function, our method provides guidance signals\nat various image granularities: from the level of holistic images to object\nboxes and even segmentation masks. Our experiments on single-label and\nmulti-label image datasets demonstrate that self-labeled guidance always\noutperforms diffusion models without guidance and may even surpass guidance\nbased on ground-truth labels, especially on unbalanced data. When equipped with\nself-supervised box or mask proposals, our method further generates visually\ndiverse yet semantically consistent images, without the need for any class,\nbox, or segment label annotation. Self-guided diffusion is simple, flexible and\nexpected to profit from deployment at scale. Source code will be at:\nhttps://taohu.me/sgdm/\n","authors":["Vincent Tao Hu","David W Zhang","Yuki M. Asano","Gertjan J. Burghouts","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2210.06462v3.pdf","comment":"CVPR 2023"},{"id":"http://arxiv.org/abs/2112.12589v3","updated":"2023-11-27T18:29:31Z","published":"2021-12-20T13:46:39Z","title":"A deep reinforcement learning model for predictive maintenance planning\n of road assets: Integrating LCA and LCCA","summary":" Road maintenance planning is an integral part of road asset management. One\nof the main challenges in Maintenance and Rehabilitation (M&R) practices is to\ndetermine maintenance type and timing. This research proposes a framework using\nReinforcement Learning (RL) based on the Long Term Pavement Performance (LTPP)\ndatabase to determine the type and timing of M&R practices. A predictive DNN\nmodel is first developed in the proposed algorithm, which serves as the\nEnvironment for the RL algorithm. For the Policy estimation of the RL model,\nboth DQN and PPO models are developed. However, PPO has been selected in the\nend due to better convergence and higher sample efficiency. Indicators used in\nthis study are International Roughness Index (IRI) and Rutting Depth (RD).\nInitially, we considered Cracking Metric (CM) as the third indicator, but it\nwas then excluded due to the much fewer data compared to other indicators,\nwhich resulted in lower accuracy of the results. Furthermore, in\ncost-effectiveness calculation (reward), we considered both the economic and\nenvironmental impacts of M&R treatments. Costs and environmental impacts have\nbeen evaluated with paLATE 2.0 software. Our method is tested on a hypothetical\ncase study of a six-lane highway with 23 kilometers length located in Texas,\nwhich has a warm and wet climate. The results propose a 20-year M&R plan in\nwhich road condition remains in an excellent condition range. Because the early\nstate of the road is at a good level of service, there is no need for heavy\nmaintenance practices in the first years. Later, after heavy M&R actions, there\nare several 1-2 years of no need for treatments. All of these show that the\nproposed plan has a logical result. 
Decision-makers and transportation agencies\ncan use this scheme to conduct better maintenance practices that can prevent\nbudget waste and, at the same time, minimize the environmental impacts.\n","authors":["Moein Latifi","Fateme Golivand Darvishvand","Omid Khandel","Mobin Latifi Nowsoud"],"pdf_url":"https://arxiv.org/pdf/2112.12589v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.08805v3","updated":"2023-11-27T18:24:41Z","published":"2021-11-16T22:16:03Z","title":"Online Estimation and Optimization of Utility-Based Shortfall Risk","summary":" Utility-Based Shortfall Risk (UBSR) is a risk metric that is increasingly\npopular in financial applications, owing to certain desirable properties that\nit enjoys. We consider the problem of estimating UBSR in a recursive setting,\nwhere samples from the underlying loss distribution are available\none-at-a-time. We cast the UBSR estimation problem as a root finding problem,\nand propose stochastic approximation-based estimations schemes. We derive\nnon-asymptotic bounds on the estimation error in the number of samples. We also\nconsider the problem of UBSR optimization within a parameterized class of\nrandom variables. We propose a stochastic gradient descent based algorithm for\nUBSR optimization, and derive non-asymptotic bounds on its convergence.\n","authors":["Vishwajit Hegde","Arvind S. Menon","L. A. Prashanth","Krishna Jagannathan"],"pdf_url":"https://arxiv.org/pdf/2111.08805v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16054v1","updated":"2023-11-27T18:19:07Z","published":"2023-11-27T18:19:07Z","title":"Metric Space Magnitude for Evaluating Unsupervised Representation\n Learning","summary":" The magnitude of a metric space was recently established as a novel\ninvariant, providing a measure of the `effective size' of a space across\nmultiple scales. By capturing both geometrical and topological properties of\ndata, magnitude is poised to address challenges in unsupervised representation\nlearning tasks. We formalise a novel notion of dissimilarity between magnitude\nfunctions of finite metric spaces and use them to derive a quality measure for\ndimensionality reduction tasks. Our measure is provably stable under\nperturbations of the data, can be efficiently calculated, and enables a\nrigorous multi-scale comparison of embeddings. We show the utility of our\nmeasure in an experimental suite that comprises different domains and tasks,\nincluding the comparison of data visualisations.\n","authors":["Katharina Limbeck","Rayna Andreeva","Rik Sarkar","Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2311.16054v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16038v1","updated":"2023-11-27T17:59:41Z","published":"2023-11-27T17:59:41Z","title":"OccWorld: Learning a 3D Occupancy World Model for Autonomous Driving","summary":" Understanding how the 3D scene evolves is vital for making decisions in\nautonomous driving. Most existing methods achieve this by predicting the\nmovements of object boxes, which cannot capture more fine-grained scene\ninformation. In this paper, we explore a new framework of learning a world\nmodel, OccWorld, in the 3D Occupancy space to simultaneously predict the\nmovement of the ego car and the evolution of the surrounding scenes. We propose\nto learn a world model based on 3D occupancy rather than 3D bounding boxes and\nsegmentation maps for three reasons: 1) expressiveness. 3D occupancy can\ndescribe the more fine-grained 3D structure of the scene; 2) efficiency. 
3D\noccupancy is more economical to obtain (e.g., from sparse LiDAR points). 3)\nversatility. 3D occupancy can adapt to both vision and LiDAR. To facilitate the\nmodeling of the world evolution, we learn a reconstruction-based scene\ntokenizer on the 3D occupancy to obtain discrete scene tokens to describe the\nsurrounding scenes. We then adopt a GPT-like spatial-temporal generative\ntransformer to generate subsequent scene and ego tokens to decode the future\noccupancy and ego trajectory. Extensive experiments on the widely used nuScenes\nbenchmark demonstrate the ability of OccWorld to effectively model the\nevolution of the driving scenes. OccWorld also produces competitive planning\nresults without using instance and map supervision. Code:\nhttps://github.com/wzzheng/OccWorld.\n","authors":["Wenzhao Zheng","Weiliang Chen","Yuanhui Huang","Borui Zhang","Yueqi Duan","Jiwen Lu"],"pdf_url":"https://arxiv.org/pdf/2311.16038v1.pdf","comment":"Code is available at: https://github.com/wzzheng/OccWorld"},{"id":"http://arxiv.org/abs/2308.00709v2","updated":"2023-11-27T17:57:18Z","published":"2023-07-28T22:52:15Z","title":"DeepTSF: Codeless machine learning operations for time series\n forecasting","summary":" This paper presents DeepTSF, a comprehensive machine learning operations\n(MLOps) framework aiming to innovate time series forecasting through workflow\nautomation and codeless modeling. DeepTSF automates key aspects of the ML\nlifecycle, making it an ideal tool for data scientists and MLops engineers\nengaged in machine learning (ML) and deep learning (DL)-based forecasting.\nDeepTSF empowers users with a robust and user-friendly solution, while it is\ndesigned to seamlessly integrate with existing data analysis workflows,\nproviding enhanced productivity and compatibility. The framework offers a\nfront-end user interface (UI) suitable for data scientists, as well as other\nhigher-level stakeholders, enabling comprehensive understanding through\ninsightful visualizations and evaluation metrics. DeepTSF also prioritizes\nsecurity through identity management and access authorization mechanisms. The\napplication of DeepTSF in real-life use cases of the I-NERGY project has\nalready proven DeepTSF's efficacy in DL-based load forecasting, showcasing its\nsignificant added value in the electrical power and energy systems domain.\n","authors":["Sotiris Pelekis","Evangelos Karakolis","Theodosios Pountridis","George Kormpakis","George Lampropoulos","Spiros Mouzakitis","Dimitris Askounis"],"pdf_url":"https://arxiv.org/pdf/2308.00709v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16035v1","updated":"2023-11-27T17:55:50Z","published":"2023-11-27T17:55:50Z","title":"RobustState: Boosting Fidelity of Quantum State Preparation via\n Noise-Aware Variational Training","summary":" Quantum state preparation, a crucial subroutine in quantum computing,\ninvolves generating a target quantum state from initialized qubits. Arbitrary\nstate preparation algorithms can be broadly categorized into arithmetic\ndecomposition (AD) and variational quantum state preparation (VQSP). AD employs\na predefined procedure to decompose the target state into a series of gates,\nwhereas VQSP iteratively tunes ansatz parameters to approximate target state.\nVQSP is particularly apt for Noisy-Intermediate Scale Quantum (NISQ) machines\ndue to its shorter circuits. 
However, achieving noise-robust parameter\noptimization still remains challenging.\n We present RobustState, a novel VQSP training methodology that combines high\nrobustness with high training efficiency. The core idea involves utilizing\nmeasurement outcomes from real machines to perform back-propagation through\nclassical simulators, thus incorporating real quantum noise into gradient\ncalculations. RobustState serves as a versatile, plug-and-play technique\napplicable for training parameters from scratch or fine-tuning existing\nparameters to enhance fidelity on target machines. It is adaptable to various\nansatzes at both gate and pulse levels and can even benefit other variational\nalgorithms, such as variational unitary synthesis.\n Comprehensive evaluation of RobustState on state preparation tasks for 4\ndistinct quantum algorithms using 10 real quantum machines demonstrates a\ncoherent error reduction of up to 7.1 $\\times$ and state fidelity improvement\nof up to 96\\% and 81\\% for 4-Q and 5-Q states, respectively. On average,\nRobustState improves fidelity by 50\\% and 72\\% for 4-Q and 5-Q states compared\nto baseline approaches.\n","authors":["Hanrui Wang","Yilian Liu","Pengyu Liu","Jiaqi Gu","Zirui Li","Zhiding Liang","Jinglei Cheng","Yongshan Ding","Xuehai Qian","Yiyu Shi","David Z. Pan","Frederic T. Chong","Song Han"],"pdf_url":"https://arxiv.org/pdf/2311.16035v1.pdf","comment":"Accepted to FASTML @ ICCAD 2023. 14 pages, 20 figures"},{"id":"http://arxiv.org/abs/2311.16030v1","updated":"2023-11-27T17:50:14Z","published":"2023-11-27T17:50:14Z","title":"Machine Learning-Enhanced Aircraft Landing Scheduling under\n Uncertainties","summary":" This paper addresses aircraft delays, emphasizing their impact on safety and\nfinancial losses. To mitigate these issues, an innovative machine learning\n(ML)-enhanced landing scheduling methodology is proposed, aiming to improve\nautomation and safety. Analyzing flight arrival delay scenarios reveals strong\nmultimodal distributions and clusters in arrival flight time durations. A\nmulti-stage conditional ML predictor enhances separation time prediction based\non flight events. ML predictions are then integrated as safety constraints in a\ntime-constrained traveling salesman problem formulation, solved using\nmixed-integer linear programming (MILP). Historical flight recordings and model\npredictions address uncertainties between successive flights, ensuring\nreliability. The proposed method is validated using real-world data from the\nAtlanta Air Route Traffic Control Center (ARTCC ZTL). Case studies demonstrate\nan average 17.2% reduction in total landing time compared to the\nFirst-Come-First-Served (FCFS) rule. Unlike FCFS, the proposed methodology\nconsiders uncertainties, instilling confidence in scheduling. The study\nconcludes with remarks and outlines future research directions.\n","authors":["Yutian Pang","Peng Zhao","Jueming Hu","Yongming Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16026v1","updated":"2023-11-27T17:40:02Z","published":"2023-11-27T17:40:02Z","title":"A Neural Framework for Generalized Causal Sensitivity Analysis","summary":" Unobserved confounding is common in many applications, making causal\ninference from observational data challenging. As a remedy, causal sensitivity\nanalysis is an important tool to draw causal conclusions under unobserved\nconfounding with mathematical guarantees. 
In this paper, we propose NeuralCSA,\na neural framework for generalized causal sensitivity analysis. Unlike previous\nwork, our framework is compatible with (i) a large class of sensitivity models,\nincluding the marginal sensitivity model, f-sensitivity models, and Rosenbaum's\nsensitivity model; (ii) different treatment types (i.e., binary and\ncontinuous); and (iii) different causal queries, including (conditional)\naverage treatment effects and simultaneous effects on multiple outcomes. The\ngenerality of NeuralCSA is achieved by learning a latent distribution\nshift that corresponds to a treatment intervention using two conditional\nnormalizing flows. We provide theoretical guarantees that NeuralCSA is able to\ninfer valid bounds on the causal query of interest and also demonstrate this\nempirically using both simulated and real-world data.\n","authors":["Dennis Frauen","Fergus Imrie","Alicia Curth","Valentyn Melnychuk","Stefan Feuerriegel","Mihaela van der Schaar"],"pdf_url":"https://arxiv.org/pdf/2311.16026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13258v2","updated":"2023-11-27T17:36:19Z","published":"2023-10-20T03:34:31Z","title":"ManiCast: Collaborative Manipulation with Cost-Aware Human Forecasting","summary":" Seamless human-robot manipulation in close proximity relies on accurate\nforecasts of human motion. While there has been significant progress in\nlearning forecast models at scale, when applied to manipulation tasks, these\nmodels accrue high errors at critical transition points leading to degradation\nin downstream planning performance. Our key insight is that instead of\npredicting the most likely human motion, it is sufficient to produce forecasts\nthat capture how future human motion would affect the cost of a robot's plan.\nWe present ManiCast, a novel framework that learns cost-aware human forecasts\nand feeds them to a model predictive control planner to execute collaborative\nmanipulation tasks. Our framework enables fluid, real-time interactions between\na human and a 7-DoF robot arm across a number of real-world tasks such as\nreactive stirring, object handovers, and collaborative table setting. We\nevaluate both the motion forecasts and the end-to-end forecaster-planner system\nagainst a range of learned and heuristic baselines while additionally\ncontributing new datasets. We release our code and datasets at\nhttps://portal-cornell.github.io/manicast/.\n","authors":["Kushal Kedia","Prithwish Dan","Atiksh Bhardwaj","Sanjiban Choudhury"],"pdf_url":"https://arxiv.org/pdf/2310.13258v2.pdf","comment":"CoRL 2023"},{"id":"http://arxiv.org/abs/2311.16021v1","updated":"2023-11-27T17:35:28Z","published":"2023-11-27T17:35:28Z","title":"Scheduling and Communication Schemes for Decentralized Federated\n Learning","summary":" Federated learning (FL) is a distributed machine learning paradigm in which a\nlarge number of clients coordinate with a central server to learn a model\nwithout sharing their own training data. One central server is not enough, due\nto problems of connectivity with clients. In this paper, a decentralized\nfederated learning (DFL) model with the stochastic gradient descent (SGD)\nalgorithm has been introduced, as a more scalable approach to improve the\nlearning performance in a network of agents with arbitrary topology. 
Three\nscheduling policies for DFL have been proposed for communications between the\nclients and the parallel servers, and the convergence, accuracy, and loss have\nbeen tested in a totally decentralized implementation of SGD. The experimental\nresults show that the proposed scheduling policies have an impact both on the\nspeed of convergence and on the final global model.\n","authors":["Bahaa-Eldin Ali Abdelghany","Ana Fernández-Vilas","Manuel Fernández-Veiga","Nashwa El-Bendary","Ammar M. Hassan","Walid M. Abdelmoez"],"pdf_url":"https://arxiv.org/pdf/2311.16021v1.pdf","comment":"32nd International Conference on Computer Theory and Applications\n (ICCTA), Alexandria, Egypt, 2022"},{"id":"http://arxiv.org/abs/2203.09659v3","updated":"2023-11-27T17:23:10Z","published":"2022-03-17T23:52:08Z","title":"Low-degree learning and the metric entropy of polynomials","summary":" Let $\\mathscr{F}_{n,d}$ be the class of all functions $f:\\{-1,1\\}^n\\to[-1,1]$\non the $n$-dimensional discrete hypercube of degree at most $d$. In the first\npart of this paper, we prove that any (deterministic or randomized) algorithm\nwhich learns $\\mathscr{F}_{n,d}$ with $L_2$-accuracy $\\varepsilon$ requires at\nleast $\\Omega((1-\\sqrt{\\varepsilon})2^d\\log n)$ queries for large enough $n$,\nthus establishing the sharpness as $n\\to\\infty$ of a recent upper bound of\nEskenazis and Ivanisvili (2021). To do this, we show that the $L_2$-packing\nnumbers $\\mathsf{M}(\\mathscr{F}_{n,d},\\|\\cdot\\|_{L_2},\\varepsilon)$ of the\nconcept class $\\mathscr{F}_{n,d}$ satisfy the two-sided estimate\n$$c(1-\\varepsilon)2^d\\log n \\leq \\log\n\\mathsf{M}(\\mathscr{F}_{n,d},\\|\\cdot\\|_{L_2},\\varepsilon) \\leq \\frac{2^{Cd}\\log\nn}{\\varepsilon^4}$$ for large enough $n$, where $c, C>0$ are universal\nconstants. In the second part of the paper, we present a logarithmic upper\nbound for the randomized query complexity of classes of bounded approximate\npolynomials whose Fourier spectra are concentrated on few subsets. As an\napplication, we prove new estimates for the number of random queries required\nto learn approximate juntas of a given degree, functions with rapidly decaying\nFourier tails and constant depth circuits of given size. Finally, we obtain\nbounds for the number of queries required to learn the polynomial class\n$\\mathscr{F}_{n,d}$ without error in the query and random example models.\n","authors":["Alexandros Eskenazis","Paata Ivanisvili","Lauritz Streck"],"pdf_url":"https://arxiv.org/pdf/2203.09659v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11913v2","updated":"2023-11-27T17:17:39Z","published":"2023-11-20T16:44:18Z","title":"Deep Calibration of Market Simulations using Neural Density Estimators\n and Embedding Networks","summary":" The ability to construct a realistic simulator of financial exchanges,\nincluding reproducing the dynamics of the limit order book, can give insight\ninto many counterfactual scenarios, such as a flash crash, a margin call, or\nchanges in macroeconomic outlook. In recent years, agent-based models have been\ndeveloped that reproduce many features of an exchange, as summarised by a set\nof stylised facts and statistics. However, the ability to calibrate simulators\nto a specific period of trading remains an open challenge. In this work, we\ndevelop a novel approach to the calibration of market simulators by leveraging\nrecent advances in deep learning, specifically using neural density estimators\nand embedding networks. 
We demonstrate that our approach is able to correctly\nidentify high probability parameter sets, both when applied to synthetic and\nhistorical data, and without reliance on manually selected or weighted\nensembles of stylised facts.\n","authors":["Namid R. Stillman","Rory Baggott","Justin Lyon","Jianfei Zhang","Dingqiu Zhu","Tao Chen","Perukrishnen Vytelingum"],"pdf_url":"https://arxiv.org/pdf/2311.11913v2.pdf","comment":"4th ACM International Conference on AI in Finance (ICAIF 2023)"},{"id":"http://arxiv.org/abs/2211.14400v5","updated":"2023-11-27T17:13:24Z","published":"2022-11-25T23:32:26Z","title":"Optimal Approximation Rates for Deep ReLU Neural Networks on Sobolev and\n Besov Spaces","summary":" Let $\\Omega = [0,1]^d$ be the unit cube in $\\mathbb{R}^d$. We study the\nproblem of how efficiently, in terms of the number of parameters, deep neural\nnetworks with the ReLU activation function can approximate functions in the\nSobolev spaces $W^s(L_q(\\Omega))$ and Besov spaces $B^s_r(L_q(\\Omega))$, with\nerror measured in the $L_p(\\Omega)$ norm. This problem is important when\nstudying the application of neural networks in a variety of fields, including\nscientific computing and signal processing, and has previously been solved only\nwhen $p=q=\\infty$. Our contribution is to provide a complete solution for all\n$1\\leq p,q\\leq \\infty$ and $s > 0$ for which the corresponding Sobolev or Besov\nspace compactly embeds into $L_p$. The key technical tool is a novel\nbit-extraction technique which gives an optimal encoding of sparse vectors.\nThis enables us to obtain sharp upper bounds in the non-linear regime where $p\n> q$. We also provide a novel method for deriving $L_p$-approximation lower\nbounds based upon VC-dimension when $p < \\infty$. Our results show that very\ndeep ReLU networks significantly outperform classical methods of approximation\nin terms of the number of parameters, but that this comes at the cost of\nparameters which are not encodable.\n","authors":["Jonathan W. Siegel"],"pdf_url":"https://arxiv.org/pdf/2211.14400v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16008v1","updated":"2023-11-27T17:02:56Z","published":"2023-11-27T17:02:56Z","title":"Using Decentralized Aggregation for Federated Learning with Differential\n Privacy","summary":" Nowadays, the ubiquitous usage of mobile devices and networks have raised\nconcerns about the loss of control over personal data and research advance\ntowards the trade-off between privacy and utility in scenarios that combine\nexchange communications, big databases and distributed and collaborative (P2P)\nMachine Learning techniques. On the other hand, although Federated Learning\n(FL) provides some level of privacy by retaining the data at the local node,\nwhich executes a local training to enrich a global model, this scenario is\nstill susceptible to privacy breaches as membership inference attacks. To\nprovide a stronger level of privacy, this research deploys an experimental\nenvironment for FL with Differential Privacy (DP) using benchmark datasets. 
The\nobtained results show that the selection of parameters and techniques of DP is\ncentral in the aforementioned trade-off between privacy and utility by means of\na classification example.\n","authors":["Hadeel Abd El-Kareem","Abd El-Moaty Saleh","Ana Fernández-Vilas","Manuel Fernández-Veiga","asser El-Sonbaty"],"pdf_url":"https://arxiv.org/pdf/2311.16008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06627v2","updated":"2023-11-27T16:59:39Z","published":"2023-10-10T13:45:59Z","title":"What If the TV Was Off? Examining Counterfactual Reasoning Abilities of\n Multi-modal Language Models","summary":" Counterfactual reasoning, a fundamental aspect of human cognition, involves\ncontemplating alternatives to established facts or past events, significantly\nenhancing our abilities in planning and decision-making. In light of the\nadvancements in current multi-modal large language models, we explore their\neffectiveness in counterfactual reasoning. To facilitate this investigation, we\nintroduce a novel dataset, C-VQA, specifically designed to test the\ncounterfactual reasoning capabilities of modern multi-modal large language\nmodels. This dataset is constructed by infusing original questions with\ncounterfactual presuppositions, spanning various types such as numerical and\nboolean queries. It encompasses a mix of real and synthetic data, representing\na wide range of difficulty levels. Our thorough evaluations of contemporary\nvision-language models using this dataset have revealed substantial performance\ndrops, with some models showing up to a 40\\% decrease, highlighting a\nsignificant gap between current models and human-like vision reasoning\ncapabilities. We hope our dataset will serve as a vital benchmark for\nevaluating the counterfactual reasoning capabilities of models. Code and\ndataset are publicly available at https://bzhao.me/C-VQA/.\n","authors":["Letian Zhang","Xiaotong Zhai","Zhongkai Zhao","Yongshuo Zong","Xin Wen","Bingchen Zhao"],"pdf_url":"https://arxiv.org/pdf/2310.06627v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16004v1","updated":"2023-11-27T16:55:04Z","published":"2023-11-27T16:55:04Z","title":"Improved Data Generation for Enhanced Asset Allocation: A Synthetic\n Dataset Approach for the Fixed Income Universe","summary":" We present a novel process for generating synthetic datasets tailored to\nassess asset allocation methods and construct portfolios within the fixed\nincome universe. Our approach begins by enhancing the CorrGAN model to generate\nsynthetic correlation matrices. Subsequently, we propose an Encoder-Decoder\nmodel that samples additional data conditioned on a given correlation matrix.\nThe resulting synthetic dataset facilitates in-depth analyses of asset\nallocation methods across diverse asset universes. Additionally, we provide a\ncase study that exemplifies the use of the synthetic dataset to improve\nportfolios constructed within a simulation-based asset allocation process.\n","authors":["Szymon Kubiak","Tillman Weyde","Oleksandr Galkin","Dan Philps","Ram Gopal"],"pdf_url":"https://arxiv.org/pdf/2311.16004v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16003v1","updated":"2023-11-27T16:52:25Z","published":"2023-11-27T16:52:25Z","title":"Forecasting Auxiliary Energy Consumption for Electric Heavy-Duty\n Vehicles","summary":" Accurate energy consumption prediction is crucial for optimizing the\noperation of electric commercial heavy-duty vehicles, e.g., route planning for\ncharging. 
Moreover, understanding why certain predictions are cast is paramount\nfor such a predictive model to gain user trust and be deployed in practice.\nSince commercial vehicles operate differently as transportation tasks, ambient,\nand drivers vary, a heterogeneous population is expected when building an AI\nsystem for forecasting energy consumption. The dependencies between the input\nfeatures and the target values are expected to also differ across\nsub-populations. One well-known example of such a statistical phenomenon is the\nSimpson paradox. In this paper, we illustrate that such a setting poses a\nchallenge for existing XAI methods that produce global feature statistics, e.g.\nLIME or SHAP, causing them to yield misleading results. We demonstrate a\npotential solution by training multiple regression models on subsets of data.\nIt not only leads to superior regression performance but also more relevant and\nconsistent LIME explanations. Given that the employed groupings correspond to\nrelevant sub-populations, the associations between the input features and the\ntarget values are consistent within each cluster but different across clusters.\nExperiments on both synthetic and real-world datasets show that such splitting\nof a complex problem into simpler ones yields better regression performance and\ninterpretability.\n","authors":["Yuantao Fan","Zhenkan Wang","Sepideh Pashami","Slawomir Nowaczyk","Henrik Ydreskog"],"pdf_url":"https://arxiv.org/pdf/2311.16003v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08736v2","updated":"2023-11-27T16:49:49Z","published":"2023-08-17T02:18:59Z","title":"On the Effectiveness of Log Representation for Log-based Anomaly\n Detection","summary":" Logs are an essential source of information for people to understand the\nrunning status of a software system. Due to the evolving modern software\narchitecture and maintenance methods, more research efforts have been devoted\nto automated log analysis. In particular, machine learning (ML) has been widely\nused in log analysis tasks. In ML-based log analysis tasks, converting textual\nlog data into numerical feature vectors is a critical and indispensable step.\nHowever, the impact of using different log representation techniques on the\nperformance of the downstream models is not clear, which limits researchers and\npractitioners' opportunities of choosing the optimal log representation\ntechniques in their automated log analysis workflows. Therefore, this work\ninvestigates and compares the commonly adopted log representation techniques\nfrom previous log analysis research. Particularly, we select six log\nrepresentation techniques and evaluate them with seven ML models and four\npublic log datasets (i.e., HDFS, BGL, Spirit and Thunderbird) in the context of\nlog-based anomaly detection. We also examine the impacts of the log parsing\nprocess and the different feature aggregation approaches when they are employed\nwith log representation techniques. From the experiments, we provide some\nheuristic guidelines for future researchers and developers to follow when\ndesigning an automated log analysis workflow. 
We believe our comprehensive\ncomparison of log representation techniques can help researchers and\npractitioners better understand the characteristics of different log\nrepresentation techniques and provide them with guidance for selecting the most\nsuitable ones for their ML-based log analysis workflow.\n","authors":["Xingfang Wu","Heng Li","Foutse Khomh"],"pdf_url":"https://arxiv.org/pdf/2308.08736v2.pdf","comment":"Accepted by Journal of Empirical Software Engineering (EMSE)"},{"id":"http://arxiv.org/abs/2307.06255v2","updated":"2023-11-27T16:48:14Z","published":"2023-07-12T15:50:38Z","title":"Machine learning and Topological data analysis identify unique features\n of human papillae in 3D scans","summary":" The tongue surface houses a range of papillae that are integral to the\nmechanics and chemistry of taste and textural sensation. Although gustatory\nfunction of papillae is well investigated, the uniqueness of papillae within\nand across individuals remains elusive. Here, we present the first machine\nlearning framework on 3D microscopic scans of human papillae (n = 2092),\nuncovering the uniqueness of geometric and topological features of papillae.\nThe finer differences in shapes of papillae are investigated computationally\nbased on a number of features derived from discrete differential geometry and\ncomputational topology. Interpretable machine learning techniques show that\npersistent homology features of the papillae shape are the most effective in\npredicting the biological variables. Models trained on these features with\nsmall volumes of data samples predict the type of papillae with an accuracy of\n85%. The papillae type classification models can map the spatial arrangement of\nfiliform and fungiform papillae on a surface. Remarkably, the papillae are\nfound to be distinctive across individuals and an individual can be identified\nwith an accuracy of 48% among the 15 participants from a single papillae.\nCollectively, this is the first unprecedented evidence demonstrating that\ntongue papillae can serve as a unique identifier inspiring new research\ndirection for food preferences and oral diagnostics.\n","authors":["Rayna Andreeva","Anwesha Sarkar","Rik Sarkar"],"pdf_url":"https://arxiv.org/pdf/2307.06255v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16001v1","updated":"2023-11-27T16:47:09Z","published":"2023-11-27T16:47:09Z","title":"Automated Measurement of Vascular Calcification in Femoral\n Endarterectomy Patients Using Deep Learning","summary":" Atherosclerosis, a chronic inflammatory disease affecting the large arteries,\npresents a global health risk. Accurate analysis of diagnostic images, like\ncomputed tomographic angiograms (CTAs), is essential for staging and monitoring\nthe progression of atherosclerosis-related conditions, including peripheral\narterial disease (PAD). However, manual analysis of CTA images is\ntime-consuming and tedious. To address this limitation, we employed a deep\nlearning model to segment the vascular system in CTA images of PAD patients\nundergoing femoral endarterectomy surgery and to measure vascular calcification\nfrom the left renal artery to the patella. Utilizing proprietary CTA images of\n27 patients undergoing femoral endarterectomy surgery provided by Prisma Health\nMidlands, we developed a Deep Neural Network (DNN) model to first segment the\narterial system, starting from the descending aorta to the patella, and second,\nto provide a metric of arterial calcification. 
Our designed DNN achieved 83.4%\naverage Dice accuracy in segmenting arteries from aorta to patella, advancing\nthe state-of-the-art by 0.8%. Furthermore, our work is the first to present a\nrobust statistical analysis of automated calcification measurement in the lower\nextremities using deep learning, attaining a Mean Absolute Percentage Error\n(MAPE) of 9.5% and a correlation coefficient of 0.978 between automated and\nmanual calcification scores. These findings underscore the potential of deep\nlearning techniques as a rapid and accurate tool for medical professionals to\nassess calcification in the abdominal aorta and its branches above the patella.\nThe developed DNN model and related documentation in this project are available\nat GitHub page at https://github.com/pip-alireza/DeepCalcScoring.\n","authors":["Alireza Bagheri Rajeoni","Breanna Pederson","Daniel G. Clair","Susan M. Lessner","Homayoun Valafar"],"pdf_url":"https://arxiv.org/pdf/2311.16001v1.pdf","comment":"Published in MDPI Diagnostic journal, the code can be accessed via\n the GitHub link in the paper"},{"id":"http://arxiv.org/abs/2310.10541v2","updated":"2023-11-27T16:45:18Z","published":"2023-10-16T16:13:53Z","title":"AST: Effective Dataset Distillation through Alignment with Smooth and\n High-Quality Expert Trajectories","summary":" Training large AI models typically requires large-scale datasets in the\nmachine learning process, making training and parameter-tuning process both\ntime-consuming and costly. Some researchers address this problem by carefully\nsynthesizing a very small number of highly representative and informative\nsamples from real-world datasets. This approach, known as Dataset Distillation\n(DD), proposes a perspective for data-efficient learning. Despite recent\nprogress in this field, the performance of existing methods still cannot meet\nexpectations, and distilled datasets cannot effectively replace original\ndatasets. In this paper, unlike previous methods that focus solely on improving\nthe effectiveness of student distillation, we recognize and leverage the\nimportant mutual influence between expert and student models. We observed that\nthe smoothness of expert trajectories has a significant impact on subsequent\nstudent parameter alignment. Based on this, we propose an effective DD\nframework named AST, standing for Alignment with Smooth and high-quality expert\nTrajectories. We devise the integration of clipping loss and gradient penalty\nto regulate the rate of parameter changes in expert trajectory generation. To\nfurther refine the student parameter alignment with expert trajectory, we put\nforward representative initialization for the synthetic dataset and balanced\ninner-loop loss in response to the sensitivity exhibited towards randomly\ninitialized variables during distillation. We also propose two enhancement\nstrategies, namely intermediate matching loss and weight perturbation, to\nmitigate the potential occurrence of cumulative errors. We conduct extensive\nexperiments on datasets of different scales, sizes, and resolutions. 
The\nresults demonstrate that the proposed method significantly outperforms prior\nmethods.\n","authors":["Jiyuan Shen","Wenzhuo Yang","Kwok-Yan Lam"],"pdf_url":"https://arxiv.org/pdf/2310.10541v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15996v1","updated":"2023-11-27T16:44:50Z","published":"2023-11-27T16:44:50Z","title":"Closing the ODE-SDE gap in score-based diffusion models through the\n Fokker-Planck equation","summary":" Score-based diffusion models have emerged as one of the most promising\nframeworks for deep generative modelling, due to their state-of-the art\nperformance in many generation tasks while relying on mathematical foundations\nsuch as stochastic differential equations (SDEs) and ordinary differential\nequations (ODEs). Empirically, it has been reported that ODE based samples are\ninferior to SDE based samples. In this paper we rigorously describe the range\nof dynamics and approximations that arise when training score-based diffusion\nmodels, including the true SDE dynamics, the neural approximations, the various\napproximate particle dynamics that result, as well as their associated\nFokker--Planck equations and the neural network approximations of these\nFokker--Planck equations. We systematically analyse the difference between the\nODE and SDE dynamics of score-based diffusion models, and link it to an\nassociated Fokker--Planck equation. We derive a theoretical upper bound on the\nWasserstein 2-distance between the ODE- and SDE-induced distributions in terms\nof a Fokker--Planck residual. We also show numerically that conventional\nscore-based diffusion models can exhibit significant differences between ODE-\nand SDE-induced distributions which we demonstrate using explicit comparisons.\nMoreover, we show numerically that reducing the Fokker--Planck residual by\nadding it as an additional regularisation term leads to closing the gap between\nODE- and SDE-induced distributions. Our experiments suggest that this\nregularisation can improve the distribution generated by the ODE, however that\nthis can come at the cost of degraded SDE sample quality.\n","authors":["Teo Deveney","Jan Stanczuk","Lisa Maria Kreusser","Chris Budd","Carola-Bibiane Schönlieb"],"pdf_url":"https://arxiv.org/pdf/2311.15996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15995v1","updated":"2023-11-27T16:44:13Z","published":"2023-11-27T16:44:13Z","title":"Sensitivity-Based Layer Insertion for Residual and Feedforward Neural\n Networks","summary":" The training of neural networks requires tedious and often manual tuning of\nthe network architecture. We propose a systematic method to insert new layers\nduring the training process, which eliminates the need to choose a fixed\nnetwork size before training. Our technique borrows techniques from constrained\noptimization and is based on first-order sensitivity information of the\nobjective with respect to the virtual parameters that additional layers, if\ninserted, would offer. We consider fully connected feedforward networks with\nselected activation functions as well as residual neural networks. In numerical\nexperiments, the proposed sensitivity-based layer insertion technique exhibits\nimproved training decay, compared to not inserting the layer. Furthermore, the\ncomputational effort is reduced in comparison to inserting the layer from the\nbeginning. 
The code is available at\n\\url{https://github.com/LeonieKreis/layer_insertion_sensitivity_based}.\n","authors":["Evelyn Herberg","Roland Herzog","Frederik Köhne","Leonie Kreis","Anton Schiela"],"pdf_url":"https://arxiv.org/pdf/2311.15995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15990v1","updated":"2023-11-27T16:39:55Z","published":"2023-11-27T16:39:55Z","title":"Should We Learn Most Likely Functions or Parameters?","summary":" Standard regularized training procedures correspond to maximizing a posterior\ndistribution over parameters, known as maximum a posteriori (MAP) estimation.\nHowever, model parameters are of interest only insomuch as they combine with\nthe functional form of a model to provide a function that can make good\npredictions. Moreover, the most likely parameters under the parameter posterior\ndo not generally correspond to the most likely function induced by the\nparameter posterior. In fact, we can re-parametrize a model such that any\nsetting of parameters can maximize the parameter posterior. As an alternative,\nwe investigate the benefits and drawbacks of directly estimating the most\nlikely function implied by the model and the data. We show that this procedure\nleads to pathological solutions when using neural networks and prove conditions\nunder which the procedure is well-behaved, as well as a scalable approximation.\nUnder these conditions, we find that function-space MAP estimation can lead to\nflatter minima, better generalization, and improved robustness to overfitting.\n","authors":["Shikai Qiu","Tim G. J. Rudner","Sanyam Kapoor","Andrew Gordon Wilson"],"pdf_url":"https://arxiv.org/pdf/2311.15990v1.pdf","comment":"NeurIPS 2023. Code available at\n https://github.com/activatedgeek/function-space-map"},{"id":"http://arxiv.org/abs/2303.01486v4","updated":"2023-11-27T16:36:53Z","published":"2023-03-02T18:47:51Z","title":"Understanding plasticity in neural networks","summary":" Plasticity, the ability of a neural network to quickly change its predictions\nin response to new information, is essential for the adaptability and\nrobustness of deep reinforcement learning systems. Deep neural networks are\nknown to lose plasticity over the course of training even in relatively simple\nlearning problems, but the mechanisms driving this phenomenon are still poorly\nunderstood. This paper conducts a systematic empirical analysis into plasticity\nloss, with the goal of understanding the phenomenon mechanistically in order to\nguide the future development of targeted solutions. We find that loss of\nplasticity is deeply connected to changes in the curvature of the loss\nlandscape, but that it often occurs in the absence of saturated units. Based on\nthis insight, we identify a number of parameterization and optimization design\nchoices which enable networks to better preserve plasticity over the course of\ntraining. 
We validate the utility of these findings on larger-scale RL\nbenchmarks in the Arcade Learning Environment.\n","authors":["Clare Lyle","Zeyu Zheng","Evgenii Nikishin","Bernardo Avila Pires","Razvan Pascanu","Will Dabney"],"pdf_url":"https://arxiv.org/pdf/2303.01486v4.pdf","comment":"Accepted to ICML 2023 (oral presentation)"},{"id":"http://arxiv.org/abs/2311.15983v1","updated":"2023-11-27T16:28:20Z","published":"2023-11-27T16:28:20Z","title":"Sparsify-then-Classify: From Internal Neurons of Large Language Models\n To Efficient Text Classifiers","summary":" Among the many tasks that Large Language Models (LLMs) have revolutionized is\ntext classification. However, existing approaches for applying pretrained LLMs\nto text classification predominantly rely on using single token outputs from\nonly the last layer of hidden states. As a result, they suffer from limitations\nin efficiency, task-specificity, and interpretability. In our work, we\ncontribute an approach that uses all internal representations by employing\nmultiple pooling strategies on all activation and hidden states. Our novel\nlightweight strategy, Sparsify-then-Classify (STC) first sparsifies\ntask-specific features layer-by-layer, then aggregates across layers for text\nclassification. STC can be applied as a seamless plug-and-play module on top of\nexisting LLMs. Our experiments on a comprehensive set of models and datasets\ndemonstrate that STC not only consistently improves the classification\nperformance of pretrained and fine-tuned models, but is also more efficient for\nboth training and inference, and is more intrinsically interpretable.\n","authors":["Yilun Liu","Difan Jiao","Ashton Anderson"],"pdf_url":"https://arxiv.org/pdf/2311.15983v1.pdf","comment":"23 pages, 5 figures, 8 tables Code available at\n https://github.com/difanj0713/Sparsify-then-Classify"},{"id":"http://arxiv.org/abs/2311.15979v1","updated":"2023-11-27T16:25:12Z","published":"2023-11-27T16:25:12Z","title":"Soil Organic Carbon Estimation from Climate-related Features with Graph\n Neural Network","summary":" Soil organic carbon (SOC) plays a pivotal role in the global carbon cycle,\nimpacting climate dynamics and necessitating accurate estimation for\nsustainable land and agricultural management. While traditional methods of SOC\nestimation face resolution and accuracy challenges, recent technological\nsolutions harness remote sensing, machine learning, and high-resolution\nsatellite mapping. Graph Neural Networks (GNNs), especially when integrated\nwith positional encoders, can capture complex relationships between soil and\nclimate. Using the LUCAS database, this study compared four GNN operators in\nthe positional encoder framework. Results revealed that the PESAGE and\nPETransformer models outperformed others in SOC estimation, indicating their\npotential in capturing the complex relationship between SOC and climate\nfeatures. 
Our findings confirm the feasibility of applications of GNN\narchitectures in SOC prediction, establishing a framework for future\nexplorations of this topic with more advanced GNN models.\n","authors":["Weiying Zhao","Natalia Efremova"],"pdf_url":"https://arxiv.org/pdf/2311.15979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00553v3","updated":"2023-11-27T16:24:59Z","published":"2023-04-02T15:04:43Z","title":"From Isolated Islands to Pangea: Unifying Semantic Space for Human\n Action Understanding","summary":" As a vital step toward the intelligent agent, Action understanding matters\nfor intelligent agents and has attracted long-term attention. It can be formed\nas the mapping from the action physical space to the semantic space. Typically,\nresearchers built action datasets according to idiosyncratic choices to define\nclasses and push the envelope of benchmarks respectively. Thus, datasets are\nincompatible with each other like \"Isolated Islands\" due to semantic gaps and\nvarious class granularities, e.g., do housework in dataset A and wash plate in\ndataset B. We argue that a more principled semantic space is an urgent need to\nconcentrate the community efforts and enable us to use all datasets together to\npursue generalizable action learning. To this end, we design a structured\naction semantic space in view of verb taxonomy hierarchy and covering massive\nactions. By aligning the classes of previous datasets to our semantic space, we\ngather (image/video/skeleton/MoCap) datasets into a unified database in a\nunified label system, i.e., bridging ``isolated islands'' into a \"Pangea\".\nAccordingly, we propose a novel model mapping from the physical space to\nsemantic space to fully use Pangea. In extensive experiments, our new system\nshows significant superiority, especially in transfer learning. Code and data\nwill be made publicly available.\n","authors":["Yong-Lu Li","Xiaoqian Wu","Xinpeng Liu","Zehao Wang","Yiming Dou","Yikun Ji","Junyi Zhang","Yixing Li","Jingru Tan","Xudong Lu","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2304.00553v3.pdf","comment":"Project Webpage: https://mvig-rhos.com/pangea"},{"id":"http://arxiv.org/abs/2308.07037v4","updated":"2023-11-27T16:15:44Z","published":"2023-08-14T09:56:35Z","title":"Bayesian Flow Networks","summary":" This paper introduces Bayesian Flow Networks (BFNs), a new class of\ngenerative model in which the parameters of a set of independent distributions\nare modified with Bayesian inference in the light of noisy data samples, then\npassed as input to a neural network that outputs a second, interdependent\ndistribution. Starting from a simple prior and iteratively updating the two\ndistributions yields a generative procedure similar to the reverse process of\ndiffusion models; however it is conceptually simpler in that no forward process\nis required. Discrete and continuous-time loss functions are derived for\ncontinuous, discretised and discrete data, along with sample generation\nprocedures. Notably, the network inputs for discrete data lie on the\nprobability simplex, and are therefore natively differentiable, paving the way\nfor gradient-based sample guidance and few-step generation in discrete domains\nsuch as language modelling. The loss function directly optimises data\ncompression and places no restrictions on the network architecture. 
In our\nexperiments BFNs achieve competitive log-likelihoods for image modelling on\ndynamically binarized MNIST and CIFAR-10, and outperform all known discrete\ndiffusion models on the text8 character-level language modelling task.\n","authors":["Alex Graves","Rupesh Kumar Srivastava","Timothy Atkinson","Faustino Gomez"],"pdf_url":"https://arxiv.org/pdf/2308.07037v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.05400v5","updated":"2023-11-27T16:08:49Z","published":"2022-03-10T14:45:57Z","title":"Asymptotic Bounds for Smoothness Parameter Estimates in Gaussian Process\n Interpolation","summary":" It is common to model a deterministic response function, such as the output\nof a computer experiment, as a Gaussian process with a Mat\\'ern covariance\nkernel. The smoothness parameter of a Mat\\'ern kernel determines many important\nproperties of the model in the large data limit, including the rate of\nconvergence of the conditional mean to the response function. We prove that the\nmaximum likelihood estimate of the smoothness parameter cannot asymptotically\nundersmooth the truth when the data are obtained on a fixed bounded subset of\n$\\mathbb{R}^d$. That is, if the data-generating response function has Sobolev\nsmoothness $\\nu_0 > d/2$, then the smoothness parameter estimate cannot be\nasymptotically less than $\\nu_0$. The lower bound is sharp. Additionally, we\nshow that maximum likelihood estimation recovers the true smoothness for a\nclass of compactly supported self-similar functions. For cross-validation we\nprove an asymptotic lower bound $\\nu_0 - d/2$, which however is unlikely to be\nsharp. The results are based on approximation theory in Sobolev spaces and some\ngeneral theorems that restrict the set of values that the parameter estimators\ncan take.\n","authors":["Toni Karvonen"],"pdf_url":"https://arxiv.org/pdf/2203.05400v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15966v1","updated":"2023-11-27T16:07:49Z","published":"2023-11-27T16:07:49Z","title":"Towards Transfer Learning for Large-Scale Image Classification Using\n Annealing-based Quantum Boltzmann Machines","summary":" Quantum Transfer Learning (QTL) recently gained popularity as a hybrid\nquantum-classical approach for image classification tasks by efficiently\ncombining the feature extraction capabilities of large Convolutional Neural\nNetworks with the potential benefits of Quantum Machine Learning (QML).\nExisting approaches, however, only utilize gate-based Variational Quantum\nCircuits for the quantum part of these procedures. In this work we present an\napproach to employ Quantum Annealing (QA) in QTL-based image classification.\nSpecifically, we propose using annealing-based Quantum Boltzmann Machines as\npart of a hybrid quantum-classical pipeline to learn the classification of\nreal-world, large-scale data such as medical images through supervised\ntraining. We demonstrate our approach by applying it to the three-class\nCOVID-CT-MD dataset, a collection of lung Computed Tomography (CT) scan slices.\nUsing Simulated Annealing as a stand-in for actual QA, we compare our method to\nclassical transfer learning, using a neural network of the same order of\nmagnitude, to display its improved classification performance. 
We find that our\napproach consistently outperforms its classical baseline in terms of test\naccuracy and AUC-ROC-Score and needs less training epochs to do this.\n","authors":["Daniëlle Schuman","Leo Sünkel","Philipp Altmann","Jonas Stein","Christoph Roch","Thomas Gabor","Claudia Linnhoff-Popien"],"pdf_url":"https://arxiv.org/pdf/2311.15966v1.pdf","comment":"7 pages, 3 figures (5 if counting subfigures), 1 table. To be\n published in the proceedings of the 2023 IEEE International Conference on\n Quantum Computing and Engineering (QCE)"},{"id":"http://arxiv.org/abs/2311.15964v1","updated":"2023-11-27T16:07:37Z","published":"2023-11-27T16:07:37Z","title":"Efficient Pre-training for Localized Instruction Generation of Videos","summary":" Procedural videos show step-by-step demonstrations of tasks like recipe\npreparation. Understanding such videos is challenging, involving the precise\nlocalization of steps and the generation of textual instructions. Manually\nannotating steps and writing instructions is costly, which limits the size of\ncurrent datasets and hinders effective learning. Leveraging large but noisy\nvideo-transcript datasets for pre-training can boost performance, but demands\nsignificant computational resources. Furthermore, transcripts contain\nirrelevant content and exhibit style variation compared to instructions written\nby human annotators. To mitigate both issues, we propose a technique,\nSieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters\nirrelevant transcripts and (ii) Swap enhances the quality of the text\ninstruction by automatically replacing the transcripts with human-written\ninstructions from a text-only recipe dataset. The curated dataset, three orders\nof magnitude smaller than current web-scale datasets, enables efficient\ntraining of large-scale models with competitive performance. We complement our\nSieve-\\&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step\nlocalization and instruction generation for procedural videos. When this model\nis pre-trained on our curated dataset, it achieves state-of-the-art performance\nin zero-shot and finetuning settings on YouCook2 and Tasty, while using a\nfraction of the computational resources.\n","authors":["Anil Batra","Davide Moltisanti","Laura Sevilla-Lara","Marcus Rohrbach","Frank Keller"],"pdf_url":"https://arxiv.org/pdf/2311.15964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15961v1","updated":"2023-11-27T16:06:48Z","published":"2023-11-27T16:06:48Z","title":"Maximum Likelihood Estimation is All You Need for Well-Specified\n Covariate Shift","summary":" A key challenge of modern machine learning systems is to achieve\nOut-of-Distribution (OOD) generalization -- generalizing to target data whose\ndistribution differs from that of source data. Despite its significant\nimportance, the fundamental question of ``what are the most effective\nalgorithms for OOD generalization'' remains open even under the standard\nsetting of covariate shift. This paper addresses this fundamental question by\nproving that, surprisingly, classical Maximum Likelihood Estimation (MLE)\npurely using source data (without any modification) achieves the minimax\noptimality for covariate shift under the well-specified setting. That is, no\nalgorithm performs better than MLE in this setting (up to a constant factor),\njustifying MLE is all you need. Our result holds for a very rich class of\nparametric models, and does not require any boundedness condition on the\ndensity ratio. 
We illustrate the wide applicability of our framework by\ninstantiating it to three concrete examples -- linear regression, logistic\nregression, and phase retrieval. This paper further complement the study by\nproving that, under the misspecified setting, MLE is no longer the optimal\nchoice, whereas Maximum Weighted Likelihood Estimator (MWLE) emerges as minimax\noptimal in certain scenarios.\n","authors":["Jiawei Ge","Shange Tang","Jianqing Fan","Cong Ma","Chi Jin"],"pdf_url":"https://arxiv.org/pdf/2311.15961v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15960v1","updated":"2023-11-27T16:06:39Z","published":"2023-11-27T16:06:39Z","title":"Addressing Long-Horizon Tasks by Integrating Program Synthesis and State\n Machines","summary":" Deep reinforcement learning excels in various domains but lacks\ngeneralizability and interoperability. Programmatic RL methods (Trivedi et al.,\n2021; Liu et al., 2023) reformulate solving RL tasks as synthesizing\ninterpretable programs that can be executed in the environments. Despite\nencouraging results, these methods are limited to short-horizon tasks. On the\nother hand, representing RL policies using state machines (Inala et al., 2020)\ncan inductively generalize to long-horizon tasks; however, it struggles to\nscale up to acquire diverse and complex behaviors. This work proposes Program\nMachine Policies (POMPs), which bridge the advantages of programmatic RL and\nstate machine policies, allowing for the representation of complex behaviors\nand the address of long-term tasks. Specifically, we introduce a method that\ncan retrieve a set of effective, diverse, compatible programs. Then, we use\nthese programs as modes of a state machine and learn a transition function to\ntransition among mode programs, allowing for capturing long-horizon repetitive\nbehaviors. Our proposed framework outperforms programmatic RL and deep RL\nbaselines on various tasks and demonstrates the ability to generalize to even\nlonger horizons without any fine-tuning inductively. Ablation studies justify\nthe effectiveness of our proposed search algorithm for retrieving a set of\nprograms as modes.\n","authors":["Yu-An Lin","Chen-Tao Lee","Guan-Ting Liu","Pu-Jen Cheng","Shao-Hua Sun"],"pdf_url":"https://arxiv.org/pdf/2311.15960v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.09347v3","updated":"2023-11-27T15:59:55Z","published":"2022-03-17T14:26:28Z","title":"Dimensionality Reduction and Wasserstein Stability for Kernel Regression","summary":" In a high-dimensional regression framework, we study consequences of the\nnaive two-step procedure where first the dimension of the input variables is\nreduced and second, the reduced input variables are used to predict the output\nvariable with kernel regression. In order to analyze the resulting regression\nerrors, a novel stability result for kernel regression with respect to the\nWasserstein distance is derived. This allows us to bound errors that occur when\nperturbed input data is used to fit the regression function. We apply the\ngeneral stability result to principal component analysis (PCA). Exploiting\nknown estimates from the literature on both principal component analysis and\nkernel regression, we deduce convergence rates for the two-step procedure. 
The\nlatter turns out to be particularly useful in a semi-supervised setting.\n","authors":["Stephan Eckstein","Armin Iske","Mathias Trabs"],"pdf_url":"https://arxiv.org/pdf/2203.09347v3.pdf","comment":"Forthcoming in JMLR"},{"id":"http://arxiv.org/abs/2311.10093v2","updated":"2023-11-27T15:58:30Z","published":"2023-11-16T18:59:51Z","title":"The Chosen One: Consistent Characters in Text-to-Image Diffusion Models","summary":" Recent advances in text-to-image generation models have unlocked vast\npotential for visual creativity. However, these models struggle with generation\nof consistent characters, a crucial aspect for numerous real-world applications\nsuch as story visualization, game development asset design, advertising, and\nmore. Current methods typically rely on multiple pre-existing images of the\ntarget character or involve labor-intensive manual processes. In this work, we\npropose a fully automated solution for consistent character generation, with\nthe sole input being a text prompt. We introduce an iterative procedure that,\nat each stage, identifies a coherent set of images sharing a similar identity\nand extracts a more consistent identity from this set. Our quantitative\nanalysis demonstrates that our method strikes a better balance between prompt\nalignment and identity consistency compared to the baseline methods, and these\nfindings are reinforced by a user study. To conclude, we showcase several\npractical applications of our approach. Project page is available at\nhttps://omriavrahami.com/the-chosen-one\n","authors":["Omri Avrahami","Amir Hertz","Yael Vinker","Moab Arar","Shlomi Fruchter","Ohad Fried","Daniel Cohen-Or","Dani Lischinski"],"pdf_url":"https://arxiv.org/pdf/2311.10093v2.pdf","comment":"Project page is available at https://omriavrahami.com/the-chosen-one"},{"id":"http://arxiv.org/abs/2311.15951v1","updated":"2023-11-27T15:57:11Z","published":"2023-11-27T15:57:11Z","title":"Replay across Experiments: A Natural Extension of Off-Policy RL","summary":" Replaying data is a principal mechanism underlying the stability and data\nefficiency of off-policy reinforcement learning (RL). We present an effective\nyet simple framework to extend the use of replays across multiple experiments,\nminimally adapting the RL workflow for sizeable improvements in controller\nperformance and research iteration times. At its core, Replay Across\nExperiments (RaE) involves reusing experience from previous experiments to\nimprove exploration and bootstrap learning while reducing required changes to a\nminimum in comparison to prior work. We empirically show benefits across a\nnumber of RL algorithms and challenging control domains spanning both\nlocomotion and manipulation, including hard exploration tasks from egocentric\nvision. 
Through comprehensive ablations, we demonstrate robustness to the\nquality and amount of data available and various hyperparameter choices.\nFinally, we discuss how our approach can be applied more broadly across\nresearch life cycles and can increase resilience by reloading data across\nrandom seeds or hyperparameter variations.\n","authors":["Dhruva Tirumala","Thomas Lampe","Jose Enrique Chen","Tuomas Haarnoja","Sandy Huang","Guy Lever","Ben Moran","Tim Hertweck","Leonard Hasenclever","Martin Riedmiller","Nicolas Heess","Markus Wulfmeier"],"pdf_url":"https://arxiv.org/pdf/2311.15951v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00577v2","updated":"2023-11-27T15:57:06Z","published":"2023-06-01T11:45:45Z","title":"TorchRL: A data-driven decision-making library for PyTorch","summary":" PyTorch has ascended as a premier machine learning framework, yet it lacks a\nnative and comprehensive library for decision and control tasks suitable for\nlarge development teams dealing with complex real-world data and environments.\nTo address this issue, we propose TorchRL, a generalistic control library for\nPyTorch that provides well-integrated, yet standalone components. We introduce\na new and flexible PyTorch primitive, the TensorDict, which facilitates\nstreamlined algorithm development across the many branches of Reinforcement\nLearning (RL) and control. We provide a detailed description of the building\nblocks and an extensive overview of the library across domains and tasks.\nFinally, we experimentally demonstrate its reliability and flexibility and show\ncomparative benchmarks to demonstrate its computational efficiency. TorchRL\nfosters long-term support and is publicly available on GitHub for greater\nreproducibility and collaboration within the research community. The code is\nopen-sourced on GitHub.\n","authors":["Albert Bou","Matteo Bettini","Sebastian Dittert","Vikash Kumar","Shagun Sodhani","Xiaomeng Yang","Gianni De Fabritiis","Vincent Moens"],"pdf_url":"https://arxiv.org/pdf/2306.00577v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15947v1","updated":"2023-11-27T15:54:20Z","published":"2023-11-27T15:54:20Z","title":"GloNets: Globally Connected Neural Networks","summary":" Deep learning architectures suffer from depth-related performance\ndegradation, limiting the effective depth of neural networks. Approaches like\nResNet are able to mitigate this, but they do not completely eliminate the\nproblem. We introduce Globally Connected Neural Networks (GloNet), a novel\narchitecture overcoming depth-related issues, designed to be superimposed on\nany model, enhancing its depth without increasing complexity or reducing\nperformance. With GloNet, the network's head uniformly receives information\nfrom all parts of the network, regardless of their level of abstraction. This\nenables GloNet to self-regulate information flow during training, reducing the\ninfluence of less effective deeper layers, and allowing for stable training\nirrespective of network depth. This paper details GloNet's design, its\ntheoretical basis, and a comparison with existing similar architectures.\nExperiments show GloNet's self-regulation ability and resilience to\ndepth-related learning challenges, like performance degradation. 
Our findings\nsuggest GloNet as a strong alternative to traditional architectures like\nResNets.\n","authors":["Antonio Di Cecco","Carlo Metta","Marco Fantozzi","Francesco Morandin","Maurizio Parton"],"pdf_url":"https://arxiv.org/pdf/2311.15947v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15945v1","updated":"2023-11-27T15:51:07Z","published":"2023-11-27T15:51:07Z","title":"Over-Squashing in Riemannian Graph Neural Networks","summary":" Most graph neural networks (GNNs) are prone to the phenomenon of\nover-squashing in which node features become insensitive to information from\ndistant nodes in the graph. Recent works have shown that the topology of the\ngraph has the greatest impact on over-squashing, suggesting graph rewiring\napproaches as a suitable solution. In this work, we explore whether\nover-squashing can be mitigated through the embedding space of the GNN. In\nparticular, we consider the generalization of Hyperbolic GNNs (HGNNs) to\nRiemannian manifolds of variable curvature in which the geometry of the\nembedding space is faithful to the graph's topology. We derive bounds on the\nsensitivity of the node features in these Riemannian GNNs as the number of\nlayers increases, which yield promising theoretical and empirical results for\nalleviating over-squashing in graphs with negative curvature.\n","authors":["Julia Balla"],"pdf_url":"https://arxiv.org/pdf/2311.15945v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15940v1","updated":"2023-11-27T15:47:33Z","published":"2023-11-27T15:47:33Z","title":"Physics-informed neural networks for transformed geometries and\n manifolds","summary":" Physics-informed neural networks (PINNs) effectively embed physical\nprinciples into machine learning, but often struggle with complex or\nalternating geometries. We propose a novel method for integrating geometric\ntransformations within PINNs to robustly accommodate geometric variations. Our\nmethod incorporates a diffeomorphism as a mapping of a reference domain and\nadapts the derivative computation of the physics-informed loss function. This\ngeneralizes the applicability of PINNs not only to smoothly deformed domains,\nbut also to lower-dimensional manifolds and allows for direct shape\noptimization while training the network. We demonstrate the effectivity of our\napproach on several problems: (i) Eikonal equation on Archimedean spiral, (ii)\nPoisson problem on surface manifold, (iii) Incompressible Stokes flow in\ndeformed tube, and (iv) Shape optimization with Laplace operator. Through these\nexamples, we demonstrate the enhanced flexibility over traditional PINNs,\nespecially under geometric variations. The proposed framework presents an\noutlook for training deep neural operators over parametrized geometries, paving\nthe way for advanced modeling with PDEs on complex geometries in science and\nengineering.\n","authors":["Samuel Burbulla"],"pdf_url":"https://arxiv.org/pdf/2311.15940v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15936v1","updated":"2023-11-27T15:45:02Z","published":"2023-11-27T15:45:02Z","title":"Towards Responsible Governance of Biological Design Tools","summary":" Recent advancements in generative machine learning have enabled rapid\nprogress in biological design tools (BDTs) such as protein structure and\nsequence prediction models. The unprecedented predictive accuracy and novel\ndesign capabilities of BDTs present new and significant dual-use risks. 
For\nexample, their predictive accuracy allows biological agents, whether vaccines\nor pathogens, to be developed more quickly, while the design capabilities could\nbe used to discover drugs or evade DNA screening techniques. Similar to other\ndual-use AI systems, BDTs present a wicked problem: how can regulators uphold\npublic safety without stifling innovation? We highlight how current regulatory\nproposals that are primarily tailored toward large language models may be less\neffective for BDTs, which require fewer computational resources to train and\nare often developed in an open-source manner. We propose a range of measures to\nmitigate the risk that BDTs are misused, across the areas of responsible\ndevelopment, risk assessment, transparency, access management, cybersecurity,\nand investing in resilience. Implementing such measures will require close\ncoordination between developers and governments.\n","authors":["Richard Moulange","Max Langenkamp","Tessa Alexanian","Samuel Curtis","Morgan Livingston"],"pdf_url":"https://arxiv.org/pdf/2311.15936v1.pdf","comment":"10 pages + references, 1 figure, accepted at NeurIPS 2023 Regulatable\n ML as oral presentation"},{"id":"http://arxiv.org/abs/2307.06431v2","updated":"2023-11-27T15:38:32Z","published":"2023-07-12T19:51:49Z","title":"Energy Discrepancies: A Score-Independent Loss for Energy-Based Models","summary":" Energy-based models are a simple yet powerful class of probabilistic models,\nbut their widespread adoption has been limited by the computational burden of\ntraining them. We propose a novel loss function called Energy Discrepancy (ED)\nwhich does not rely on the computation of scores or expensive Markov chain\nMonte Carlo. We show that ED approaches the explicit score matching and\nnegative log-likelihood loss under different limits, effectively interpolating\nbetween both. Consequently, minimum ED estimation overcomes the problem of\nnearsightedness encountered in score-based estimation methods, while also\nenjoying theoretical guarantees. Through numerical experiments, we demonstrate\nthat ED learns low-dimensional data distributions faster and more accurately\nthan explicit score matching or contrastive divergence. For high-dimensional\nimage data, we describe how the manifold hypothesis puts limitations on our\napproach and demonstrate the effectiveness of energy discrepancy by training\nthe energy-based model as a prior of a variational decoder model.\n","authors":["Tobias Schröder","Zijing Ou","Jen Ning Lim","Yingzhen Li","Sebastian J. Vollmer","Andrew B. Duncan"],"pdf_url":"https://arxiv.org/pdf/2307.06431v2.pdf","comment":"Camera Ready version for the 37th Conference on Neural Information\n Processing Systems (NeurIPS 2023). Changes in this revision: Appendix A1:\n Corrected proof of Theorem 1. Appendix D3: Added definition and numerical\n experiments for energy discrepancy on binary discrete spaces. Minor changes\n in the main text and correction of typos. Added new references"},{"id":"http://arxiv.org/abs/2311.15925v1","updated":"2023-11-27T15:37:05Z","published":"2023-11-27T15:37:05Z","title":"Reinforcement Learning for Wildfire Mitigation in Simulated Disaster\n Environments","summary":" Climate change has resulted in a year over year increase in adverse weather\nand weather conditions which contribute to increasingly severe fire seasons.\nWithout effective mitigation, these fires pose a threat to life, property,\necology, cultural heritage, and critical infrastructure. 
To better prepare for\nand react to the increasing threat of wildfires, more accurate fire modelers\nand mitigation responses are necessary. In this paper, we introduce SimFire, a\nversatile wildland fire projection simulator designed to generate realistic\nwildfire scenarios, and SimHarness, a modular agent-based machine learning\nwrapper capable of automatically generating land management strategies within\nSimFire to reduce the overall damage to the area. Together, this publicly\navailable system allows researchers and practitioners the ability to emulate\nand assess the effectiveness of firefighter interventions and formulate\nstrategic plans that prioritize value preservation and resource allocation\noptimization. The repositories are available for download at\nhttps://github.com/mitrefireline.\n","authors":["Alexander Tapley","Marissa Dotter","Michael Doyle","Aidan Fennelly","Dhanuj Gandikota","Savanna Smith","Michael Threet","Tim Welsh"],"pdf_url":"https://arxiv.org/pdf/2311.15925v1.pdf","comment":"12 pages, 4 figures including Appendices (A, B). Accepted as a paper\n in the Proposals track at the \"Tackling Climate Change with Machine Learning\"\n workshop at NeurIPS 2023. MITRE Public Release Case Number 23-3920"},{"id":"http://arxiv.org/abs/2311.15924v1","updated":"2023-11-27T15:34:40Z","published":"2023-11-27T15:34:40Z","title":"Diagnosis driven Anomaly Detection for CPS","summary":" In Cyber-Physical Systems (CPS) research, anomaly detection (detecting\nabnormal behavior) and diagnosis (identifying the underlying root cause) are\noften treated as distinct, isolated tasks. However, diagnosis algorithms\nrequire symptoms, i.e. temporally and spatially isolated anomalies, as input.\nThus, anomaly detection and diagnosis must be developed together to provide a\nholistic solution for diagnosis in CPS. We therefore propose a method for\nutilizing deep learning-based anomaly detection to generate inputs for\nConsistency-Based Diagnosis (CBD). We evaluate our approach on a simulated and\na real-world CPS dataset, where our model demonstrates strong performance\nrelative to other state-of-the-art models.\n","authors":["Henrik S. Steude","Lukas Moddemann","Alexander Diedrich","Jonas Ehrhardt","Oliver Niggemann"],"pdf_url":"https://arxiv.org/pdf/2311.15924v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07590v2","updated":"2023-11-27T15:17:49Z","published":"2023-11-09T17:12:44Z","title":"Technical Report: Large Language Models can Strategically Deceive their\n Users when Put Under Pressure","summary":" We demonstrate a situation in which Large Language Models, trained to be\nhelpful, harmless, and honest, can display misaligned behavior and\nstrategically deceive their users about this behavior without being instructed\nto do so. Concretely, we deploy GPT-4 as an agent in a realistic, simulated\nenvironment, where it assumes the role of an autonomous stock trading agent.\nWithin this environment, the model obtains an insider tip about a lucrative\nstock trade and acts upon it despite knowing that insider trading is\ndisapproved of by company management. When reporting to its manager, the model\nconsistently hides the genuine reasons behind its trading decision. 
We perform\na brief investigation of how this behavior varies under changes to the setting,\nsuch as removing model access to a reasoning scratchpad, attempting to prevent\nthe misaligned behavior by changing system instructions, changing the amount of\npressure the model is under, varying the perceived risk of getting caught, and\nmaking other simple changes to the environment. To our knowledge, this is the\nfirst demonstration of Large Language Models trained to be helpful, harmless,\nand honest, strategically deceiving their users in a realistic situation\nwithout direct instructions or training for deception.\n","authors":["Jérémy Scheurer","Mikita Balesni","Marius Hobbhahn"],"pdf_url":"https://arxiv.org/pdf/2311.07590v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15906v1","updated":"2023-11-27T15:13:02Z","published":"2023-11-27T15:13:02Z","title":"MetaDefa: Meta-learning based on Domain Enhancement and Feature\n Alignment for Single Domain Generalization","summary":" The single domain generalization(SDG) based on meta-learning has emerged as\nan effective technique for solving the domain-shift problem. However, the\ninadequate match of data distribution between source and augmented domains and\ndifficult separation of domain-invariant features from domain-related features\nmake SDG model hard to achieve great generalization. Therefore, a novel\nmeta-learning method based on domain enhancement and feature alignment\n(MetaDefa) is proposed to improve the model generalization performance. First,\nthe background substitution and visual corruptions techniques are used to\ngenerate diverse and effective augmented domains. Then, the multi-channel\nfeature alignment module based on class activation maps and class agnostic\nactivation maps is designed to effectively extract adequate transferability\nknowledge. In this module, domain-invariant features can be fully explored by\nfocusing on similar target regions between source and augmented domains feature\nspace and suppressing the feature representation of non-similar target regions.\nExtensive experiments on two publicly available datasets show that MetaDefa has\nsignificant generalization performance advantages in unknown multiple target\ndomains.\n","authors":["Can Sun","Hao Zheng","Zhigang Hu","Liu Yang","Meiguang Zheng","Bo Xu"],"pdf_url":"https://arxiv.org/pdf/2311.15906v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2111.08239v2","updated":"2023-11-27T15:10:18Z","published":"2021-11-16T05:49:56Z","title":"Assessing Deep Neural Networks as Probability Estimators","summary":" Deep Neural Networks (DNNs) have performed admirably in classification tasks.\nHowever, the characterization of their classification uncertainties, required\nfor certain applications, has been lacking. In this work, we investigate the\nissue by assessing DNNs' ability to estimate conditional probabilities and\npropose a framework for systematic uncertainty characterization. Denoting the\ninput sample as x and the category as y, the classification task of assigning a\ncategory y to a given input x can be reduced to the task of estimating the\nconditional probabilities p(y|x), as approximated by the DNN at its last layer\nusing the softmax function. Since softmax yields a vector whose elements all\nfall in the interval (0, 1) and sum to 1, it suggests a probabilistic\ninterpretation to the DNN's outcome. 
Using synthetic and real-world datasets,\nwe look into the impact of various factors, e.g., probability density f(x) and\ninter-categorical sparsity, on the precision of DNNs' estimations of p(y|x),\nand find that the likelihood probability density and the inter-categorical\nsparsity have greater impacts than the prior probability to DNNs'\nclassification uncertainty.\n","authors":["Yu Pan","Kwo-Sen Kuo","Michael L. Rilee","Hongfeng Yu"],"pdf_url":"https://arxiv.org/pdf/2111.08239v2.pdf","comment":"Y. Pan, K. Kuo, M. Rilee and H. Yu, \"Assessing Deep Neural Networks\n as Probability Estimators,\" in 2021 IEEE International Conference on Big Data\n (Big Data), Orlando, FL, USA, 2021 pp. 1083-1091. doi:\n 10.1109/BigData52589.2021.9671328"},{"id":"http://arxiv.org/abs/2311.15890v1","updated":"2023-11-27T14:56:47Z","published":"2023-11-27T14:56:47Z","title":"Stability-Informed Initialization of Neural Ordinary Differential\n Equations","summary":" This paper addresses the training of Neural Ordinary Differential Equations\n(neural ODEs), and in particular explores the interplay between numerical\nintegration techniques, stability regions, step size, and initialization\ntechniques. It is shown how the choice of integration technique implicitly\nregularizes the learned model, and how the solver's corresponding stability\nregion affects training and prediction performance. From this analysis, a\nstability-informed parameter initialization technique is introduced. The\neffectiveness of the initialization method is displayed across several learning\nbenchmarks and industrial applications.\n","authors":["Theodor Westny","Arman Mohammadi","Daniel Jung","Erik Frisk"],"pdf_url":"https://arxiv.org/pdf/2311.15890v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15887v1","updated":"2023-11-27T14:55:16Z","published":"2023-11-27T14:55:16Z","title":"FLASC: A Flare-Sensitive Clustering Algorithm: Extending HDBSCAN* for\n Detecting Branches in Clusters","summary":" We present FLASC, an algorithm for flare-sensitive clustering. Our algorithm\nbuilds upon HDBSCAN* -- which provides high-quality density-based clustering\nperformance -- through a post-processing step that differentiates branches\nwithin the detected clusters' manifold, adding a type of pattern that can be\ndiscovered. Two variants of the algorithm are presented, which trade\ncomputational cost for noise robustness. We show that both variants scale\nsimilarly to HDBSCAN* in terms of computational cost and provide stable outputs\nusing synthetic data sets, resulting in an efficient flare-sensitive clustering\nalgorithm. In addition, we demonstrate the algorithm's benefit in data\nexploration over HDBSCAN* clustering on two real-world data sets.\n","authors":["D. M. Bot","J. Peeters","J. Liesenborgs","J. Aerts"],"pdf_url":"https://arxiv.org/pdf/2311.15887v1.pdf","comment":"20 pages, 11 figures, submitted to ACM TKDD"},{"id":"http://arxiv.org/abs/2311.15876v1","updated":"2023-11-27T14:49:06Z","published":"2023-11-27T14:49:06Z","title":"RO-LLaMA: Generalist LLM for Radiation Oncology via Noise Augmentation\n and Consistency Regularization","summary":" Recent advancements in Artificial Intelligence (AI) have profoundly\ninfluenced medical fields, by providing tools to reduce clinical workloads.\nHowever, most AI models are constrained to execute uni-modal tasks, in stark\ncontrast to the comprehensive approaches utilized by medical professionals. 
To\naddress this, here we present RO-LLaMA, a versatile generalist large language\nmodel (LLM) tailored for the field of radiation oncology. This model seamlessly\ncovers a wide range of the workflow of radiation oncologists, adept at various\ntasks such as clinical report summarization, radiation therapy plan suggestion,\nand plan-guided therapy target volume segmentation. In particular, to maximize\nthe end-to-end performance, we further present a novel Consistency Embedding\nFine-Tuning (CEFTune) technique, which boosts LLM's robustness to additional\nerrors at the intermediates while preserving the capability of handling clean\ninputs, and creatively transform this concept into LLM-driven segmentation\nframework as Consistency Embedding Segmentation (CESEG). Experimental results\non multi-centre cohort sets demonstrate our proposed RO-LLaMA's promising\nperformance for diverse tasks with generalization capabilities.\n","authors":["Kwanyoung Kim","Yujin Oh","Sangjoon Park","Hwa Kyung Byun","Jin Sung Kim","Yong Bae Kim","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.15876v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15875v1","updated":"2023-11-27T14:48:37Z","published":"2023-11-27T14:48:37Z","title":"Nodal Hydraulic Head Estimation through Unscented Kalman Filter for\n Data-driven Leak Localization in Water Networks","summary":" In this paper, we present a nodal hydraulic head estimation methodology for\nwater distribution networks (WDN) based on an Unscented Kalman Filter (UKF)\nscheme with application to leak localization. The UKF refines an initial\nestimation of the hydraulic state by considering the prediction model, as well\nas available pressure and demand measurements. To this end, it provides\ncustomized prediction and data assimilation steps. Additionally, the method is\nenhanced by dynamically updating the prediction function weight matrices.\nPerformance testing on the Modena benchmark under realistic conditions\ndemonstrates the method's effectiveness in enhancing state estimation and\ndata-driven leak localization.\n","authors":["Luis Romero-Ben","Paul Irofti","Florin Stoican","Vicenç Puig"],"pdf_url":"https://arxiv.org/pdf/2311.15875v1.pdf","comment":"This work has been submitted to IFAC for possible publication. It has\n 6 pages and 3 figures"},{"id":"http://arxiv.org/abs/2306.00349v2","updated":"2023-11-27T14:42:52Z","published":"2023-06-01T05:06:56Z","title":"CALICO: Self-Supervised Camera-LiDAR Contrastive Pre-training for BEV\n Perception","summary":" Perception is crucial in the realm of autonomous driving systems, where\nbird's eye view (BEV)-based architectures have recently reached\nstate-of-the-art performance. The desirability of self-supervised\nrepresentation learning stems from the expensive and laborious process of\nannotating 2D and 3D data. Although previous research has investigated\npretraining methods for both LiDAR and camera-based 3D object detection, a\nunified pretraining framework for multimodal BEV perception is missing. In this\nstudy, we introduce CALICO, a novel framework that applies contrastive\nobjectives to both LiDAR and camera backbones. Specifically, CALICO\nincorporates two stages: point-region contrast (PRC) and region-aware\ndistillation (RAD). PRC better balances the region- and scene-level\nrepresentation learning on the LiDAR modality and offers significant\nperformance improvement compared to existing methods. RAD effectively achieves\ncontrastive distillation on our self-trained teacher model. 
CALICO's efficacy\nis substantiated by extensive evaluations on 3D object detection and BEV map\nsegmentation tasks, where it delivers significant performance improvements.\nNotably, CALICO outperforms the baseline method by 10.5% and 8.6% on NDS and\nmAP. Moreover, CALICO boosts the robustness of multimodal 3D object detection\nagainst adversarial attacks and corruption. Additionally, our framework can be\ntailored to different backbones and heads, positioning it as a promising\napproach for multimodal BEV perception.\n","authors":["Jiachen Sun","Haizhong Zheng","Qingzhao Zhang","Atul Prakash","Z. Morley Mao","Chaowei Xiao"],"pdf_url":"https://arxiv.org/pdf/2306.00349v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15176v2","updated":"2023-11-27T14:35:05Z","published":"2023-07-27T20:11:07Z","title":"RCT Rejection Sampling for Causal Estimation Evaluation","summary":" Confounding is a significant obstacle to unbiased estimation of causal\neffects from observational data. For settings with high-dimensional covariates\n-- such as text data, genomics, or the behavioral social sciences --\nresearchers have proposed methods to adjust for confounding by adapting machine\nlearning methods to the goal of causal estimation. However, empirical\nevaluation of these adjustment methods has been challenging and limited. In\nthis work, we build on a promising empirical evaluation strategy that\nsimplifies evaluation design and uses real data: subsampling randomized\ncontrolled trials (RCTs) to create confounded observational datasets while\nusing the average causal effects from the RCTs as ground-truth. We contribute a\nnew sampling algorithm, which we call RCT rejection sampling, and provide\ntheoretical guarantees that causal identification holds in the observational\ndata to allow for valid comparisons to the ground-truth RCT. Using synthetic\ndata, we show our algorithm indeed results in low bias when oracle estimators\nare evaluated on the confounded samples, which is not always the case for a\npreviously proposed algorithm. In addition to this identification result, we\nhighlight several finite data considerations for evaluation designers who plan\nto use RCT rejection sampling on their own datasets. As a proof of concept, we\nimplement an example evaluation pipeline and walk through these finite data\nconsiderations with a novel, real-world RCT -- which we release publicly --\nconsisting of approximately 70k observations and text data as high-dimensional\ncovariates. Together, these contributions build towards a broader agenda of\nimproved empirical evaluation for causal estimation.\n","authors":["Katherine A. Keith","Sergey Feldman","David Jurgens","Jonathan Bragg","Rohit Bhattacharya"],"pdf_url":"https://arxiv.org/pdf/2307.15176v2.pdf","comment":"Code and data at https://github.com/kakeith/rct_rejection_sampling"},{"id":"http://arxiv.org/abs/2311.15865v1","updated":"2023-11-27T14:33:21Z","published":"2023-11-27T14:33:21Z","title":"A precise symbolic emulator of the linear matter power spectrum","summary":" Computing the matter power spectrum, $P(k)$, as a function of cosmological\nparameters can be prohibitively slow in cosmological analyses, hence emulating\nthis calculation is desirable. Previous analytic approximations are\ninsufficiently accurate for modern applications, so black-box, uninterpretable\nemulators are often used. 
We utilise an efficient genetic programming based\nsymbolic regression framework to explore the space of potential mathematical\nexpressions which can approximate the power spectrum and $\\sigma_8$. We learn\nthe ratio between an existing low-accuracy fitting function for $P(k)$ and that\nobtained by solving the Boltzmann equations and thus still incorporate the\nphysics which motivated this earlier approximation. We obtain an analytic\napproximation to the linear power spectrum with a root mean squared fractional\nerror of 0.2% between $k = 9\\times10^{-3} - 9 \\, h{\\rm \\, Mpc^{-1}}$ and across\na wide range of cosmological parameters, and we provide physical\ninterpretations for various terms in the expression. We also provide a simple\nanalytic approximation for $\\sigma_8$ with a similar accuracy, with a root mean\nsquared fractional error of just 0.4% when evaluated across the same range of\ncosmologies. This function is easily invertible to obtain $A_{\\rm s}$ as a\nfunction of $\\sigma_8$ and the other cosmological parameters, if preferred. It\nis possible to obtain symbolic approximations to a seemingly complex function\nat a precision required for current and future cosmological analyses without\nresorting to deep-learning techniques, thus avoiding their black-box nature and\nlarge number of parameters. Our emulator will be usable long after the codes on\nwhich numerical approximations are built become outdated.\n","authors":["Deaglan J. Bartlett","Lukas Kammerer","Gabriel Kronberger","Harry Desmond","Pedro G. Ferreira","Benjamin D. Wandelt","Bogdan Burlacu","David Alonso","Matteo Zennaro"],"pdf_url":"https://arxiv.org/pdf/2311.15865v1.pdf","comment":"9 pages, 5 figures. Submitted to A&A"},{"id":"http://arxiv.org/abs/2311.15858v1","updated":"2023-11-27T14:25:40Z","published":"2023-11-27T14:25:40Z","title":"Multi-Agent Reinforcement Learning for Power Control in Wireless\n Networks via Adaptive Graphs","summary":" The ever-increasing demand for high-quality and heterogeneous wireless\ncommunication services has driven extensive research on dynamic optimization\nstrategies in wireless networks. Among several possible approaches, multi-agent\ndeep reinforcement learning (MADRL) has emerged as a promising method to\naddress a wide range of complex optimization problems like power control.\nHowever, the seamless application of MADRL to a variety of network optimization\nproblems faces several challenges related to convergence. In this paper, we\npresent the use of graphs as communication-inducing structures among\ndistributed agents as an effective means to mitigate these challenges.\nSpecifically, we harness graph neural networks (GNNs) as neural architectures\nfor policy parameterization to introduce a relational inductive bias in the\ncollective decision-making process. Most importantly, we focus on modeling the\ndynamic interactions among sets of neighboring agents through the introduction\nof innovative methods for defining a graph-induced framework for integrated\ncommunication and learning. Finally, the superior generalization capabilities\nof the proposed methodology to larger networks and to networks with different\nuser categories is verified through simulations.\n","authors":["Lorenzo Mario Amorosa","Marco Skocaj","Roberto Verdone","Deniz Gündüz"],"pdf_url":"https://arxiv.org/pdf/2311.15858v1.pdf","comment":"6 pages, 4 figures. This work has been submitted to the IEEE for\n possible publication. 
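As a toy illustration of the genetic-programming symbolic-regression idea in the power-spectrum emulator abstract above, the sketch below fits an analytic expression to a synthetic "ratio" target with gplearn; the data, function set and hyperparameters are arbitrary assumptions, and this is not the authors' framework or their cosmological data.

import numpy as np
from gplearn.genetic import SymbolicRegressor

rng = np.random.default_rng(0)
logk = rng.uniform(-3, 1, size=(500, 1))                           # stand-in for log10(k)
ratio = 1.0 + 0.05 * np.sin(logk[:, 0]) + 0.01 * logk[:, 0] ** 2   # fake "truth / fitting function" ratio

sr = SymbolicRegressor(
    population_size=1000,
    generations=10,
    function_set=("add", "sub", "mul", "div", "sin"),
    parsimony_coefficient=1e-3,
    random_state=0,
)
sr.fit(logk, ratio)
print(sr._program)   # human-readable analytic approximation of the ratio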
Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2311.15854v1","updated":"2023-11-27T14:21:47Z","published":"2023-11-27T14:21:47Z","title":"A systematic study comparing hyperparameter optimization engines on\n tabular data","summary":" We run an independent comparison of all hyperparameter optimization\n(hyperopt) engines available in the Ray Tune library. We introduce two ways to\nnormalize and aggregate statistics across data sets and models, one rank-based,\nand another one sandwiching the score between the random search score and the\nfull grid search score. This affords us i) to rank the hyperopt engines, ii) to\nmake generalized and statistically significant statements on how much they\nimprove over random search, and iii) to make recommendations on which engine\nshould be used to hyperopt a given learning algorithm. We find that most\nengines beat random search, but that only three of them (HEBO, AX, and\nBlendSearch) clearly stand out. We also found that some engines seem to\nspecialize in hyperopting certain learning algorithms, which makes it tricky to\nuse hyperopt in comparison studies, since the choice of the hyperopt technique\nmay favor some of the models in the comparison.\n","authors":["Balazs Kegl"],"pdf_url":"https://arxiv.org/pdf/2311.15854v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15847v1","updated":"2023-11-27T14:12:51Z","published":"2023-11-27T14:12:51Z","title":"Cell Maps Representation For Lung Adenocarcinoma Growth Patterns\n Classification In Whole Slide Images","summary":" Lung adenocarcinoma is a morphologically heterogeneous disease, characterized\nby five primary histologic growth patterns. The quantity of these patterns can\nbe related to tumor behavior and has a significant impact on patient prognosis.\nIn this work, we propose a novel machine learning pipeline capable of\nclassifying tissue tiles into one of the five patterns or as non-tumor, with an\nArea Under the Receiver Operating Characteristic Curve (AUCROC) score of 0.97.\nOur model's strength lies in its comprehensive consideration of cellular\nspatial patterns, where it first generates cell maps from Hematoxylin and Eosin\n(H&E) whole slide images (WSIs), which are then fed into a convolutional neural\nnetwork classification model. Exploiting these cell maps provides the model\nwith robust generalizability to new data, achieving approximately 30% higher\naccuracy on unseen test-sets compared to current state of the art approaches.\nThe insights derived from our model can be used to predict prognosis, enhancing\npatient outcomes.\n","authors":["Arwa Al-Rubaian","Gozde N. Gunesli","Wajd A. Althakfi","Ayesha Azam","Nasir Rajpoot","Shan E Ahmed Raza"],"pdf_url":"https://arxiv.org/pdf/2311.15847v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15838v1","updated":"2023-11-27T14:02:47Z","published":"2023-11-27T14:02:47Z","title":"Utilizing Explainability Techniques for Reinforcement Learning Model\n Assurance","summary":" Explainable Reinforcement Learning (XRL) can provide transparency into the\ndecision-making process of a Deep Reinforcement Learning (DRL) model and\nincrease user trust and adoption in real-world use cases. By utilizing XRL\ntechniques, researchers can identify potential vulnerabilities within a trained\nDRL model prior to deployment, therefore limiting the potential for mission\nfailure or mistakes by the system. 
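The two aggregation schemes described in the hyperparameter-optimization comparison abstract above (rank-based normalization, and sandwiching each score between the random-search and full-grid-search scores) can be sketched in a few lines; the engine scores below are made up purely for illustration.

import numpy as np
from scipy.stats import rankdata

# rows: datasets/models, columns: hyperopt engines (higher score = better)
scores = np.array([
    [0.81, 0.84, 0.83],
    [0.62, 0.66, 0.65],
    [0.91, 0.90, 0.93],
])
random_search = np.array([0.80, 0.60, 0.89])   # per-dataset random-search score
grid_search   = np.array([0.85, 0.67, 0.94])   # per-dataset full-grid-search score

# 1) rank-based aggregation: rank engines within each dataset, then average
ranks = rankdata(-scores, axis=1)              # 1 = best engine on that dataset
print("mean rank per engine:", ranks.mean(axis=0))

# 2) "sandwich" normalization: 0 = random search, 1 = full grid search
sandwich = (scores - random_search[:, None]) / (grid_search - random_search)[:, None]
print("mean normalized score per engine:", sandwich.mean(axis=0))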
This paper introduces the ARLIN (Assured RL\nModel Interrogation) Toolkit, an open-source Python library that identifies\npotential vulnerabilities and critical points within trained DRL models through\ndetailed, human-interpretable explainability outputs. To illustrate ARLIN's\neffectiveness, we provide explainability visualizations and vulnerability\nanalysis for a publicly available DRL model. The open-source code repository is\navailable for download at https://github.com/mitre/arlin.\n","authors":["Alexander Tapley","Kyle Gatesman","Luis Robaina","Brett Bissey","Joseph Weissman"],"pdf_url":"https://arxiv.org/pdf/2311.15838v1.pdf","comment":"9 pages, 8 figures including appendices (A, B, C). Accepted as a\n poster presentation in the demo track at the \"XAI in Action: Past, Present,\n and Future Applications\" workshop at NeurIPS 2023. MITRE Public Release Case\n Number 23-3095"},{"id":"http://arxiv.org/abs/2311.15831v1","updated":"2023-11-27T13:55:21Z","published":"2023-11-27T13:55:21Z","title":"Temporal Action Localization for Inertial-based Human Activity\n Recognition","summary":" A persistent trend in Deep Learning has been the applicability of machine\nlearning concepts to other areas than originally introduced for. As of today,\nstate-of-the-art activity recognition from wearable sensors relies on\nclassifiers being trained on fixed windows of data. Contrarily, video-based\nHuman Activity Recognition has followed a segment-based prediction approach,\nlocalizing activity occurrences from start to end. This paper is the first to\nsystematically demonstrate the applicability of state-of-the-art TAL models for\nwearable Human Activity Recongition (HAR) using raw inertial data as input. Our\nresults show that state-of-the-art TAL models are able to outperform popular\ninertial models on 4 out of 6 wearable activity recognition benchmark datasets,\nwith improvements ranging as much as 25% in F1-score. Introducing the TAL\ncommunity's most popular metric to inertial-based HAR, namely mean Average\nPrecision, our analysis shows that TAL models are able to produce more coherent\nsegments along with an overall higher NULL-class accuracy across all datasets.\nBeing the first to provide such an analysis, the TAL community offers an\ninteresting new perspective to inertial-based HAR with yet to be explored\ndesign choices and training concepts, which could be of significant value for\nthe inertial-based HAR community.\n","authors":["Marius Bock","Michael Moeller","Kristof Van Laerhoven"],"pdf_url":"https://arxiv.org/pdf/2311.15831v1.pdf","comment":"20 pages, 7 figures, 2 tables"},{"id":"http://arxiv.org/abs/2311.15816v1","updated":"2023-11-27T13:41:20Z","published":"2023-11-27T13:41:20Z","title":"Scale-Dropout: Estimating Uncertainty in Deep Neural Networks Using\n Stochastic Scale","summary":" Uncertainty estimation in Neural Networks (NNs) is vital in improving\nreliability and confidence in predictions, particularly in safety-critical\napplications. Bayesian Neural Networks (BayNNs) with Dropout as an\napproximation offer a systematic approach to quantifying uncertainty, but they\ninherently suffer from high hardware overhead in terms of power, memory, and\ncomputation. Thus, the applicability of BayNNs to edge devices with limited\nresources or to high-performance applications is challenging. Some of the\ninherent costs of BayNNs can be reduced by accelerating them in hardware on a\nComputation-In-Memory (CIM) architecture with spintronic memories and\nbinarizing their parameters. 
However, numerous stochastic units are required to\nimplement conventional dropout-based BayNN. In this paper, we propose the Scale\nDropout, a novel regularization technique for Binary Neural Networks (BNNs),\nand Monte Carlo-Scale Dropout (MC-Scale Dropout)-based BayNNs for efficient\nuncertainty estimation. Our approach requires only one stochastic unit for the\nentire model, irrespective of the model size, leading to a highly scalable\nBayesian NN. Furthermore, we introduce a novel Spintronic memory-based CIM\narchitecture for the proposed BayNN that achieves more than $100\\times$ energy\nsavings compared to the state-of-the-art. We validated our method to show up to\na $1\\%$ improvement in predictive performance and superior uncertainty\nestimates compared to related works.\n","authors":["Soyed Tuhin Ahmed","Kamal Danouchi","Michael Hefenbrock","Guillaume Prenat","Lorena Anghel","Mehdi B. Tahoori"],"pdf_url":"https://arxiv.org/pdf/2311.15816v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15807v1","updated":"2023-11-27T13:30:20Z","published":"2023-11-27T13:30:20Z","title":"Exploring Artificial Intelligence Methods for Energy Prediction in\n Healthcare Facilities: An In-Depth Extended Systematic Review","summary":" Hospitals, due to their complexity and unique requirements, play a pivotal\nrole in global energy consumption patterns. This study conducted a\ncomprehensive literature review, utilizing the PRISMA framework, of articles\nthat employed machine learning and artificial intelligence techniques for\npredicting energy consumption in hospital buildings. Of the 1884 publications\nidentified, 17 were found to address this specific domain and have been\nthoroughly reviewed to establish the state-of-the-art and identify gaps where\nfuture research is needed. This review revealed a diverse range of data inputs\ninfluencing energy prediction, with occupancy and meteorological data emerging\nas significant predictors. However, many studies failed to delve deep into the\nimplications of their data choices, and gaps were evident regarding the\nunderstanding of time dynamics, operational status, and preprocessing methods.\nMachine learning, especially deep learning models like ANNs, have shown\npotential in this domain, yet they come with challenges, including\ninterpretability and computational demands. The findings underscore the immense\npotential of AI in optimizing hospital energy consumption but also highlight\nthe need for more comprehensive and granular research. Key areas for future\nresearch include the optimization of ANN approaches, new optimization and data\nintegration techniques, the integration of real-time data into Intelligent\nEnergy Management Systems, and increasing focus on long-term energy\nforecasting.\n","authors":["Marjan FatehiJananloo","Helen Stopps","J. J. McArthur"],"pdf_url":"https://arxiv.org/pdf/2311.15807v1.pdf","comment":"38 pages, 1 figure, 3 tables, systematic literature review"},{"id":"http://arxiv.org/abs/2311.15792v1","updated":"2023-11-27T13:14:39Z","published":"2023-11-27T13:14:39Z","title":"Rethinking Privacy in Machine Learning Pipelines from an Information\n Flow Control Perspective","summary":" Modern machine learning systems use models trained on ever-growing corpora.\nTypically, metadata such as ownership, access control, or licensing information\nis ignored during training. 
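For readers unfamiliar with the dropout-based Bayesian approximation that the Scale-Dropout abstract above builds on, here is plain Monte Carlo dropout in PyTorch; this is the generic baseline idea, not the authors' binary or spintronic CIM implementation, and the network is a placeholder.

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(16, 64), nn.ReLU(), nn.Dropout(p=0.2), nn.Linear(64, 10))

def mc_predict(model, x, n_samples=30):
    model.train()                        # keep dropout active at inference time
    with torch.no_grad():
        preds = torch.stack([model(x).softmax(dim=-1) for _ in range(n_samples)])
    return preds.mean(0), preds.std(0)   # predictive mean and an uncertainty estimate

x = torch.randn(4, 16)
mean, std = mc_predict(model, x)
print(mean.shape, std.shape)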
Instead, to mitigate privacy risks, we rely on\ngeneric techniques such as dataset sanitization and differentially private\nmodel training, with inherent privacy/utility trade-offs that hurt model\nperformance. Moreover, these techniques have limitations in scenarios where\nsensitive information is shared across multiple participants and fine-grained\naccess control is required. By ignoring metadata, we therefore miss an\nopportunity to better address security, privacy, and confidentiality\nchallenges. In this paper, we take an information flow control perspective to\ndescribe machine learning systems, which allows us to leverage metadata such as\naccess control policies and define clear-cut privacy and confidentiality\nguarantees with interpretable information flows. Under this perspective, we\ncontrast two different approaches to achieve user-level non-interference: 1)\nfine-tuning per-user models, and 2) retrieval augmented models that access\nuser-specific datasets at inference time. We compare these two approaches to a\ntrivially non-interfering zero-shot baseline using a public model and to a\nbaseline that fine-tunes this model on the whole corpus. We evaluate trained\nmodels on two datasets of scientific articles and demonstrate that retrieval\naugmented architectures deliver the best utility, scalability, and flexibility\nwhile satisfying strict non-interference guarantees.\n","authors":["Lukas Wutschitz","Boris Köpf","Andrew Paverd","Saravan Rajmohan","Ahmed Salem","Shruti Tople","Santiago Zanella-Béguelin","Menglin Xia","Victor Rühle"],"pdf_url":"https://arxiv.org/pdf/2311.15792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01276v2","updated":"2023-11-27T13:02:50Z","published":"2023-11-02T14:44:50Z","title":"Long-Range Neural Atom Learning for Molecular Graphs","summary":" Graph Neural Networks (GNNs) have been widely adopted for drug discovery with\nmolecular graphs. Nevertheless, current GNNs are mainly good at leveraging\nshort-range interactions (SRI) but struggle to capture long-range interactions\n(LRI), both of which are crucial for determining molecular properties. To\ntackle this issue, we propose a method that implicitly projects all original\natoms into a few Neural Atoms, which abstracts the collective information of\natomic groups within a molecule. Specifically, we explicitly exchange the\ninformation among neural atoms and project them back to the atoms'\nrepresentations as an enhancement. With this mechanism, neural atoms establish\nthe communication channels among distant nodes, effectively reducing the\ninteraction scope of arbitrary node pairs into a single hop. To provide an\ninspection of our method from a physical perspective, we reveal its connection\nwith the traditional LRI calculation method, Ewald Summation. We conduct\nextensive experiments on three long-range graph benchmarks, covering both\ngraph-level and link-level tasks on molecular graphs. We empirically justify\nthat our method can be equipped with an arbitrary GNN and help to capture LRI.\n","authors":["Xuan Li","Zhanke Zhou","Jiangchao Yao","Yu Rong","Lu Zhang","Bo Han"],"pdf_url":"https://arxiv.org/pdf/2311.01276v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.10594v2","updated":"2023-11-27T13:02:06Z","published":"2023-03-19T07:53:31Z","title":"AdaptGuard: Defending Against Universal Attacks for Model Adaptation","summary":" Model adaptation aims at solving the domain transfer problem under the\nconstraint of only accessing the pretrained source models. 
With the increasing\nconsiderations of data privacy and transmission efficiency, this paradigm has\nbeen gaining recent popularity. This paper studies the vulnerability to\nuniversal attacks transferred from the source domain during model adaptation\nalgorithms due to the existence of malicious providers. We explore both\nuniversal adversarial perturbations and backdoor attacks as loopholes on the\nsource side and discover that they still survive in the target models after\nadaptation. To address this issue, we propose a model preprocessing framework,\nnamed AdaptGuard, to improve the security of model adaptation algorithms.\nAdaptGuard avoids direct use of the risky source parameters through knowledge\ndistillation and utilizes the pseudo adversarial samples under adjusted radius\nto enhance the robustness. AdaptGuard is a plug-and-play module that requires\nneither robust pretrained models nor any changes for the following model\nadaptation algorithms. Extensive results on three commonly used datasets and\ntwo popular adaptation methods validate that AdaptGuard can effectively defend\nagainst universal attacks and maintain clean accuracy in the target domain\nsimultaneously. We hope this research will shed light on the safety and\nrobustness of transfer learning. Code is available at\nhttps://github.com/TomSheng21/AdaptGuard.\n","authors":["Lijun Sheng","Jian Liang","Ran He","Zilei Wang","Tieniu Tan"],"pdf_url":"https://arxiv.org/pdf/2303.10594v2.pdf","comment":"ICCV2023"},{"id":"http://arxiv.org/abs/2311.15782v1","updated":"2023-11-27T12:55:39Z","published":"2023-11-27T12:55:39Z","title":"Relationship between Model Compression and Adversarial Robustness: A\n Review of Current Evidence","summary":" Increasing the model capacity is a known approach to enhance the adversarial\nrobustness of deep learning networks. On the other hand, various model\ncompression techniques, including pruning and quantization, can reduce the size\nof the network while preserving its accuracy. Several recent studies have\naddressed the relationship between model compression and adversarial\nrobustness, while some experiments have reported contradictory results. This\nwork summarizes available evidence and discusses possible explanations for the\nobserved effects.\n","authors":["Svetlana Pavlitska","Hannes Grolig","J. Marius Zöllner"],"pdf_url":"https://arxiv.org/pdf/2311.15782v1.pdf","comment":"Accepted for publication at SSCI 2023"},{"id":"http://arxiv.org/abs/2311.15781v1","updated":"2023-11-27T12:54:47Z","published":"2023-11-27T12:54:47Z","title":"Increasing Coverage and Precision of Textual Information in Multilingual\n Knowledge Graphs","summary":" Recent work in Natural Language Processing and Computer Vision has been using\ntextual information -- e.g., entity names and descriptions -- available in\nknowledge graphs to ground neural models to high-quality structured data.\nHowever, when it comes to non-English languages, the quantity and quality of\ntextual information are comparatively scarce. To address this issue, we\nintroduce the novel task of automatic Knowledge Graph Enhancement (KGE) and\nperform a thorough investigation on bridging the gap in both the quantity and\nquality of textual information between English and non-English languages. 
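The AdaptGuard abstract above avoids direct use of the risky source parameters through knowledge distillation; the following is a generic single-step distillation sketch (teacher, student, temperature and data are placeholders), not the paper's full framework with pseudo adversarial samples.

import torch
import torch.nn as nn
import torch.nn.functional as F

source = nn.Linear(32, 5)               # stand-in for the (possibly compromised) source model
student = nn.Linear(32, 5)              # freshly initialized target model
opt = torch.optim.SGD(student.parameters(), lr=0.1)
T = 4.0                                 # distillation temperature

x = torch.randn(64, 32)                 # unlabeled target-domain batch
with torch.no_grad():
    teacher_probs = F.softmax(source(x) / T, dim=-1)

loss = F.kl_div(F.log_softmax(student(x) / T, dim=-1), teacher_probs,
                reduction="batchmean") * T * T
opt.zero_grad()
loss.backward()
opt.step()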
More\nspecifically, we: i) bring to light the problem of increasing multilingual\ncoverage and precision of entity names and descriptions in Wikidata; ii)\ndemonstrate that state-of-the-art methods, namely, Machine Translation (MT),\nWeb Search (WS), and Large Language Models (LLMs), struggle with this task;\niii) present M-NTA, a novel unsupervised approach that combines MT, WS, and\nLLMs to generate high-quality textual information; and, iv) study the impact of\nincreasing multilingual coverage and precision of non-English textual\ninformation in Entity Linking, Knowledge Graph Completion, and Question\nAnswering. As part of our effort towards better multilingual knowledge graphs,\nwe also introduce WikiKGE-10, the first human-curated benchmark to evaluate KGE\napproaches in 10 languages across 7 language families.\n","authors":["Simone Conia","Min Li","Daniel Lee","Umar Farooq Minhas","Ihab Ilyas","Yunyao Li"],"pdf_url":"https://arxiv.org/pdf/2311.15781v1.pdf","comment":"Camera ready for EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.15772v1","updated":"2023-11-27T12:44:42Z","published":"2023-11-27T12:44:42Z","title":"Attend Who is Weak: Enhancing Graph Condensation via Cross-Free\n Adversarial Training","summary":" In this paper, we study the \\textit{graph condensation} problem by\ncompressing the large, complex graph into a concise, synthetic representation\nthat preserves the most essential and discriminative information of structure\nand features. We seminally propose the concept of Shock Absorber (a type of\nperturbation) that enhances the robustness and stability of the original graphs\nagainst changes in an adversarial training fashion. Concretely, (I) we forcibly\nmatch the gradients between pre-selected graph neural networks (GNNs) trained\non a synthetic, simplified graph and the original training graph at regularly\nspaced intervals. (II) Before each update synthetic graph point, a Shock\nAbsorber serves as a gradient attacker to maximize the distance between the\nsynthetic dataset and the original graph by selectively perturbing the parts\nthat are underrepresented or insufficiently informative. We iteratively repeat\nthe above two processes (I and II) in an adversarial training fashion to\nmaintain the highly-informative context without losing correlation with the\noriginal dataset. More importantly, our shock absorber and the synthesized\ngraph parallelly share the backward process in a free training manner. Compared\nto the original adversarial training, it introduces almost no additional time\noverhead.\n We validate our framework across 8 datasets (3 graph and 5 node\nclassification datasets) and achieve prominent results: for example, on Cora,\nCiteseer and Ogbn-Arxiv, we can gain nearly 1.13% to 5.03% improvements compare\nwith SOTA models. Moreover, our algorithm adds only about 0.2% to 2.2%\nadditional time overhead over Flicker, Citeseer and Ogbn-Arxiv. Compared to the\ngeneral adversarial training, our approach improves time efficiency by nearly\n4-fold.\n","authors":["Xinglin Li","Kun Wang","Hanhui Deng","Yuxuan Liang","Di Wu"],"pdf_url":"https://arxiv.org/pdf/2311.15772v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16883v2","updated":"2023-11-27T12:33:27Z","published":"2023-09-28T22:41:47Z","title":"The Lipschitz-Variance-Margin Tradeoff for Enhanced Randomized Smoothing","summary":" Real-life applications of deep neural networks are hindered by their unsteady\npredictions when faced with noisy inputs and adversarial attacks. 
The certified\nradius is in this context a crucial indicator of the robustness of models.\nHowever how to design an efficient classifier with a sufficient certified\nradius? Randomized smoothing provides a promising framework by relying on noise\ninjection in inputs to obtain a smoothed and more robust classifier. In this\npaper, we first show that the variance introduced by randomized smoothing\nclosely interacts with two other important properties of the classifier,\n\\textit{i.e.} its Lipschitz constant and margin. More precisely, our work\nemphasizes the dual impact of the Lipschitz constant of the base classifier, on\nboth the smoothed classifier and the empirical variance. Moreover, to increase\nthe certified robust radius, we introduce a different simplex projection\ntechnique for the base classifier to leverage the variance-margin trade-off\nthanks to Bernstein's concentration inequality, along with an enhanced\nLipschitz bound. Experimental results show a significant improvement in\ncertified accuracy compared to current state-of-the-art methods. Our novel\ncertification procedure allows us to use pre-trained models that are used with\nrandomized smoothing, effectively improving the current certification radius in\na zero-shot manner.\n","authors":["Blaise Delattre","Alexandre Araujo","Quentin Barthélemy","Alexandre Allauzen"],"pdf_url":"https://arxiv.org/pdf/2309.16883v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15756v1","updated":"2023-11-27T12:22:44Z","published":"2023-11-27T12:22:44Z","title":"Learning Multi-Frequency Partial Correlation Graphs","summary":" Despite the large research effort devoted to learning dependencies between\ntime series, the state of the art still faces a major limitation: existing\nmethods learn partial correlations but fail to discriminate across distinct\nfrequency bands. Motivated by many applications in which this differentiation\nis pivotal, we overcome this limitation by learning a block-sparse,\nfrequency-dependent, partial correlation graph, in which layers correspond to\ndifferent frequency bands, and partial correlations can occur over just a few\nlayers. To this aim, we formulate and solve two nonconvex learning problems:\nthe first has a closed-form solution and is suitable when there is prior\nknowledge about the number of partial correlations; the second hinges on an\niterative solution based on successive convex approximation, and is effective\nfor the general case where no prior knowledge is available. Numerical results\non synthetic data show that the proposed methods outperform the current state\nof the art. Finally, the analysis of financial time series confirms that\npartial correlations exist only within a few frequency bands, underscoring how\nour methods enable the gaining of valuable insights that would be undetected\nwithout discriminating along the frequency domain.\n","authors":["Gabriele D'Acunto","Paolo Di Lorenzo","Francesco Bonchi","Stefania Sardellitti","Sergio Barbarossa"],"pdf_url":"https://arxiv.org/pdf/2311.15756v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2102.12920v3","updated":"2023-11-27T12:18:51Z","published":"2021-02-25T15:18:13Z","title":"Emerging Trends in Federated Learning: From Model Fusion to Federated X\n Learning","summary":" Federated learning is a new learning paradigm that decouples data collection\nand model training via multi-party computation and model aggregation. As a\nflexible learning setting, federated learning has the potential to integrate\nwith other learning frameworks. 
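As background for the randomized-smoothing abstract above, a bare-bones Monte Carlo certification sketch is shown below: classify Gaussian-perturbed copies of the input and convert the top-two class frequencies into a standard Cohen-style certified radius. Confidence bounds on the estimated probabilities are omitted for brevity, and the base classifier is a toy stand-in.

import numpy as np
from scipy.stats import norm

def base_classifier(x):
    # Hypothetical hard classifier: class 1 if x1 + x2 > 0, else class 0
    return int(x[0] + x[1] > 0)

def certify(x, sigma=0.5, n=10_000, num_classes=2, rng=np.random.default_rng(0)):
    noisy = x + rng.normal(scale=sigma, size=(n, x.shape[0]))
    counts = np.bincount([base_classifier(z) for z in noisy], minlength=num_classes)
    top = np.argsort(counts)[::-1]
    p_a, p_b = counts[top[0]] / n, counts[top[1]] / n
    radius = sigma / 2 * (norm.ppf(p_a) - norm.ppf(p_b))   # certified L2 radius
    return top[0], radius

print(certify(np.array([0.8, 0.3])))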
We conduct a focused survey of federated\nlearning in conjunction with other learning algorithms. Specifically, we\nexplore various learning algorithms to improve the vanilla federated averaging\nalgorithm and review model fusion methods such as adaptive aggregation,\nregularization, clustered methods, and Bayesian methods. Following the emerging\ntrends, we also discuss federated learning in the intersection with other\nlearning paradigms, termed federated X learning, where X includes multitask\nlearning, meta-learning, transfer learning, unsupervised learning, and\nreinforcement learning. This survey reviews the state of the art, challenges,\nand future directions.\n","authors":["Shaoxiong Ji","Yue Tan","Teemu Saravirta","Zhiqin Yang","Lauri Vasankari","Shirui Pan","Guodong Long","Anwar Walid"],"pdf_url":"https://arxiv.org/pdf/2102.12920v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.10092v3","updated":"2023-11-27T11:57:36Z","published":"2023-10-16T05:54:30Z","title":"Label Differential Privacy via Aggregation","summary":" In many real-world applications, due to recent developments in the privacy\nlandscape, training data may be aggregated to preserve the privacy of sensitive\ntraining labels. In the learning from label proportions (LLP) framework, the\ndataset is partitioned into bags of feature-vectors which are available only\nwith the sum of the labels per bag. A further restriction, which we call\nlearning from bag aggregates (LBA) is where instead of individual\nfeature-vectors, only the (possibly weighted) sum of the feature-vectors per\nbag is available. We study whether such aggregation techniques can provide\nprivacy guarantees under the notion of label differential privacy (label-DP)\npreviously studied in for e.g. [Chaudhuri-Hsu'11, Ghazi et al.'21, Esfandiari\net al.'22].\n It is easily seen that naive LBA and LLP do not provide label-DP. Our main\nresult however, shows that weighted LBA using iid Gaussian weights with $m$\nrandomly sampled disjoint $k$-sized bags is in fact $(\\varepsilon,\n\\delta)$-label-DP for any $\\varepsilon > 0$ with $\\delta \\approx\n\\exp(-\\Omega(\\sqrt{k}))$ assuming a lower bound on the linear-mse regression\nloss. Further, the $\\ell_2^2$-regressor which minimizes the loss on the\naggregated dataset has a loss within $\\left(1 + o(1)\\right)$-factor of the\noptimum on the original dataset w.p. $\\approx 1 - exp(-\\Omega(m))$. We\nemphasize that no additive label noise is required.\n The analogous weighted-LLP does not however admit label-DP. Nevertheless, we\nshow that if additive $N(0, 1)$ noise can be added to any constant fraction of\nthe instance labels, then the noisy weighted-LLP admits similar label-DP\nguarantees without assumptions on the dataset, while preserving the utility of\nLipschitz-bounded neural mse-regression tasks.\n Our work is the first to demonstrate that label-DP can be achieved by\nrandomly weighted aggregation for regression tasks, using no or little additive\nnoise.\n","authors":["Anand Brahmbhatt","Rishi Saket","Shreyas Havaldar","Anshul Nasery","Aravindan Raghuveer"],"pdf_url":"https://arxiv.org/pdf/2310.10092v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.07983v2","updated":"2023-11-27T11:54:56Z","published":"2023-09-14T18:40:28Z","title":"SLMIA-SR: Speaker-Level Membership Inference Attacks against Speaker\n Recognition Systems","summary":" Membership inference attacks allow adversaries to determine whether a\nparticular example was contained in the model's training dataset. 
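Returning to the label-DP aggregation abstract above, the weighted learning-from-bag-aggregates (LBA) setup it analyses can be sketched as follows: randomly sampled disjoint bags of size k, iid Gaussian weights, and a least-squares regressor fit only on the weighted per-bag sums. The synthetic data and sizes are assumptions for illustration.

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.default_rng(0)
n, d, k = 10_000, 5, 50                        # examples, features, bag size
m = n // k                                     # number of disjoint bags
X = rng.normal(size=(n, d))
y = X @ rng.normal(size=d) + 0.1 * rng.normal(size=n)

perm = rng.permutation(n).reshape(m, k)        # random disjoint k-sized bags
w = rng.normal(size=(m, k))                    # iid Gaussian weights per bag member

X_bag = np.einsum("bk,bkd->bd", w, X[perm])    # weighted feature sums per bag
y_bag = np.einsum("bk,bk->b", w, y[perm])      # weighted label sums per bag

reg = LinearRegression(fit_intercept=False).fit(X_bag, y_bag)
print(reg.coef_)                               # should stay close to the clean-data regressor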
While\nprevious works have confirmed the feasibility of such attacks in various\napplications, none has focused on speaker recognition (SR), a promising\nvoice-based biometric recognition technique. In this work, we propose SLMIA-SR,\nthe first membership inference attack tailored to SR. In contrast to\nconventional example-level attack, our attack features speaker-level membership\ninference, i.e., determining if any voices of a given speaker, either the same\nas or different from the given inference voices, have been involved in the\ntraining of a model. It is particularly useful and practical since the training\nand inference voices are usually distinct, and it is also meaningful\nconsidering the open-set nature of SR, namely, the recognition speakers were\noften not present in the training data. We utilize intra-similarity and\ninter-dissimilarity, two training objectives of SR, to characterize the\ndifferences between training and non-training speakers and quantify them with\ntwo groups of features driven by carefully-established feature engineering to\nmount the attack. To improve the generalizability of our attack, we propose a\nnovel mixing ratio training strategy to train attack models. To enhance the\nattack performance, we introduce voice chunk splitting to cope with the limited\nnumber of inference voices and propose to train attack models dependent on the\nnumber of inference voices. Our attack is versatile and can work in both\nwhite-box and black-box scenarios. Additionally, we propose two novel\ntechniques to reduce the number of black-box queries while maintaining the\nattack performance. Extensive experiments demonstrate the effectiveness of\nSLMIA-SR.\n","authors":["Guangke Chen","Yedi Zhang","Fu Song"],"pdf_url":"https://arxiv.org/pdf/2309.07983v2.pdf","comment":"In Proceedings of the 31st Network and Distributed System Security\n (NDSS) Symposium, 2024"},{"id":"http://arxiv.org/abs/2310.01144v2","updated":"2023-11-27T11:54:55Z","published":"2023-10-02T12:32:18Z","title":"The Map Equation Goes Neural","summary":" Community detection and graph clustering are essential for unsupervised data\nexploration and understanding the high-level organisation of networked systems.\nRecently, graph clustering has received attention as a primary task for graph\nneural networks. Although hierarchical graph pooling has been shown to improve\nperformance in graph and node classification tasks, it performs poorly in\nidentifying meaningful clusters. Community detection has a long history in\nnetwork science, but typically relies on optimising objective functions with\ncustom-tailored search algorithms, not leveraging recent advances in deep\nlearning, particularly from graph neural networks. In this paper, we narrow\nthis gap between the deep learning and network science communities. We consider\nthe map equation, an information-theoretic objective function for unsupervised\ncommunity detection. Expressing it in a fully differentiable tensor form that\nproduces soft cluster assignments, we optimise the map equation with deep\nlearning through gradient descent. More specifically, the reformulated map\nequation is a loss function compatible with any graph neural network\narchitecture, enabling flexible clustering and graph pooling that clusters both\ngraph structure and data features in an end-to-end way, automatically finding\nan optimum number of clusters without explicit regularisation by following the\nminimum description length principle. 
We evaluate our approach experimentally\nusing different neural network architectures for unsupervised clustering in\nsynthetic and real data. Our results show that our approach achieves\ncompetitive performance against baselines, naturally detects overlapping\ncommunities, and avoids over-partitioning sparse graphs.\n","authors":["Christopher Blöcker","Chester Tan","Ingo Scholtes"],"pdf_url":"https://arxiv.org/pdf/2310.01144v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17005v2","updated":"2023-11-27T11:43:53Z","published":"2023-05-26T15:04:06Z","title":"Aggregating Capacity in FL through Successive Layer Training for\n Computationally-Constrained Devices","summary":" Federated learning (FL) is usually performed on resource-constrained edge\ndevices, e.g., with limited memory for the computation. If the required memory\nto train a model exceeds this limit, the device will be excluded from the\ntraining. This can lead to a lower accuracy as valuable data and computation\nresources are excluded from training, also causing bias and unfairness. The FL\ntraining process should be adjusted to such constraints. The state-of-the-art\ntechniques propose training subsets of the FL model at constrained devices,\nreducing their resource requirements for training. But these techniques largely\nlimit the co-adaptation among parameters of the model and are highly\ninefficient, as we show: it is actually better to train a smaller (less\naccurate) model by the system where all the devices can train the model\nend-to-end, than applying such techniques. We propose a new method that enables\nsuccessive freezing and training of the parameters of the FL model at devices,\nreducing the training's resource requirements at the devices, while still\nallowing enough co-adaptation between parameters. We show through extensive\nexperimental evaluation that our technique greatly improves the accuracy of the\ntrained model (by 52.4 p.p.) compared with the state of the art, efficiently\naggregating the computation capacity available on distributed devices.\n","authors":["Kilian Pfeiffer","Ramin Khalili","Jörg Henkel"],"pdf_url":"https://arxiv.org/pdf/2305.17005v2.pdf","comment":"accepted at NeurIPS'23"},{"id":"http://arxiv.org/abs/2311.12530v2","updated":"2023-11-27T11:28:21Z","published":"2023-11-21T11:21:53Z","title":"An efficient likelihood-free Bayesian inference method based on\n sequential neural posterior estimation","summary":" Sequential neural posterior estimation (SNPE) techniques have been recently\nproposed for dealing with simulation-based models with intractable likelihoods.\nUnlike approximate Bayesian computation, SNPE techniques learn the posterior\nfrom sequential simulation using neural network-based conditional density\nestimators by minimizing a specific loss function. The SNPE method proposed by\nLueckmann et al. (2017) used a calibration kernel to boost the sample weights\naround the observed data, resulting in a concentrated loss function. However,\nthe use of calibration kernels may increase the variances of both the empirical\nloss and its gradient, making the training inefficient. To improve the\nstability of SNPE, this paper proposes to use an adaptive calibration kernel\nand several variance reduction techniques. 
The proposed method greatly speeds\nup the process of training, and provides a better approximation of the\nposterior than the original SNPE method and some existing competitors as\nconfirmed by numerical experiments.\n","authors":["Yifei Xiong","Xiliang Yang","Sanguo Zhang","Zhijian He"],"pdf_url":"https://arxiv.org/pdf/2311.12530v2.pdf","comment":"30 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.15728v1","updated":"2023-11-27T11:26:41Z","published":"2023-11-27T11:26:41Z","title":"Adinkra Symbol Recognition using Classical Machine Learning and Deep\n Learning","summary":" Artificial intelligence (AI) has emerged as a transformative influence,\nengendering paradigm shifts in global societies, spanning academia and\nindustry. However, in light of these rapid advances, addressing the\nunderrepresentation of black communities and African countries in AI is\ncrucial. Boosting enthusiasm for AI can be effectively accomplished by\nshowcasing straightforward applications around tasks like identifying and\ncategorizing traditional symbols, such as Adinkra symbols, or familiar objects\nwithin the community. In this research endeavor, we dived into classical\nmachine learning and harnessed the power of deep learning models to tackle the\nintricate task of classifying and recognizing Adinkra symbols. The idea led to\na newly constructed ADINKRA dataset comprising 174,338 images meticulously\norganized into 62 distinct classes, each representing a singular and emblematic\nsymbol. We constructed a CNN model for classification and recognition using six\nconvolutional layers, three fully connected (FC) layers, and optional dropout\nregularization. The model is a simpler and smaller version of VGG, with fewer\nlayers, smaller channel sizes, and a fixed kernel size. Additionally, we tap\ninto the transfer learning capabilities provided by pre-trained models like VGG\nand ResNet. These models assist us in both classifying images and extracting\nfeatures that can be used with classical machine learning models. We assess the\nmodel's performance by measuring its accuracy and convergence rate and\nvisualizing the areas that significantly influence its predictions. These\nevaluations serve as a foundational benchmark for future assessments of the\nADINKRA dataset. We hope this application exemplar inspires ideas on the\nvarious uses of AI in organizing our traditional and modern lives.\n","authors":["Michael Adjeisah","Kwame Omono Asamoah","Martha Asamoah Yeboah","Raji Rafiu King","Godwin Ferguson Achaab","Kingsley Adjei"],"pdf_url":"https://arxiv.org/pdf/2311.15728v1.pdf","comment":"15 pages, 6 figures, 5 tables"},{"id":"http://arxiv.org/abs/2311.15722v1","updated":"2023-11-27T11:17:20Z","published":"2023-11-27T11:17:20Z","title":"GLIME: General, Stable and Local LIME Explanation","summary":" As black-box machine learning models grow in complexity and find applications\nin high-stakes scenarios, it is imperative to provide explanations for their\npredictions. Although Local Interpretable Model-agnostic Explanations (LIME)\n[22] is a widely adopted method for understanding model behaviors, it is\nunstable with respect to random seeds [35,24,3] and exhibits low local fidelity\n(i.e., how well the explanation approximates the model's local behaviors)\n[21,16]. 
Our study shows that this instability problem stems from small sample\nweights, leading to the dominance of regularization and slow convergence.\nAdditionally, LIME's sampling neighborhood is non-local and biased towards the\nreference, resulting in poor local fidelity and sensitivity to reference\nchoice. To tackle these challenges, we introduce GLIME, an enhanced framework\nextending LIME and unifying several prior methods. Within the GLIME framework,\nwe derive an equivalent formulation of LIME that achieves significantly faster\nconvergence and improved stability. By employing a local and unbiased sampling\ndistribution, GLIME generates explanations with higher local fidelity compared\nto LIME. GLIME explanations are independent of reference choice. Moreover,\nGLIME offers users the flexibility to choose a sampling distribution based on\ntheir specific scenarios.\n","authors":["Zeren Tan","Yang Tian","Jian Li"],"pdf_url":"https://arxiv.org/pdf/2311.15722v1.pdf","comment":"Accepted by NeurIPS 2023 as a Spotlight paper"},{"id":"http://arxiv.org/abs/2311.15719v1","updated":"2023-11-27T11:12:33Z","published":"2023-11-27T11:12:33Z","title":"Variational Autoencoders for Feature Exploration and Malignancy\n Prediction of Lung Lesions","summary":" Lung cancer is responsible for 21% of cancer deaths in the UK and five-year\nsurvival rates are heavily influenced by the stage the cancer was identified\nat. Recent studies have demonstrated the capability of AI methods for accurate\nand early diagnosis of lung cancer from routine scans. However, this evidence\nhas not translated into clinical practice with one barrier being a lack of\ninterpretable models. This study investigates the application Variational\nAutoencoders (VAEs), a type of generative AI model, to lung cancer lesions.\nProposed models were trained on lesions extracted from 3D CT scans in the\nLIDC-IDRI public dataset. Latent vector representations of 2D slices produced\nby the VAEs were explored through clustering to justify their quality and used\nin an MLP classifier model for lung cancer diagnosis, the best model achieved\nstate-of-the-art metrics of AUC 0.98 and 93.1% accuracy. Cluster analysis shows\nthe VAE latent space separates the dataset of malignant and benign lesions\nbased on meaningful feature components including tumour size, shape, patient\nand malignancy class. We also include a comparative analysis of the standard\nGaussian VAE (GVAE) and the more recent Dirichlet VAE (DirVAE), which replaces\nthe prior with a Dirichlet distribution to encourage a more explainable latent\nspace with disentangled feature representation. Finally, we demonstrate the\npotential for latent space traversals corresponding to clinically meaningful\nfeature changes.\n","authors":["Benjamin Keel","Aaron Quyn","David Jayne","Samuel D. Relton"],"pdf_url":"https://arxiv.org/pdf/2311.15719v1.pdf","comment":"10 pages (main paper), 5 pages (references), 5 figures, 2 tables,\n work accepted for BMVC 2023"},{"id":"http://arxiv.org/abs/2311.15703v1","updated":"2023-11-27T10:41:28Z","published":"2023-11-27T10:41:28Z","title":"Tabular Two-Dimensional Correlation Analysis for Multifaceted\n Characterization Data","summary":" We propose tabular two-dimensional correlation analysis for extracting\nfeatures from multifaceted characterization data, essential for understanding\nmaterial properties. 
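A minimal local-surrogate explanation in the spirit of the GLIME abstract above: perturbations are drawn from a local Gaussian around the input (rather than a reference-biased neighbourhood) and attributions are read off a linear fit. The black-box model, kernel width and sample count are illustrative; this is not the released LIME/GLIME code.

import numpy as np
from sklearn.linear_model import Ridge

def black_box(X):
    # Hypothetical model to be explained
    return np.tanh(2 * X[:, 0] - 0.5 * X[:, 1] + 0.1 * X[:, 2])

def explain(x, n_samples=2000, sigma=0.3, rng=np.random.default_rng(0)):
    Z = x + rng.normal(scale=sigma, size=(n_samples, x.shape[0]))  # local, unbiased samples
    fz = black_box(Z)
    surrogate = Ridge(alpha=1e-3).fit(Z - x, fz)   # linear surrogate centred at x
    return surrogate.coef_                          # per-feature local attributions

print(explain(np.array([0.2, -0.1, 0.4])))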
This method visualizes similarities and phase lags in\nstructural parameter changes through heatmaps, combining hierarchical\nclustering and asynchronous correlations. We applied the proposed method to\ndatasets of carbon nanotube (CNTs) films annealed at various temperatures and\nrevealed the complexity of their hierarchical structures, which include\nelements like voids, bundles, and amorphous carbon. Our analysis addresses the\nchallenge of attempting to understand the sequence of structural changes,\nespecially in multifaceted characterization data where 11 structural parameters\nderived from 8 characterization methods interact with complex behavior. The\nresults show how phase lags (asynchronous changes from stimuli) and parameter\nsimilarities can illuminate the sequence of structural changes in materials,\nproviding insights into phenomena like the removal of amorphous carbon and\ngraphitization in annealed CNTs. This approach is beneficial even with limited\ndata and holds promise for a wide range of material analyses, demonstrating its\npotential in elucidating complex material behaviors and properties.\n","authors":["Shun Muroga","Satoshi Yamazaki","Koji Michishio","Hideaki Nakajima","Takahiro Morimoto","Nagayasu Oshima","Kazufumi Kobashi","Toshiya Okazaki"],"pdf_url":"https://arxiv.org/pdf/2311.15703v1.pdf","comment":"15 pages, 4 figures"},{"id":"http://arxiv.org/abs/2310.01825v2","updated":"2023-11-27T10:39:13Z","published":"2023-10-03T06:42:28Z","title":"Empirical Study of PEFT techniques for Winter Wheat Segmentation","summary":" Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced\nsignificant growth and have been extensively employed to adapt large vision and\nlanguage models to various domains, enabling satisfactory model performance\nwith minimal computational needs. Despite these advances, more research has yet\nto delve into potential PEFT applications in real-life scenarios, particularly\nin the critical domains of remote sensing and crop monitoring. The diversity of\nclimates across different regions and the need for comprehensive large-scale\ndatasets have posed significant obstacles to accurately identify crop types\nacross varying geographic locations and changing growing seasons. This study\nseeks to bridge this gap by comprehensively exploring the feasibility of\ncross-area and cross-year out-of-distribution generalization using the\nState-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to\nexplore PEFT approaches for crop monitoring. Specifically, we focus on adapting\nthe SOTA TSViT model to address winter wheat field segmentation, a critical\ntask for crop monitoring and food security. This adaptation process involves\nintegrating different PEFT techniques, including BigFit, LoRA, Adaptformer, and\nprompt tuning. Using PEFT techniques, we achieved notable results comparable to\nthose achieved using full fine-tuning methods while training only a mere 0.7%\nparameters of the whole TSViT architecture. The in-house labeled data-set,\nreferred to as the Beqaa-Lebanon dataset, comprises high-quality annotated\npolygons for wheat and non-wheat classes with a total surface of 170 kmsq, over\nfive consecutive years. Using Sentinel-2 images, our model achieved a 84%\nF1-score. We intend to publicly release the Lebanese winter wheat data set,\ncode repository, and model weights.\n","authors":["Mohamad Hasan Zahweh","Hasan Nasrallah","Mustafa Shukor","Ghaleb Faour","Ali J. 
Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01825v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15696v1","updated":"2023-11-27T10:32:31Z","published":"2023-11-27T10:32:31Z","title":"Peptide Binding Classification on Quantum Computers","summary":" We conduct an extensive study on using near-term quantum computers for a task\nin the domain of computational biology. By constructing quantum models based on\nparameterised quantum circuits we perform sequence classification on a task\nrelevant to the design of therapeutic proteins, and find competitive\nperformance with classical baselines of similar scale. To study the effect of\nnoise, we run some of the best-performing quantum models with favourable\nresource requirements on emulators of state-of-the-art noisy quantum\nprocessors. We then apply error mitigation methods to improve the signal. We\nfurther execute these quantum models on the Quantinuum H1-1 trapped-ion quantum\nprocessor and observe very close agreement with noiseless exact simulation.\nFinally, we perform feature attribution methods and find that the quantum\nmodels indeed identify sensible relationships, at least as well as the\nclassical baselines. This work constitutes the first proof-of-concept\napplication of near-term quantum computing to a task critical to the design of\ntherapeutic proteins, opening the route toward larger-scale applications in\nthis and related fields, in line with the hardware development roadmaps of\nnear-term quantum technologies.\n","authors":["Charles London","Douglas Brown","Wenduan Xu","Sezen Vatansever","Christopher James Langmead","Dimitri Kartsaklis","Stephen Clark","Konstantinos Meichanetzidis"],"pdf_url":"https://arxiv.org/pdf/2311.15696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15691v1","updated":"2023-11-27T10:28:44Z","published":"2023-11-27T10:28:44Z","title":"Automated discovery of trade-off between utility, privacy and fairness\n in machine learning models","summary":" Machine learning models are deployed as a central component in decision\nmaking and policy operations with direct impact on individuals' lives. In order\nto act ethically and comply with government regulations, these models need to\nmake fair decisions and protect the users' privacy. However, such requirements\ncan come with decrease in models' performance compared to their potentially\nbiased, privacy-leaking counterparts. Thus the trade-off between fairness,\nprivacy and performance of ML models emerges, and practitioners need a way of\nquantifying this trade-off to enable deployment decisions. In this work we\ninterpret this trade-off as a multi-objective optimization problem, and propose\nPFairDP, a pipeline that uses Bayesian optimization for discovery of\nPareto-optimal points between fairness, privacy and utility of ML models. We\nshow how PFairDP can be used to replicate known results that were achieved\nthrough manual constraint setting process. We further demonstrate effectiveness\nof PFairDP with experiments on multiple models and datasets.\n","authors":["Bogdan Ficiu","Neil D. 
Lawrence","Andrei Paleyes"],"pdf_url":"https://arxiv.org/pdf/2311.15691v1.pdf","comment":"3rd Workshop on Bias and Fairness in AI (BIAS), ECML 2023"},{"id":"http://arxiv.org/abs/2306.07294v2","updated":"2023-11-27T10:21:24Z","published":"2023-06-10T11:25:31Z","title":"Computational and Storage Efficient Quadratic Neurons for Deep Neural\n Networks","summary":" Deep neural networks (DNNs) have been widely deployed across diverse domains\nsuch as computer vision and natural language processing. However, the\nimpressive accomplishments of DNNs have been realized alongside extensive\ncomputational demands, thereby impeding their applicability on\nresource-constrained devices. To address this challenge, many researchers have\nbeen focusing on basic neuron structures, the fundamental building blocks of\nneural networks, to alleviate the computational and storage cost. In this work,\nan efficient quadratic neuron architecture distinguished by its enhanced\nutilization of second-order computational information is introduced. By virtue\nof their better expressivity, DNNs employing the proposed quadratic neurons can\nattain similar accuracy with fewer neurons and computational cost. Experimental\nresults have demonstrated that the proposed quadratic neuron structure exhibits\nsuperior computational and storage efficiency across various tasks when\ncompared with both linear and non-linear neurons in prior work.\n","authors":["Chuangtao Chen","Grace Li Zhang","Xunzhao Yin","Cheng Zhuo","Ulf Schlichtmann","Bing Li"],"pdf_url":"https://arxiv.org/pdf/2306.07294v2.pdf","comment":"Accepted by Design Automation and Test in Europe (DATE) 2024"},{"id":"http://arxiv.org/abs/2311.15685v1","updated":"2023-11-27T10:18:17Z","published":"2023-11-27T10:18:17Z","title":"The Battleship Approach to the Low Resource Entity Matching Problem","summary":" Entity matching, a core data integration problem, is the task of deciding\nwhether two data tuples refer to the same real-world entity. Recent advances in\ndeep learning methods, using pre-trained language models, were proposed for\nresolving entity matching. Although demonstrating unprecedented results, these\nsolutions suffer from a major drawback as they require large amounts of labeled\ndata for training, and, as such, are inadequate to be applied to low resource\nentity matching problems. To overcome the challenge of obtaining sufficient\nlabeled data we offer a new active learning approach, focusing on a selection\nmechanism that exploits unique properties of entity matching. We argue that a\ndistributed representation of a tuple pair indicates its informativeness when\nconsidered among other pairs. This is used consequently in our approach that\niteratively utilizes space-aware considerations. Bringing it all together, we\ntreat the low resource entity matching problem as a Battleship game, hunting\nindicative samples, focusing on positive ones, through awareness of the latent\nspace along with careful planning of next sampling iterations. 
An extensive\nexperimental analysis shows that the proposed algorithm outperforms\nstate-of-the-art active learning solutions to low resource entity matching, and\nalthough using less samples, can be as successful as state-of-the-art fully\ntrained known algorithms.\n","authors":["Bar Genossar","Avigdor Gal","Roee Shraga"],"pdf_url":"https://arxiv.org/pdf/2311.15685v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15682v1","updated":"2023-11-27T10:16:22Z","published":"2023-11-27T10:16:22Z","title":"Information theoretic study of the neural geometry induced by category\n learning","summary":" Categorization is an important topic both for biological and artificial\nneural networks. Here, we take an information theoretic approach to assess the\nefficiency of the representations induced by category learning. We show that\none can decompose the relevant Bayesian cost into two components, one for the\ncoding part and one for the decoding part. Minimizing the coding cost implies\nmaximizing the mutual information between the set of categories and the neural\nactivities. We analytically show that this mutual information can be written as\nthe sum of two terms that can be interpreted as (i) finding an appropriate\nrepresentation space, and, (ii) building a representation with the appropriate\nmetrics, based on the neural Fisher information on this space. One main\nconsequence is that category learning induces an expansion of neural space near\ndecision boundaries. Finally, we provide numerical illustrations that show how\nFisher information of the coding neural population aligns with the boundaries\nbetween categories.\n","authors":["Laurent Bonnasse-Gahot","Jean-Pierre Nadal"],"pdf_url":"https://arxiv.org/pdf/2311.15682v1.pdf","comment":"7 pages, 2 figures, Accepted (Oral) to InfoCog@NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.15673v1","updated":"2023-11-27T10:02:12Z","published":"2023-11-27T10:02:12Z","title":"Accelerating Hierarchical Associative Memory: A Deep Equilibrium\n Approach","summary":" Hierarchical Associative Memory models have recently been proposed as a\nversatile extension of continuous Hopfield networks. In order to facilitate\nfuture research on such models, especially at scale, we focus on increasing\ntheir simulation efficiency on digital hardware. In particular, we propose two\nstrategies to speed up memory retrieval in these models, which corresponds to\ntheir use at inference, but is equally important during training. First, we\nshow how they can be cast as Deep Equilibrium Models, which allows using faster\nand more stable solvers. Second, inspired by earlier work, we show that\nalternating optimization of the even and odd layers accelerates memory\nretrieval by a factor close to two. Combined, these two techniques allow for a\nmuch faster energy minimization, as shown in our proof-of-concept experimental\nresults. The code is available at https://github.com/cgoemaere/hamdeq\n","authors":["Cédric Goemaere","Johannes Deleu","Thomas Demeester"],"pdf_url":"https://arxiv.org/pdf/2311.15673v1.pdf","comment":"Accepted at the \"Associative Memory & Hopfield Networks'' workshop at\n NeurIPS, 2023"},{"id":"http://arxiv.org/abs/2311.13959v2","updated":"2023-11-27T09:47:10Z","published":"2023-11-23T12:17:45Z","title":"RankFeat&RankWeight: Rank-1 Feature/Weight Removal for\n Out-of-distribution Detection","summary":" The task of out-of-distribution (OOD) detection is crucial for deploying\nmachine learning models in real-world settings. 
In this paper, we observe that\nthe singular value distributions of the in-distribution (ID) and OOD features\nare quite different: the OOD feature matrix tends to have a larger dominant\nsingular value than the ID feature, and the class predictions of OOD samples\nare largely determined by it. This observation motivates us to propose\n\\texttt{RankFeat}, a simple yet effective \\emph{post hoc} approach for OOD\ndetection by removing the rank-1 matrix composed of the largest singular value\nand the associated singular vectors from the high-level feature.\n\\texttt{RankFeat} achieves \\emph{state-of-the-art} performance and reduces the\naverage false positive rate (FPR95) by 17.90\\% compared with the previous best\nmethod. The success of \\texttt{RankFeat} motivates us to investigate whether a\nsimilar phenomenon would exist in the parameter matrices of neural networks. We\nthus propose \\texttt{RankWeight} which removes the rank-1 weight from the\nparameter matrices of a single deep layer. Our \\texttt{RankWeight}is also\n\\emph{post hoc} and only requires computing the rank-1 matrix once. As a\nstandalone approach, \\texttt{RankWeight} has very competitive performance\nagainst other methods across various backbones. Moreover, \\texttt{RankWeight}\nenjoys flexible compatibility with a wide range of OOD detection methods. The\ncombination of \\texttt{RankWeight} and \\texttt{RankFeat} refreshes the new\n\\emph{state-of-the-art} performance, achieving the FPR95 as low as 16.13\\% on\nthe ImageNet-1k benchmark. Extensive ablation studies and comprehensive\ntheoretical analyses are presented to support the empirical results.\n","authors":["Yue Song","Nicu Sebe","Wei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13959v2.pdf","comment":"submitted to T-PAMI. arXiv admin note: substantial text overlap with\n arXiv:2209.08590"},{"id":"http://arxiv.org/abs/2311.13265v2","updated":"2023-11-27T09:40:19Z","published":"2023-11-22T09:31:19Z","title":"Improved identification accuracy in equation learning via comprehensive\n $\\boldsymbol{R^2}$-elimination and Bayesian model selection","summary":" In the field of equation learning, exhaustively considering all possible\nequations derived from a basis function dictionary is infeasible. Sparse\nregression and greedy algorithms have emerged as popular approaches to tackle\nthis challenge. However, the presence of multicollinearity poses difficulties\nfor sparse regression techniques, and greedy steps may inadvertently exclude\nterms of the true equation, leading to reduced identification accuracy. In this\narticle, we present an approach that strikes a balance between\ncomprehensiveness and efficiency in equation learning. Inspired by stepwise\nregression, our approach combines the coefficient of determination, $R^2$, and\nthe Bayesian model evidence, $p(\\boldsymbol y|\\mathcal M)$, in a novel way. Our\nprocedure is characterized by a comprehensive search with just a minor\nreduction of the model space at each iteration step. With two flavors of our\napproach and the adoption of $p(\\boldsymbol y|\\mathcal M)$ for bi-directional\nstepwise regression, we present a total of three new avenues for equation\nlearning. Through three extensive numerical experiments involving random\npolynomials and dynamical systems, we compare our approach against four\nstate-of-the-art methods and two standard approaches. The results demonstrate\nthat our comprehensive search approach surpasses all other methods in terms of\nidentification accuracy. 
In particular, the second flavor of our approach\nestablishes an efficient overfitting penalty solely based on $R^2$, which\nachieves highest rates of exact equation recovery.\n","authors":["Daniel Nickelsen","Bubacarr Bah"],"pdf_url":"https://arxiv.org/pdf/2311.13265v2.pdf","comment":"12 pages main text and 11 pages appendix, Published in TMLR\n (https://openreview.net/forum?id=0ck7hJ8EVC)"},{"id":"http://arxiv.org/abs/2311.15658v1","updated":"2023-11-27T09:40:14Z","published":"2023-11-27T09:40:14Z","title":"Regularization by Texts for Latent Diffusion Inverse Solvers","summary":" The recent advent of diffusion models has led to significant progress in\nsolving inverse problems, leveraging these models as effective generative\npriors. Nonetheless, challenges related to the ill-posed nature of such\nproblems remain, often due to inherent ambiguities in measurements. Drawing\ninspiration from the human ability to resolve visual ambiguities through\nperceptual biases, here we introduce a novel latent diffusion inverse solver by\nincorporating regularization by texts (TReg). Specifically, TReg applies the\ntextual description of the preconception of the solution during the reverse\nsampling phase, of which description isndynamically reinforced through\nnull-text optimization for adaptive negation. Our comprehensive experimental\nresults demonstrate that TReg successfully mitigates ambiguity in latent\ndiffusion inverse solvers, enhancing their effectiveness and accuracy.\n","authors":["Jeongsol Kim","Geon Yeong Park","Hyungjin Chung","Jong Chul Ye"],"pdf_url":"https://arxiv.org/pdf/2311.15658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15654v1","updated":"2023-11-27T09:33:56Z","published":"2023-11-27T09:33:56Z","title":"Universal Event Detection in Time Series","summary":" In our previously published work, we introduced a supervised deep learning\nmethod for event detection in multivariate time series data, employing\nregression instead of binary classification. This simplification avoids the\nneed for point-wise labels throughout the entire dataset, relying solely on\nground truth events defined as time points or intervals. In this paper, we\nestablish mathematically that our method is universal, and capable of detecting\nany type of event with arbitrary precision under mild continuity assumptions on\nthe time series. These events may encompass change points, frauds, anomalies,\nphysical occurrences, and more. We substantiate our theoretical results using\nthe universal approximation theorem for feed-forward neural networks (FFN).\nAdditionally, we provide empirical validations that confirm our claims,\ndemonstrating that our method, with a limited number of parameters, outperforms\nother deep learning approaches, particularly for rare events and imbalanced\ndatasets from different domains.\n","authors":["Menouar Azib","Benjamin Renard","Philippe Garnier","Vincent Génot","Nicolas André"],"pdf_url":"https://arxiv.org/pdf/2311.15654v1.pdf","comment":"To be submitted to IEEE Transactions on Neural Networks and Learning\n Systems"},{"id":"http://arxiv.org/abs/2311.15649v1","updated":"2023-11-27T09:20:23Z","published":"2023-11-27T09:20:23Z","title":"RoboGPT: an intelligent agent of making embodied long-term decisions for\n daily instruction tasks","summary":" Robotic agents must master common sense and long-term sequential decisions to\nsolve daily tasks through natural language instruction. 
The developments in\nLarge Language Models (LLMs) in natural language processing have inspired\nefforts to use LLMs in complex robot planning. Despite LLMs' great\ngeneralization and comprehension of instruction tasks, LLMs-generated task\nplans sometimes lack feasibility and correctness. To address the problem, we\npropose a RoboGPT agent\\footnote{our code and dataset will be released soon}\nfor making embodied long-term decisions for daily tasks, with two modules: 1)\nLLMs-based planning with re-plan to break the task into multiple sub-goals; 2)\nRoboSkill individually designed for sub-goals to learn better navigation and\nmanipulation skills. The LLMs-based planning is enhanced with a new robotic\ndataset and re-plan, called RoboGPT. The new robotic dataset of 67k daily\ninstruction tasks is gathered for fine-tuning the Llama model and obtaining\nRoboGPT. RoboGPT planner with strong generalization can plan hundreds of daily\ninstruction tasks. Additionally, a low-computational Re-Plan module is designed\nto allow plans to flexibly adapt to the environment, thereby addressing the\nnomenclature diversity challenge. The proposed RoboGPT agent outperforms SOTA\nmethods on the ALFRED daily tasks. Moreover, RoboGPT planner exceeds SOTA\nLLM-based planners like ChatGPT in task-planning rationality for hundreds of\nunseen daily tasks, and even other domain tasks, while keeping the large\nmodel's original broad application and generality.\n","authors":["Yaran Chen","Wenbo Cui","Yuanwen Chen","Mining Tan","Xinyao Zhang","Dongbin Zhao","He Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15648v1","updated":"2023-11-27T09:20:12Z","published":"2023-11-27T09:20:12Z","title":"Reinforcement Learning from Diffusion Feedback: Q* for Image Search","summary":" Large vision-language models are steadily gaining personalization\ncapabilities at the cost of fine-tuning or data augmentation. We present two\nmodels for image generation using model-agnostic learning that align semantic\npriors with generative capabilities. RLDF, or Reinforcement Learning from\nDiffusion Feedback, is a singular approach for visual imitation through\nprior-preserving reward function guidance. This employs Q-learning (with\nstandard Q*) for generation and follows a semantic-rewarded trajectory for\nimage search through finite encoding-tailored actions. The second proposed\nmethod, noisy diffusion gradient, is optimization driven. At the root of both\nmethods is a special CFG encoding that we propose for continual semantic\nguidance. Using only a single input image and no text input, RLDF generates\nhigh-quality images over varied domains including retail, sports and\nagriculture showcasing class-consistency and strong visual diversity. Project\nwebsite is available at https://infernolia.github.io/RLDF.\n","authors":["Aboli Marathe"],"pdf_url":"https://arxiv.org/pdf/2311.15648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15647v1","updated":"2023-11-27T09:19:01Z","published":"2023-11-27T09:19:01Z","title":"Bandits Meet Mechanism Design to Combat Clickbait in Online\n Recommendation","summary":" We study a strategic variant of the multi-armed bandit problem, which we coin\nthe strategic click-bandit. This model is motivated by applications in online\nrecommendation where the choice of recommended items depends on both the\nclick-through rates and the post-click rewards. Like in classical bandits,\nrewards follow a fixed unknown distribution. 
However, we assume that the\nclick-rate of each arm is chosen strategically by the arm (e.g., a host on\nAirbnb) in order to maximize the number of times it gets clicked. The algorithm\ndesigner does not know the post-click rewards nor the arms' actions (i.e.,\nstrategically chosen click-rates) in advance, and must learn both values over\ntime. To solve this problem, we design an incentive-aware learning algorithm,\nUCB-S, which achieves two goals simultaneously: (a) incentivizing desirable arm\nbehavior under uncertainty; (b) minimizing regret by learning unknown\nparameters. We characterize all approximate Nash equilibria among arms under\nUCB-S and show a $\\tilde{\\mathcal{O}} (\\sqrt{KT})$ regret bound uniformly in\nevery equilibrium. We also show that incentive-unaware algorithms generally\nfail to achieve low regret in the strategic click-bandit. Finally, we support\nour theoretical results by simulations of strategic arm behavior which confirm\nthe effectiveness and robustness of our proposed incentive design.\n","authors":["Thomas Kleine Buening","Aadirupa Saha","Christos Dimitrakakis","Haifeng Xu"],"pdf_url":"https://arxiv.org/pdf/2311.15647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.14969v2","updated":"2023-11-27T09:06:55Z","published":"2023-08-29T01:47:49Z","title":"Uncovering the Hidden Cost of Model Compression","summary":" In the era of resource-intensive foundation models, efficient adaptation in\ndownstream tasks has become paramount. Visual Prompting (VP), inspired by\nprompting in Large Language Models (LLMs), has emerged as a key transfer\nlearning method in computer vision. Aligned with the growing significance of\nefficiency, research in model compression has become pivotal to alleviate the\ncomputational burden in both training and deploying over-parameterized neural\nnetworks. A key goal in model compression is the development of sparse models\ncapable of matching or surpassing the performance of their over-parameterized,\ndense counterparts. While prior research has explored the impact of model\nsparsity on transfer learning, its effects on visual prompting-based transfer\nremain unclear. This study addresses this gap, revealing that model sparsity\nadversely affects the performance of visual prompting-based transfer,\nparticularly in low-data-volume scenarios. Furthermore, our findings highlight\nthe negative influence of sparsity on the calibration of downstream\nvisual-prompted models. This empirical exploration calls for a nuanced\nunderstanding beyond accuracy in sparse settings, opening avenues for further\nresearch in Visual Prompting for sparse models. Code and logs can be accessed\nat https://github.com/landskape-ai/Reprogram_LT .\n","authors":["Diganta Misra","Agam Goyal","Bharat Runwal","Pin Yu Chen"],"pdf_url":"https://arxiv.org/pdf/2308.14969v2.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2311.15623v1","updated":"2023-11-27T08:38:42Z","published":"2023-11-27T08:38:42Z","title":"Injecting linguistic knowledge into BERT for Dialogue State Tracking","summary":" Dialogue State Tracking (DST) models often employ intricate neural network\narchitectures, necessitating substantial training data, and their inference\nprocesses lack transparency. 
This paper proposes a method that extracts\nlinguistic knowledge via an unsupervised framework and subsequently utilizes\nthis knowledge to augment BERT's performance and interpretability in DST tasks.\nThe knowledge extraction procedure is computationally economical and does not\nnecessitate annotations or additional training data. The injection of the\nextracted knowledge necessitates the addition of only simple neural modules. We\nemploy the Convex Polytopic Model (CPM) as a feature extraction tool for DST\ntasks and illustrate that the acquired features correlate with the syntactic\nand semantic patterns in the dialogues. This correlation facilitates a\ncomprehensive understanding of the linguistic features influencing the DST\nmodel's decision-making process. We benchmark this framework on various DST\ntasks and observe a notable improvement in accuracy.\n","authors":["Xiaohan Feng","Xixin Wu","Helen Meng"],"pdf_url":"https://arxiv.org/pdf/2311.15623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15617v1","updated":"2023-11-27T08:28:08Z","published":"2023-11-27T08:28:08Z","title":"VeryFL: A Verify Federated Learning Framework Embedded with Blockchain","summary":" Blockchain-empowered federated learning (FL) has provoked extensive research\nrecently. Various blockchain-based federated learning algorithm, architecture\nand mechanism have been designed to solve issues like single point failure and\ndata falsification brought by centralized FL paradigm. Moreover, it is easier\nto allocate incentives to nodes with the help of the blockchain. Various\ncentralized federated learning frameworks like FedML, have emerged in the\ncommunity to help boost the research on FL. However, decentralized\nblockchain-based federated learning framework is still missing, which cause\ninconvenience for researcher to reproduce or verify the algorithm performance\nbased on blockchain. Inspired by the above issues, we have designed and\ndeveloped a blockchain-based federated learning framework by embedding Ethereum\nnetwork. This report will present the overall structure of this framework,\nwhich proposes a code practice paradigm for the combination of FL with\nblockchain and, at the same time, compatible with normal FL training task. In\naddition to implement some blockchain federated learning algorithms on smart\ncontract to help execute a FL training, we also propose a model ownership\nauthentication architecture based on blockchain and model watermarking to\nprotect the intellectual property rights of models. These mechanism on\nblockchain shows an underlying support of blockchain for federated learning to\nprovide a verifiable training, aggregation and incentive distribution procedure\nand thus we named this framework VeryFL (A Verify Federated Learninig Framework\nEmbedded with Blockchain). The source code is avaliable on\nhttps://github.com/GTMLLab/VeryFL.\n","authors":["Yihao Li","Yanyi Lai","Chuan Chen","Zibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2311.15617v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15610v1","updated":"2023-11-27T08:10:53Z","published":"2023-11-27T08:10:53Z","title":"Bayesian Approach to Linear Bayesian Networks","summary":" This study proposes the first Bayesian approach for learning high-dimensional\nlinear Bayesian networks. The proposed approach iteratively estimates each\nelement of the topological ordering from backward and its parent using the\ninverse of a partial covariance matrix. 
The proposed method successfully\nrecovers the underlying structure when Bayesian regularization for the inverse\ncovariance matrix with unequal shrinkage is applied. Specifically, it shows\nthat the number of samples $n = \\Omega( d_M^2 \\log p)$ and $n = \\Omega(d_M^2\np^{2/m})$ are sufficient for the proposed algorithm to learn linear Bayesian\nnetworks with sub-Gaussian and 4m-th bounded-moment error distributions,\nrespectively, where $p$ is the number of nodes and $d_M$ is the maximum degree\nof the moralized graph. The theoretical findings are supported by extensive\nsimulation studies including real data analysis. Furthermore the proposed\nmethod is demonstrated to outperform state-of-the-art frequentist approaches,\nsuch as the BHLSM, LISTEN, and TD algorithms in synthetic data.\n","authors":["Seyong Hwang","Kyoungjae Lee","Sunmin Oh","Gunwoong Park"],"pdf_url":"https://arxiv.org/pdf/2311.15610v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.03543v2","updated":"2023-11-27T08:09:20Z","published":"2021-03-05T08:45:43Z","title":"Artificial Neural Networks generated by Low Discrepancy Sequences","summary":" Artificial neural networks can be represented by paths. Generated as random\nwalks on a dense network graph, we find that the resulting sparse networks\nallow for deterministic initialization and even weights with fixed sign. Such\nnetworks can be trained sparse from scratch, avoiding the expensive procedure\nof training a dense network and compressing it afterwards. Although sparse,\nweights are accessed as contiguous blocks of memory. In addition, enumerating\nthe paths using deterministic low discrepancy sequences, for example the Sobol'\nsequence, amounts to connecting the layers of neural units by progressive\npermutations, which naturally avoids bank conflicts in parallel computer\nhardware. We demonstrate that the artificial neural networks generated by low\ndiscrepancy sequences can achieve an accuracy within reach of their dense\ncounterparts at a much lower computational complexity.\n","authors":["Alexander Keller","Matthijs Van keirsbilck"],"pdf_url":"https://arxiv.org/pdf/2103.03543v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15609v1","updated":"2023-11-27T08:06:56Z","published":"2023-11-27T08:06:56Z","title":"A manometric feature descriptor with linear-SVM to distinguish\n esophageal contraction vigor","summary":" n clinical, if a patient presents with nonmechanical obstructive dysphagia,\nesophageal chest pain, and gastro esophageal reflux symptoms, the physician\nwill usually assess the esophageal dynamic function. High-resolution manometry\n(HRM) is a clinically commonly used technique for detection of esophageal\ndynamic function comprehensively and objectively. However, after the results of\nHRM are obtained, doctors still need to evaluate by a variety of parameters.\nThis work is burdensome, and the process is complex. We conducted image\nprocessing of HRM to predict the esophageal contraction vigor for assisting the\nevaluation of esophageal dynamic function. Firstly, we used Feature-Extraction\nand Histogram of Gradients (FE-HOG) to analyses feature of proposal of swallow\n(PoS) to further extract higher-order features. Then we determine the\nclassification of esophageal contraction vigor normal, weak and failed by using\nlinear-SVM according to these features. Our data set includes 3000 training\nsets, 500 validation sets and 411 test sets. 
After verification our accuracy\nreaches 86.83%, which is higher than other common machine learning methods.\n","authors":["Jialin Liu","Lu Yan","Xiaowei Liu","Yuzhuo Dai","Fanggen Lu","Yuanting Ma","Muzhou Hou","Zheng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14468v2","updated":"2023-11-27T08:04:04Z","published":"2023-11-24T13:21:35Z","title":"Efficient Gradient Estimation via Adaptive Sampling and Importance\n Sampling","summary":" Machine learning problems rely heavily on stochastic gradient descent (SGD)\nfor optimization. The effectiveness of SGD is contingent upon accurately\nestimating gradients from a mini-batch of data samples. Instead of the commonly\nused uniform sampling, adaptive or importance sampling reduces noise in\ngradient estimation by forming mini-batches that prioritize crucial data\npoints. Previous research has suggested that data points should be selected\nwith probabilities proportional to their gradient norm. Nevertheless, existing\nalgorithms have struggled to efficiently integrate importance sampling into\nmachine learning frameworks. In this work, we make two contributions. First, we\npresent an algorithm that can incorporate existing importance functions into\nour framework. Second, we propose a simplified importance function that relies\nsolely on the loss gradient of the output layer. By leveraging our proposed\ngradient estimation techniques, we observe improved convergence in\nclassification and regression tasks with minimal computational overhead. We\nvalidate the effectiveness of our adaptive and importance-sampling approach on\nimage and point-cloud datasets.\n","authors":["Corentin Salaün","Xingchang Huang","Iliyan Georgiev","Niloy J. Mitra","Gurprit Singh"],"pdf_url":"https://arxiv.org/pdf/2311.14468v2.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.15603v1","updated":"2023-11-27T07:53:44Z","published":"2023-11-27T07:53:44Z","title":"QuickDrop: Efficient Federated Unlearning by Integrated Dataset\n Distillation","summary":" Federated Unlearning (FU) aims to delete specific training data from an ML\nmodel trained using Federated Learning (FL). We introduce QuickDrop, an\nefficient and original FU method that utilizes dataset distillation (DD) to\naccelerate unlearning and drastically reduces computational overhead compared\nto existing approaches. In QuickDrop, each client uses DD to generate a compact\ndataset representative of the original training dataset, called a distilled\ndataset, and uses this compact dataset during unlearning. To unlearn specific\nknowledge from the global model, QuickDrop has clients execute Stochastic\nGradient Ascent with samples from the distilled datasets, thus significantly\nreducing computational overhead compared to conventional FU methods. We further\nincrease the efficiency of QuickDrop by ingeniously integrating DD into the FL\ntraining process. By reusing the gradient updates produced during FL training\nfor DD, the overhead of creating distilled datasets becomes close to\nnegligible. Evaluations on three standard datasets show that, with comparable\naccuracy guarantees, QuickDrop reduces the duration of unlearning by 463.8x\ncompared to model retraining from scratch and 65.1x compared to existing FU\napproaches. 
We also demonstrate the scalability of QuickDrop with 100 clients\nand show its effectiveness while handling multiple unlearning operations.\n","authors":["Akash Dhasade","Yaohong Ding","Song Guo","Anne-marie Kermarrec","Martijn De Vos","Leijie Wu"],"pdf_url":"https://arxiv.org/pdf/2311.15603v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15598v1","updated":"2023-11-27T07:48:50Z","published":"2023-11-27T07:48:50Z","title":"Optimal Clustering of Discrete Mixtures: Binomial, Poisson, Block\n Models, and Multi-layer Networks","summary":" In this paper, we first study the fundamental limit of clustering networks\nwhen a multi-layer network is present. Under the mixture multi-layer stochastic\nblock model (MMSBM), we show that the minimax optimal network clustering error\nrate, which takes an exponential form and is characterized by the Renyi\ndivergence between the edge probability distributions of the component\nnetworks. We propose a novel two-stage network clustering method including a\ntensor-based initialization algorithm involving both node and sample splitting\nand a refinement procedure by likelihood-based Lloyd algorithm. Network\nclustering must be accompanied by node community detection. Our proposed\nalgorithm achieves the minimax optimal network clustering error rate and allows\nextreme network sparsity under MMSBM. Numerical simulations and real data\nexperiments both validate that our method outperforms existing methods.\nOftentimes, the edges of networks carry count-type weights. We then extend our\nmethodology and analysis framework to study the minimax optimal clustering\nerror rate for mixture of discrete distributions including Binomial, Poisson,\nand multi-layer Poisson networks. The minimax optimal clustering error rates in\nthese discrete mixtures all take the same exponential form characterized by the\nRenyi divergences. These optimal clustering error rates in discrete mixtures\ncan also be achieved by our proposed two-stage clustering algorithm.\n","authors":["Zhongyuan Lyu","Ting Li","Dong Xia"],"pdf_url":"https://arxiv.org/pdf/2311.15598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15599v1","updated":"2023-11-27T07:48:50Z","published":"2023-11-27T07:48:50Z","title":"UniRepLKNet: A Universal Perception Large-Kernel ConvNet for Audio,\n Video, Point Cloud, Time-Series and Image Recognition","summary":" Large-kernel convolutional neural networks (ConvNets) have recently received\nextensive research attention, but there are two unresolved and critical issues\nthat demand further investigation. 1) The architectures of existing\nlarge-kernel ConvNets largely follow the design principles of conventional\nConvNets or transformers, while the architectural design for large-kernel\nConvNets remains under-addressed. 2) As transformers have dominated multiple\nmodalities, it remains to be investigated whether ConvNets also have a strong\nuniversal perception ability in domains beyond vision. In this paper, we\ncontribute from two aspects. 1) We propose four architectural guidelines for\ndesigning large-kernel ConvNets, the core of which is to exploit the essential\ncharacteristics of large kernels that distinguish them from small kernels -\nthey can see wide without going deep. Following such guidelines, our proposed\nlarge-kernel ConvNet shows leading performance in image recognition. 
For\nexample, our models achieve an ImageNet accuracy of 88.0%, ADE20K mIoU of\n55.6%, and COCO box AP of 56.4%, demonstrating better performance and higher\nspeed than a number of recently proposed powerful competitors. 2) We discover\nthat large kernels are the key to unlocking the exceptional performance of\nConvNets in domains where they were originally not proficient. With certain\nmodality-related preprocessing approaches, the proposed model achieves\nstate-of-the-art performance on time-series forecasting and audio recognition\ntasks even without modality-specific customization to the architecture. Code\nand all the models at https://github.com/AILab-CVC/UniRepLKNet.\n","authors":["Xiaohan Ding","Yiyuan Zhang","Yixiao Ge","Sijie Zhao","Lin Song","Xiangyu Yue","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2311.15599v1.pdf","comment":"Code, all the models and reproducible training scripts at\n https://github.com/AILab-CVC/UniRepLKNet"},{"id":"http://arxiv.org/abs/2311.12612v3","updated":"2023-11-27T07:41:06Z","published":"2023-11-21T13:54:08Z","title":"A New Type Of Upper And Lower Bounds On Right-Tail Probabilities Of\n Continuous Random Variables","summary":" In this paper, I present a completely new type of upper and lower bounds on\nthe right-tail probabilities of continuous random variables with unbounded\nsupport and with semi-bounded support from the left. The presented upper and\nlower right-tail bounds depend only on the probability density function (PDF),\nits first derivative, and two parameters that are used for tightening the\nbounds. These tail bounds hold under certain conditions that depend on the PDF,\nits first and second derivatives, and the two parameters. The new tail bounds\nare shown to be tight for a wide range of continuous random variables via\nnumerical examples.\n","authors":["Nikola Zlatanov"],"pdf_url":"https://arxiv.org/pdf/2311.12612v3.pdf","comment":"Minor typos corrected v2"},{"id":"http://arxiv.org/abs/2310.20587v4","updated":"2023-11-27T07:38:06Z","published":"2023-10-31T16:24:17Z","title":"Unleashing the Power of Pre-trained Language Models for Offline\n Reinforcement Learning","summary":" Offline reinforcement learning (RL) aims to find a near-optimal policy using\npre-collected datasets. In real-world scenarios, data collection could be\ncostly and risky; therefore, offline RL becomes particularly challenging when\nthe in-domain data is limited. Given recent advances in Large Language Models\n(LLMs) and their few-shot learning prowess, this paper introduces\n$\\textbf{La}$nguage Models for $\\textbf{Mo}$tion Control ($\\textbf{LaMo}$), a\ngeneral framework based on Decision Transformers to effectively use pre-trained\nLanguage Models (LMs) for offline RL. Our framework highlights four crucial\ncomponents: (1) Initializing Decision Transformers with sequentially\npre-trained LMs, (2) employing the LoRA fine-tuning method, in contrast to\nfull-weight fine-tuning, to combine the pre-trained knowledge from LMs and\nin-domain knowledge effectively, (3) using the non-linear MLP transformation\ninstead of linear projections, to generate embeddings, and (4) integrating an\nauxiliary language prediction loss during fine-tuning to stabilize the LMs and\nretain their original abilities on languages. Empirical results indicate\n$\\textbf{LaMo}$ achieves state-of-the-art performance in sparse-reward tasks\nand closes the gap between value-based offline RL methods and decision\ntransformers in dense-reward tasks. 
In particular, our method demonstrates\nsuperior performance in scenarios with limited data samples.\n","authors":["Ruizhe Shi","Yuyao Liu","Yanjie Ze","Simon S. Du","Huazhe Xu"],"pdf_url":"https://arxiv.org/pdf/2310.20587v4.pdf","comment":"24 pages, 16 tables"},{"id":"http://arxiv.org/abs/2311.15587v1","updated":"2023-11-27T07:25:47Z","published":"2023-11-27T07:25:47Z","title":"Quantum Langevin Dynamics for Optimization","summary":" We initiate the study of utilizing Quantum Langevin Dynamics (QLD) to solve\noptimization problems, particularly those non-convex objective functions that\npresent substantial obstacles for traditional gradient descent algorithms.\nSpecifically, we examine the dynamics of a system coupled with an infinite heat\nbath. This interaction induces both random quantum noise and a deterministic\ndamping effect to the system, which nudge the system towards a steady state\nthat hovers near the global minimum of objective functions. We theoretically\nprove the convergence of QLD in convex landscapes, demonstrating that the\naverage energy of the system can approach zero in the low temperature limit\nwith an exponential decay rate correlated with the evolution time. Numerically,\nwe first show the energy dissipation capability of QLD by retracing its origins\nto spontaneous emission. Furthermore, we conduct detailed discussion of the\nimpact of each parameter. Finally, based on the observations when comparing QLD\nwith classical Fokker-Plank-Smoluchowski equation, we propose a time-dependent\nQLD by making temperature and $\\hbar$ time-dependent parameters, which can be\ntheoretically proven to converge better than the time-independent case and also\noutperforms a series of state-of-the-art quantum and classical optimization\nalgorithms in many non-convex landscapes.\n","authors":["Zherui Chen","Yuchen Lu","Hao Wang","Yizhou Liu","Tongyang Li"],"pdf_url":"https://arxiv.org/pdf/2311.15587v1.pdf","comment":"33 pages, 1 table, 26 figures"},{"id":"http://arxiv.org/abs/2311.14412v2","updated":"2023-11-27T07:20:42Z","published":"2023-11-24T11:12:26Z","title":"A Comparison of PDF Projection with Normalizing Flows and SurVAE","summary":" Normalizing flows (NF) recently gained attention as a way to construct\ngenerative networks with exact likelihood calculation out of composable layers.\nHowever, NF is restricted to dimension-preserving transformations. Surjection\nVAE (SurVAE) has been proposed to extend NF to dimension-altering\ntransformations. Such networks are desirable because they are expressive and\ncan be precisely trained. We show that the approaches are a re-invention of PDF\nprojection, which appeared over twenty years earlier and is much further\ndeveloped.\n","authors":["Paul M. Baggenstoss","Felix Govaers"],"pdf_url":"https://arxiv.org/pdf/2311.14412v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15584v1","updated":"2023-11-27T07:19:41Z","published":"2023-11-27T07:19:41Z","title":"A deep learning approach for marine snow synthesis and removal","summary":" Marine snow, the floating particles in underwater images, severely degrades\nthe visibility and performance of human and machine vision systems. This paper\nproposes a novel method to reduce the marine snow interference using deep\nlearning techniques. We first synthesize realistic marine snow samples by\ntraining a Generative Adversarial Network (GAN) model and combine them with\nnatural underwater images to create a paired dataset. 
We then train a U-Net\nmodel to perform marine snow removal as an image to image translation task. Our\nexperiments show that the U-Net model can effectively remove both synthetic and\nnatural marine snow with high accuracy, outperforming state-of-the-art methods\nsuch as the Median filter and its adaptive variant. We also demonstrate the\nrobustness of our method by testing it on the MSRB dataset, which contains\nsynthetic artifacts that our model has not seen during training. Our method is\na practical and efficient solution for enhancing underwater images affected by\nmarine snow.\n","authors":["Fernando Galetto","Guang Deng"],"pdf_url":"https://arxiv.org/pdf/2311.15584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15583v1","updated":"2023-11-27T07:19:23Z","published":"2023-11-27T07:19:23Z","title":"A Simple Geometric-Aware Indoor Positioning Interpolation Algorithm\n Based on Manifold Learning","summary":" Interpolation methodologies have been widely used within the domain of indoor\npositioning systems. However, existing indoor positioning interpolation\nalgorithms exhibit several inherent limitations, including reliance on complex\nmathematical models, limited flexibility, and relatively low precision. To\nenhance the accuracy and efficiency of indoor positioning interpolation\ntechniques, this paper proposes a simple yet powerful geometric-aware\ninterpolation algorithm for indoor positioning tasks. The key to our algorithm\nis to exploit the geometric attributes of the local topological manifold using\nmanifold learning principles. Therefore, instead of constructing complicated\nmathematical models, the proposed algorithm facilitates the more precise and\nefficient estimation of points grounded in the local topological manifold.\nMoreover, our proposed method can be effortlessly integrated into any indoor\npositioning system, thereby bolstering its adaptability. Through a systematic\narray of experiments and comprehensive performance analyses conducted on both\nsimulated and real-world datasets, we demonstrate that the proposed algorithm\nconsistently outperforms the most commonly used and representative\ninterpolation approaches regarding interpolation accuracy and efficiency.\nFurthermore, the experimental results also underscore the substantial practical\nutility of our method and its potential applicability in real-time indoor\npositioning scenarios.\n","authors":["Suorong Yang","Geng Zhang","Jian Zhao","Furao Shen"],"pdf_url":"https://arxiv.org/pdf/2311.15583v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15582v1","updated":"2023-11-27T07:19:22Z","published":"2023-11-27T07:19:22Z","title":"Lightly Weighted Automatic Audio Parameter Extraction for the Quality\n Assessment of Consensus Auditory-Perceptual Evaluation of Voice","summary":" The Consensus Auditory-Perceptual Evaluation of Voice is a widely employed\ntool in clinical voice quality assessment that is significant for streaming\ncommunication among clinical professionals and benchmarking for the\ndetermination of further treatment. Currently, because the assessment relies on\nexperienced clinicians, it tends to be inconsistent, and thus, difficult to\nstandardize. To address this problem, we propose to leverage lightly weighted\nautomatic audio parameter extraction, to increase the clinical relevance,\nreduce the complexity, and enhance the interpretability of voice quality\nassessment. 
The proposed method utilizes age, sex, and five audio parameters:\njitter, absolute jitter, shimmer, harmonic-to-noise ratio (HNR), and zero\ncrossing. A classical machine learning approach is employed. The result reveals\nthat our approach performs similar to state-of-the-art (SOTA) methods, and\noutperforms the latent representation obtained by using popular audio\npre-trained models. This approach provide insights into the feasibility of\ndifferent feature extraction approaches for voice evaluation. Audio parameters\nsuch as jitter and the HNR are proven to be suitable for characterizing voice\nquality attributes, such as roughness and strain. Conversely, pre-trained\nmodels exhibit limitations in effectively addressing noise-related scorings.\nThis study contributes toward more comprehensive and precise voice quality\nevaluations, achieved by a comprehensively exploring diverse assessment\nmethodologies.\n","authors":["Yi-Heng Lin","Wen-Hsuan Tseng","Li-Chin Chen","Ching-Ting Tan","Yu Tsao"],"pdf_url":"https://arxiv.org/pdf/2311.15582v1.pdf","comment":"Published in IEEE 42th International Conference on Consumer\n Electronics (ICCE 2024)"},{"id":"http://arxiv.org/abs/2311.15578v1","updated":"2023-11-27T07:11:47Z","published":"2023-11-27T07:11:47Z","title":"Experimental Analysis of Large-scale Learnable Vector Storage\n Compression","summary":" Learnable embedding vector is one of the most important applications in\nmachine learning, and is widely used in various database-related domains.\nHowever, the high dimensionality of sparse data in recommendation tasks and the\nhuge volume of corpus in retrieval-related tasks lead to a large memory\nconsumption of the embedding table, which poses a great challenge to the\ntraining and deployment of models. Recent research has proposed various methods\nto compress the embeddings at the cost of a slight decrease in model quality or\nthe introduction of other overheads. Nevertheless, the relative performance of\nthese methods remains unclear. Existing experimental comparisons only cover a\nsubset of these methods and focus on limited metrics. In this paper, we perform\na comprehensive comparative analysis and experimental evaluation of embedding\ncompression. We introduce a new taxonomy that categorizes these techniques\nbased on their characteristics and methodologies, and further develop a modular\nbenchmarking framework that integrates 14 representative methods. Under a\nuniform test environment, our benchmark fairly evaluates each approach,\npresents their strengths and weaknesses under different memory budgets, and\nrecommends the best method based on the use case. In addition to providing\nuseful guidelines, our study also uncovers the limitations of current methods\nand suggests potential directions for future research.\n","authors":["Hailin Zhang","Penghao Zhao","Xupeng Miao","Yingxia Shao","Zirui Liu","Tong Yang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2311.15578v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11509v2","updated":"2023-11-27T06:53:03Z","published":"2023-11-20T03:17:21Z","title":"Token-Level Adversarial Prompt Detection Based on Perplexity Measures\n and Contextual Information","summary":" In recent years, Large Language Models (LLM) have emerged as pivotal tools in\nvarious applications. However, these models are susceptible to adversarial\nprompt attacks, where attackers can carefully curate input strings that lead to\nundesirable outputs. 
The inherent vulnerability of LLMs stems from their\ninput-output mechanisms, especially when presented with intensely\nout-of-distribution (OOD) inputs. This paper proposes a token-level detection\nmethod to identify adversarial prompts, leveraging the LLM's capability to\npredict the next token's probability. We measure the degree of the model's\nperplexity and incorporate neighboring token information to encourage the\ndetection of contiguous adversarial prompt sequences. As a result, we propose\ntwo methods: one that identifies each token as either being part of an\nadversarial prompt or not, and another that estimates the probability of each\ntoken being part of an adversarial prompt.\n","authors":["Zhengmian Hu","Gang Wu","Saayan Mitra","Ruiyi Zhang","Tong Sun","Heng Huang","Viswanathan Swaminathan"],"pdf_url":"https://arxiv.org/pdf/2311.11509v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15570v1","updated":"2023-11-27T06:38:07Z","published":"2023-11-27T06:38:07Z","title":"UFDA: Universal Federated Domain Adaptation with Practical Assumptions","summary":" Conventional Federated Domain Adaptation (FDA) approaches usually demand an\nabundance of assumptions, such as label set consistency, which makes them\nsignificantly less feasible for real-world situations and introduces security\nhazards. In this work, we propose a more practical scenario named Universal\nFederated Domain Adaptation (UFDA). It only requires the black-box model and\nthe label set information of each source domain, while the label sets of\ndifferent source domains could be inconsistent and the target-domain label set\nis totally blind. This relaxes the assumptions made by FDA, which are often\nchallenging to meet in real-world cases and diminish model security. To address\nthe UFDA scenario, we propose a corresponding framework called Hot-Learning\nwith Contrastive Label Disambiguation (HCLD), which tackles UFDA's domain\nshifts and category gaps problem by using one-hot outputs from the black-box\nmodels of various source domains. Moreover, to better distinguish the shared\nand unknown classes, we further present a cluster-level strategy named\nMutual-Voting Decision (MVD) to extract robust consensus knowledge across peer\nclasses from both source and target domains. The extensive experiments on three\nbenchmarks demonstrate that our HCLD achieves comparable performance for our\nUFDA scenario with much fewer assumptions, compared to the previous\nmethodologies with many additional assumptions.\n","authors":["Xinhui Liu","Zhenghao Chen","Luping Zhou","Dong Xu","Wei Xi","Gairui Bai","Yihan Zhao","Jizhong Zhao"],"pdf_url":"https://arxiv.org/pdf/2311.15570v1.pdf","comment":"Submitted to AAAI2024"},{"id":"http://arxiv.org/abs/2311.15566v1","updated":"2023-11-27T06:31:17Z","published":"2023-11-27T06:31:17Z","title":"SpotServe: Serving Generative Large Language Models on Preemptible\n Instances","summary":" The high computational and memory requirements of generative large language\nmodels (LLMs) make it challenging to serve them cheaply. This paper aims to\nreduce the monetary cost for serving LLMs by leveraging preemptible GPU\ninstances on modern clouds, which offer accesses to spare GPUs at a much\ncheaper price than regular instances but may be preempted by the cloud at any\ntime. 
Serving LLMs on preemptible instances requires addressing challenges\ninduced by frequent instance preemptions and the necessity of migrating\ninstances to handle these preemptions.\n This paper presents SpotServe, the first distributed LLM serving system on\npreemptible instances. Several key techniques in SpotServe realize fast and\nreliable serving of generative LLMs on cheap preemptible instances. First,\nSpotServe dynamically adapts the LLM parallelization configuration for dynamic\ninstance availability and fluctuating workload, while balancing the trade-off\namong the overall throughput, inference latency and monetary costs. Second, to\nminimize the cost of migrating instances for dynamic reparallelization, the\ntask of migrating instances is formulated as a bipartite graph matching\nproblem, which uses the Kuhn-Munkres algorithm to identify an optimal migration\nplan that minimizes communications. Finally, to take advantage of the grace\nperiod offered by modern clouds, we introduce stateful inference recovery, a\nnew inference mechanism that commits inference progress at a much finer\ngranularity and allows SpotServe to cheaply resume inference upon preemption.\nWe evaluate on real spot instance preemption traces and various popular LLMs\nand show that SpotServe can reduce the P99 tail latency by 2.4 - 9.1x compared\nwith the best existing LLM serving systems. We also show that SpotServe can\nleverage the price advantage of preemptive instances, saving 54% monetary cost\ncompared with only using on-demand instances.\n","authors":["Xupeng Miao","Chunan Shi","Jiangfei Duan","Xiaoli Xi","Dahua Lin","Bin Cui","Zhihao Jia"],"pdf_url":"https://arxiv.org/pdf/2311.15566v1.pdf","comment":"ASPLOS 2024"},{"id":"http://arxiv.org/abs/2311.15565v1","updated":"2023-11-27T06:26:53Z","published":"2023-11-27T06:26:53Z","title":"Evaluating the Efficacy of Hybrid Deep Learning Models in Distinguishing\n AI-Generated Text","summary":" My research investigates the use of cutting-edge hybrid deep learning models\nto accurately differentiate between AI-generated text and human writing. I\napplied a robust methodology, utilising a carefully selected dataset comprising\nAI and human texts from various sources, each tagged with instructions.\nAdvanced natural language processing techniques facilitated the analysis of\ntextual features. Combining sophisticated neural networks, the custom model\nenabled it to detect nuanced differences between AI and human content.\n","authors":["Finbarrs Oketunji"],"pdf_url":"https://arxiv.org/pdf/2311.15565v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00341v2","updated":"2023-11-27T05:51:13Z","published":"2023-11-01T07:21:08Z","title":"The Open DAC 2023 Dataset and Challenges for Sorbent Discovery in Direct\n Air Capture","summary":" New methods for carbon dioxide removal are urgently needed to combat global\nclimate change. Direct air capture (DAC) is an emerging technology to capture\ncarbon dioxide directly from ambient air. Metal-organic frameworks (MOFs) have\nbeen widely studied as potentially customizable adsorbents for DAC. However,\ndiscovering promising MOF sorbents for DAC is challenging because of the vast\nchemical space to explore and the need to understand materials as functions of\nhumidity and temperature. 
We explore a computational approach benefiting from\nrecent innovations in machine learning (ML) and present a dataset named Open\nDAC 2023 (ODAC23) consisting of more than 38M density functional theory (DFT)\ncalculations on more than 8,400 MOF materials containing adsorbed $CO_2$ and/or\n$H_2O$. ODAC23 is by far the largest dataset of MOF adsorption calculations at\nthe DFT level of accuracy currently available. In addition to probing\nproperties of adsorbed molecules, the dataset is a rich source of information\non structural relaxation of MOFs, which will be useful in many contexts beyond\nspecific applications for DAC. A large number of MOFs with promising properties\nfor DAC are identified directly in ODAC23. We also trained state-of-the-art ML\nmodels on this dataset to approximate calculations at the DFT level. This\nopen-source dataset and our initial ML models will provide an important\nbaseline for future efforts to identify MOFs for a wide range of applications,\nincluding DAC.\n","authors":["Anuroop Sriram","Sihoon Choi","Xiaohan Yu","Logan M. Brabson","Abhishek Das","Zachary Ulissi","Matt Uyttendaele","Andrew J. Medford","David S. Sholl"],"pdf_url":"https://arxiv.org/pdf/2311.00341v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15551v1","updated":"2023-11-27T05:35:49Z","published":"2023-11-27T05:35:49Z","title":"Instruct2Attack: Language-Guided Semantic Adversarial Attacks","summary":" We propose Instruct2Attack (I2A), a language-guided semantic attack that\ngenerates semantically meaningful perturbations according to free-form language\ninstructions. We make use of state-of-the-art latent diffusion models, where we\nadversarially guide the reverse diffusion process to search for an adversarial\nlatent code conditioned on the input image and text instruction. Compared to\nexisting noise-based and semantic attacks, I2A generates more natural and\ndiverse adversarial examples while providing better controllability and\ninterpretability. We further automate the attack process with GPT-4 to generate\ndiverse image-specific text instructions. We show that I2A can successfully\nbreak state-of-the-art deep neural networks even under strong adversarial\ndefenses, and demonstrate great transferability among a variety of network\narchitectures.\n","authors":["Jiang Liu","Chen Wei","Yuxiang Guo","Heng Yu","Alan Yuille","Soheil Feizi","Chun Pong Lau","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2311.15551v1.pdf","comment":"under submission, code coming soon"},{"id":"http://arxiv.org/abs/2311.15549v1","updated":"2023-11-27T05:29:43Z","published":"2023-11-27T05:29:43Z","title":"From Prediction to Action: The Critical Role of Proper Performance\n Estimation for Machine-Learning-Driven Materials Discovery","summary":" Materials discovery driven by statistical property models is an iterative\ndecision process, during which an initial data collection is extended with new\ndata proposed by a model-informed acquisition function--with the goal to\nmaximize a certain \"reward\" over time, such as the maximum property value\ndiscovered so far. While the materials science community achieved much progress\nin developing property models that predict well on average with respect to the\ntraining distribution, this form of in-distribution performance measurement is\nnot directly coupled with the discovery reward. This is because an iterative\ndiscovery process has a shifting reward distribution that is\nover-proportionally determined by the model performance for exceptional\nmaterials. 
We demonstrate this problem using the example of bulk modulus\nmaximization among double perovskite oxides. We find that the in-distribution\npredictive performance suggests random forests as superior to Gaussian process\nregression, while the results are inverse in terms of the discovery rewards. We\nargue that the lack of proper performance estimation methods from pre-computed\ndata collections is a fundamental problem for improving data-driven materials\ndiscovery, and we propose a novel such estimator that, in contrast to na\\\"ive\nreward estimation, successfully predicts Gaussian processes with the \"expected\nimprovement\" acquisition function as the best out of four options in our\ndemonstrational study for double perovskites. Importantly, it does so without\nrequiring the over thousand ab initio computations that were needed to confirm\nthis prediction.\n","authors":["Mario Boley","Felix Luong","Simon Teshuva","Daniel F Schmidt","Lucas Foppa","Matthias Scheffler"],"pdf_url":"https://arxiv.org/pdf/2311.15549v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15548v1","updated":"2023-11-27T05:27:13Z","published":"2023-11-27T05:27:13Z","title":"Deficiency of Large Language Models in Finance: An Empirical Examination\n of Hallucination","summary":" The hallucination issue is recognized as a fundamental deficiency of large\nlanguage models (LLMs), especially when applied to fields such as finance,\neducation, and law. Despite the growing concerns, there has been a lack of\nempirical investigation. In this paper, we provide an empirical examination of\nLLMs' hallucination behaviors in financial tasks. First, we empirically\ninvestigate LLM model's ability of explaining financial concepts and\nterminologies. Second, we assess LLM models' capacity of querying historical\nstock prices. Third, to alleviate the hallucination issue, we evaluate the\nefficacy of four practical methods, including few-shot learning, Decoding by\nContrasting Layers (DoLa), the Retrieval Augmentation Generation (RAG) method\nand the prompt-based tool learning method for a function to generate a query\ncommand. Finally, our major finding is that off-the-shelf LLMs experience\nserious hallucination behaviors in financial tasks. Therefore, there is an\nurgent need to call for research efforts in mitigating LLMs' hallucination.\n","authors":["Haoqiang Kang","Xiao-Yang Liu"],"pdf_url":"https://arxiv.org/pdf/2311.15548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15547v1","updated":"2023-11-27T05:23:01Z","published":"2023-11-27T05:23:01Z","title":"Dataset Distillation in Latent Space","summary":" Dataset distillation (DD) is a newly emerging research area aiming at\nalleviating the heavy computational load in training models on large datasets.\nIt tries to distill a large dataset into a small and condensed one so that\nmodels trained on the distilled dataset can perform comparably with those\ntrained on the full dataset when performing downstream tasks. Among the\nprevious works in this area, there are three key problems that hinder the\nperformance and availability of the existing DD methods: high time complexity,\nhigh space complexity, and low info-compactness. In this work, we\nsimultaneously attempt to settle these three problems by moving the DD\nprocesses from conventionally used pixel space to latent space. 
Encoded by a\npretrained generic autoencoder, latent codes in the latent space are naturally\ninfo-compact representations of the original images in much smaller sizes.\nAfter transferring three mainstream DD algorithms to latent space, we\nsignificantly reduce time and space consumption while achieving similar\nperformance, allowing us to distill high-resolution datasets or target at\ngreater data ratio that previous methods have failed. Besides, within the same\nstorage budget, we can also quantitatively deliver more latent codes than\npixel-level images, which further boosts the performance of our methods.\n","authors":["Yuxuan Duan","Jianfu Zhang","Liqing Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15547v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2311.15545v1","updated":"2023-11-27T05:21:08Z","published":"2023-11-27T05:21:08Z","title":"Out-of-Distribution Generalized Dynamic Graph Neural Network for Human\n Albumin Prediction","summary":" Human albumin is essential for indicating the body's overall health.\nAccurately predicting plasma albumin levels and determining appropriate doses\nare urgent clinical challenges, particularly in critically ill patients, to\nmaintain optimal blood levels. However, human albumin prediction is non-trivial\nthat has to leverage the dynamics of biochemical markers as well as the\nexperience of treating patients. Moreover, the problem of distribution shift is\noften encountered in real clinical data, which may lead to a decline in the\nmodel prediction performance and reduce the reliability of the model's\napplication. In this paper, we propose a framework named Out-of-Distribution\nGeneralized Dynamic Graph Neural Network for Human Albumin Prediction\n(DyG-HAP), which is able to provide accurate albumin predictions for Intensity\nCare Unit (ICU) patients during hospitalization. We first model human albumin\nprediction as a dynamic graph regression problem to model the dynamics and\npatient relationship. Then, we propose a disentangled dynamic graph attention\nmechanism to capture and disentangle the patterns whose relationship to labels\nunder distribution shifts is invariant and variant respectively. Last, we\npropose an invariant dynamic graph regression method to encourage the model to\nrely on invariant patterns to make predictions. Moreover, we propose a dataset\nnamed Albumin level testing and nutritional dosing data for Intensive Care\n(ANIC) for evaluation. Extensive experiments demonstrate the superiority of our\nmethod compared to several baseline methods in human albumin prediction.\n","authors":["Zeyang Zhang","Xingwang Li","Fei Teng","Ning Lin","Xueling Zhu","Xin Wang","Wenwu Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.15545v1.pdf","comment":"MedAI'23"},{"id":"http://arxiv.org/abs/2309.01947v2","updated":"2023-11-27T05:03:31Z","published":"2023-09-05T04:47:55Z","title":"TODM: Train Once Deploy Many Efficient Supernet-Based RNN-T Compression\n For On-device ASR Models","summary":" Automatic Speech Recognition (ASR) models need to be optimized for specific\nhardware before they can be deployed on devices. This can be done by tuning the\nmodel's hyperparameters or exploring variations in its architecture.\nRe-training and re-validating models after making these changes can be a\nresource-intensive task. This paper presents TODM (Train Once Deploy Many), a\nnew approach to efficiently train many sizes of hardware-friendly on-device ASR\nmodels with comparable GPU-hours to that of a single training job. 
TODM\nleverages insights from prior work on Supernet, where Recurrent Neural Network\nTransducer (RNN-T) models share weights within a Supernet. It reduces layer\nsizes and widths of the Supernet to obtain subnetworks, making them smaller\nmodels suitable for all hardware types. We introduce a novel combination of\nthree techniques to improve the outcomes of the TODM Supernet: adaptive\ndropouts, an in-place Alpha-divergence knowledge distillation, and the use of\nScaledAdam optimizer. We validate our approach by comparing Supernet-trained\nversus individually tuned Multi-Head State Space Model (MH-SSM) RNN-T using\nLibriSpeech. Results demonstrate that our TODM Supernet either matches or\nsurpasses the performance of manually tuned models by up to a relative of 3%\nbetter in word error rate (WER), while efficiently keeping the cost of training\nmany models at a small constant.\n","authors":["Yuan Shangguan","Haichuan Yang","Danni Li","Chunyang Wu","Yassir Fathullah","Dilin Wang","Ayushi Dalmia","Raghuraman Krishnamoorthi","Ozlem Kalinli","Junteng Jia","Jay Mahadeokar","Xin Lei","Mike Seltzer","Vikas Chandra"],"pdf_url":"https://arxiv.org/pdf/2309.01947v2.pdf","comment":"Meta AI; Submitted to ICASSP 2024"},{"id":"http://arxiv.org/abs/2308.16781v4","updated":"2023-11-27T05:03:14Z","published":"2023-08-31T14:59:32Z","title":"StratMed: Relevance Stratification between Biomedical Entities for\n Sparsity on Medication Recommendation","summary":" With the growing imbalance between limited medical resources and escalating\ndemands, AI-based clinical tasks have become paramount. As a sub-domain,\nmedication recommendation aims to amalgamate longitudinal patient history with\nmedical knowledge, assisting physicians in prescribing safer and more accurate\nmedication combinations. Existing works ignore the inherent long-tailed\ndistribution of medical data, have uneven learning strengths for hot and sparse\ndata, and fail to balance safety and accuracy. To address the above\nlimitations, we propose StratMed, which introduces a stratification strategy\nthat overcomes the long-tailed problem and achieves fuller learning of sparse\ndata. It also utilizes a dual-property network to address the issue of mutual\nconstraints on the safety and accuracy of medication combinations,\nsynergistically enhancing these two properties. Specifically, we construct a\npre-training method using deep learning networks to obtain medication and\ndisease representations. After that, we design a pyramid-like stratification\nmethod based on relevance to strengthen the expressiveness of sparse data.\nBased on this relevance, we design two graph structures to express medication\nsafety and precision at the same level to obtain patient representations.\nFinally, the patient's historical clinical information is fitted to generate\nmedication combinations for the current health condition. We employed the\nMIMIC-III dataset to evaluate our model against state-of-the-art methods in\nthree aspects comprehensively. 
Compared to the sub-optimal baseline model, our\nmodel reduces safety risk by 15.08\\%, improves accuracy by 0.36\\%, and reduces\ntraining time consumption by 81.66\\%.\n","authors":["Xiang Li","Shunpan Liang","Yulei Hou","Tengfei Ma"],"pdf_url":"https://arxiv.org/pdf/2308.16781v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15536v1","updated":"2023-11-27T04:49:24Z","published":"2023-11-27T04:49:24Z","title":"SVRDA: A Web-based Dataset Annotation Tool for Slice-to-Volume\n Registration","summary":" Background and Objective: The lack of benchmark datasets has impeded the\ndevelopment of slice-to-volume registration algorithms. Such datasets are\ndifficult to annotate, primarily due to the dimensional difference within data\nand the dearth of task-specific software. We aim to develop a user-friendly\ntool to streamline dataset annotation for slice-to-volume registration.\n Methods: The proposed tool, named SVRDA, is an installation-free web\napplication for platform-agnostic collaborative dataset annotation. It enables\nefficient transformation manipulation via keyboard shortcuts and smooth case\ntransitions with auto-saving. SVRDA supports configuration-based data loading\nand adheres to the separation of concerns, offering great flexibility and\nextensibility for future research. Various supplementary features have been\nimplemented to facilitate slice-to-volume registration.\n Results: We validated the effectiveness of SVRDA by indirectly evaluating the\npost-registration segmentation quality on UK Biobank data, observing a dramatic\noverall improvement (24.02% in the Dice Similarity Coefficient and 48.93% in\nthe 95th percentile Hausdorff distance, respectively) supported by highly\nstatistically significant evidence ($p<0.001$).We further showcased the\nclinical usage of SVRDA by integrating it into test-retest T1 quantification on\nin-house magnetic resonance images, leading to more consistent results after\nregistration.\n Conclusions: SVRDA can facilitate collaborative annotation of benchmark\ndatasets while being potentially applicable to other pipelines incorporating\nslice-to-volume registration. Full source code and documentation are available\nat https://github.com/Roldbach/SVRDA\n","authors":["Weixun Luo","Alexandre Triay Bagur","Paul Aljabar","George Ralli","Sir Michael Brady"],"pdf_url":"https://arxiv.org/pdf/2311.15536v1.pdf","comment":"18 pages, 11 figures, In submission to Computer Methods and Programs\n in Biomedicine"},{"id":"http://arxiv.org/abs/2205.13748v2","updated":"2023-11-27T04:41:51Z","published":"2022-05-27T03:24:31Z","title":"Auto-PINN: Understanding and Optimizing Physics-Informed Neural\n Architecture","summary":" Physics-informed neural networks (PINNs) are revolutionizing science and\nengineering practice by bringing together the power of deep learning to bear on\nscientific computation. In forward modeling problems, PINNs are meshless\npartial differential equation (PDE) solvers that can handle irregular,\nhigh-dimensional physical domains. Naturally, the neural architecture\nhyperparameters have a large impact on the efficiency and accuracy of the PINN\nsolver. However, this remains an open and challenging problem because of the\nlarge search space and the difficulty of identifying a proper search objective\nfor PDEs. Here, we propose Auto-PINN, the first systematic, automated\nhyperparameter optimization approach for PINNs, which employs Neural\nArchitecture Search (NAS) techniques to PINN design. 
Auto-PINN avoids manually\nor exhaustively searching the hyperparameter space associated with PINNs. A\ncomprehensive set of pre-experiments using standard PDE benchmarks allows us to\nprobe the structure-performance relationship in PINNs. We find that the\ndifferent hyperparameters can be decoupled, and that the training loss function\nof PINNs is a good search objective. Comparison experiments with baseline\nmethods demonstrate that Auto-PINN produces neural architectures with superior\nstability and accuracy over alternative baselines.\n","authors":["Yicheng Wang","Xiaotian Han","Chia-Yuan Chang","Daochen Zha","Ulisses Braga-Neto","Xia Hu"],"pdf_url":"https://arxiv.org/pdf/2205.13748v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15530v1","updated":"2023-11-27T04:23:47Z","published":"2023-11-27T04:23:47Z","title":"SSIN: Self-Supervised Learning for Rainfall Spatial Interpolation","summary":" The acquisition of accurate rainfall distribution in space is an important\ntask in hydrological analysis and natural disaster pre-warning. However, it is\nimpossible to install rain gauges on every corner. Spatial interpolation is a\ncommon way to infer rainfall distribution based on available raingauge data.\nHowever, the existing works rely on some unrealistic pre-settings to capture\nspatial correlations, which limits their performance in real scenarios. To\ntackle this issue, we propose the SSIN, which is a novel data-driven\nself-supervised learning framework for rainfall spatial interpolation by mining\nlatent spatial patterns from historical observation data. Inspired by the Cloze\ntask and BERT, we fully consider the characteristics of spatial interpolation\nand design the SpaFormer model based on the Transformer architecture as the\ncore of SSIN. Our main idea is: by constructing rich self-supervision signals\nvia random masking, SpaFormer can learn informative embeddings for raw data and\nthen adaptively model spatial correlations based on rainfall spatial context.\nExtensive experiments on two real-world raingauge datasets show that our method\noutperforms the state-of-the-art solutions. In addition, we take traffic\nspatial interpolation as another use case to further explore the performance of\nour method, and SpaFormer achieves the best performance on one large real-world\ntraffic dataset, which further confirms the effectiveness and generality of our\nmethod.\n","authors":["Jia Li","Yanyan Shen","Lei Chen","Charles Wang Wai NG"],"pdf_url":"https://arxiv.org/pdf/2311.15530v1.pdf","comment":"SIGMOD 2023 Data-intensive Applications (DIA) Track; Code is\n available at https://github.com/jlidw/SSIN"},{"id":"http://arxiv.org/abs/2308.12532v2","updated":"2023-11-27T03:33:37Z","published":"2023-08-24T03:43:02Z","title":"FedSoL: Bridging Global Alignment and Local Generality in Federated\n Learning","summary":" Federated Learning (FL) aggregates locally trained models from individual\nclients to construct a global model. While FL enables learning a model with\ndata privacy, it often suffers from significant performance degradation when\nclient data distributions are heterogeneous. Many previous FL algorithms have\naddressed this issue by introducing various proximal restrictions. These\nrestrictions aim to encourage global alignment by constraining the deviation of\nlocal learning from the global objective. However, they inherently limit local\nlearning by interfering with the original local objectives. 
Recently, an\nalternative approach has emerged to improve local learning generality. By\nobtaining local models within a smooth loss landscape, this approach mitigates\nconflicts among different local objectives of the clients. Yet, it does not\nensure stable global alignment, as local learning does not take the global\nobjective into account. In this study, we propose Federated Stability on\nLearning (FedSoL), which combines both the concepts of global alignment and\nlocal generality. In FedSoL, the local learning seeks a parameter region robust\nagainst proximal perturbations. This strategy introduces an implicit proximal\nrestriction effect in local learning while maintaining the original local\nobjective for parameter update. Our experiments show that FedSoL consistently\nachieves state-of-the-art performance on various setups.\n","authors":["Gihun Lee","Minchan Jeong","Sangmook Kim","Jaehoon Oh","Se-Young Yun"],"pdf_url":"https://arxiv.org/pdf/2308.12532v2.pdf","comment":"19 pages, 12 figures"},{"id":"http://arxiv.org/abs/2311.15516v1","updated":"2023-11-27T03:25:12Z","published":"2023-11-27T03:25:12Z","title":"Active Foundational Models for Fault Diagnosis of Electrical Motors","summary":" Fault detection and diagnosis of electrical motors are of utmost importance\nin ensuring the safe and reliable operation of several industrial systems.\nDetection and diagnosis of faults at the incipient stage allows corrective\nactions to be taken in order to reduce the severity of faults. The existing\ndata-driven deep learning approaches for machine fault diagnosis rely\nextensively on huge amounts of labeled samples, where annotations are expensive\nand time-consuming. However, a major portion of unlabeled condition monitoring\ndata is not exploited in the training process. To overcome this limitation, we\npropose a foundational model-based Active Learning framework that utilizes less\namount of labeled samples, which are most informative and harnesses a large\namount of available unlabeled data by effectively combining Active Learning and\nContrastive Self-Supervised Learning techniques. It consists of a transformer\nnetwork-based backbone model trained using an advanced nearest-neighbor\ncontrastive self-supervised learning method. This approach empowers the\nbackbone to learn improved representations of samples derived from raw,\nunlabeled vibration data. Subsequently, the backbone can undergo fine-tuning to\naddress a range of downstream tasks, both within the same machines and across\ndifferent machines. The effectiveness of the proposed methodology has been\nassessed through the fine-tuning of the backbone for multiple target tasks\nusing three distinct machine-bearing fault datasets. The experimental\nevaluation demonstrates a superior performance as compared to existing\nstate-of-the-art fault diagnosis methods with less amount of labeled data.\n","authors":["Sriram Anbalagan","Sai Shashank GP","Deepesh Agarwal","Balasubramaniam Natarajan","Babji Srinivasan"],"pdf_url":"https://arxiv.org/pdf/2311.15516v1.pdf","comment":"30 pages, 2 figures, 7 tables"},{"id":"http://arxiv.org/abs/2311.15513v1","updated":"2023-11-27T03:17:09Z","published":"2023-11-27T03:17:09Z","title":"A Comparative and Experimental Study on Automatic Question Answering\n Systems and its Robustness against Word Jumbling","summary":" Question answer generation using Natural Language Processing models is\nubiquitous in the world around us. 
It is used in many use cases such as the\nbuilding of chat bots, suggestive prompts in google search and also as a way of\nnavigating information in banking mobile applications etc. It is highly\nrelevant because a frequently asked questions (FAQ) list can only have a finite\namount of questions but a model which can perform question answer generation\ncould be able to answer completely new questions that are within the scope of\nthe data. This helps us to be able to answer new questions accurately as long\nas it is a relevant question. In commercial applications, it can be used to\nincrease customer satisfaction and ease of usage. However a lot of data is\ngenerated by humans so it is susceptible to human error and this can adversely\naffect the model's performance and we are investigating this through our work\n","authors":["Shashidhar Reddy Javaji","Haoran Hu","Sai Sameer Vennam","Vijaya Gajanan Buddhavarapu"],"pdf_url":"https://arxiv.org/pdf/2311.15513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.02858v3","updated":"2023-11-27T03:15:34Z","published":"2023-04-06T04:37:10Z","title":"A review of ensemble learning and data augmentation models for class\n imbalanced problems: combination, implementation and evaluation","summary":" Class imbalance (CI) in classification problems arises when the number of\nobservations belonging to one class is lower than the other. Ensemble learning\ncombines multiple models to obtain a robust model and has been prominently used\nwith data augmentation methods to address class imbalance problems. In the last\ndecade, a number of strategies have been added to enhance ensemble learning and\ndata augmentation methods, along with new methods such as generative\nadversarial networks (GANs). A combination of these has been applied in many\nstudies, and the evaluation of different combinations would enable a better\nunderstanding and guidance for different application domains. In this paper, we\npresent a computational study to evaluate data augmentation and ensemble\nlearning methods used to address prominent benchmark CI problems. We present a\ngeneral framework that evaluates 9 data augmentation and 9 ensemble learning\nmethods for CI problems. Our objective is to identify the most effective\ncombination for improving classification performance on imbalanced datasets.\nThe results indicate that combinations of data augmentation methods with\nensemble learning can significantly improve classification performance on\nimbalanced datasets. We find that traditional data augmentation methods such as\nthe synthetic minority oversampling technique (SMOTE) and random oversampling\n(ROS) are not only better in performance for selected CI problems, but also\ncomputationally less expensive than GANs. Our study is vital for the\ndevelopment of novel models for handling imbalanced datasets.\n","authors":["Azal Ahmad Khan","Omkar Chaudhari","Rohitash Chandra"],"pdf_url":"https://arxiv.org/pdf/2304.02858v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09740v3","updated":"2023-11-27T03:09:21Z","published":"2023-11-16T10:13:09Z","title":"Redefining Super-Resolution: Fine-mesh PDE predictions without classical\n simulations","summary":" In Computational Fluid Dynamics (CFD), coarse mesh simulations offer\ncomputational efficiency but often lack precision. 
Applying conventional\nsuper-resolution to these simulations poses a significant challenge due to the\nfundamental contrast between downsampling high-resolution images and\nauthentically emulating low-resolution physics. The former method conserves\nmore of the underlying physics, surpassing the usual constraints of real-world\nscenarios. We propose a novel definition of super-resolution tailored for\nPDE-based problems. Instead of simply downsampling from a high-resolution\ndataset, we use coarse-grid simulated data as our input and predict fine-grid\nsimulated outcomes. Employing a physics-infused UNet upscaling method, we\ndemonstrate its efficacy across various 2D-CFD problems such as discontinuity\ndetection in Burger's equation, Methane combustion, and fouling in Industrial\nheat exchangers. Our method enables the generation of fine-mesh solutions\nbypassing traditional simulation, ensuring considerable computational saving\nand fidelity to the original ground truth outcomes. Through diverse boundary\nconditions during training, we further establish the robustness of our method,\npaving the way for its broad applications in engineering and scientific CFD\nsolvers.\n","authors":["Rajat Kumar Sarkar","Ritam Majumdar","Vishal Jadhav","Sagar Srinivas Sakhinana","Venkataramana Runkana"],"pdf_url":"https://arxiv.org/pdf/2311.09740v3.pdf","comment":"Accepted at Machine Learning and the Physical Sciences Workshop,\n NeurIPS 2023"},{"id":"http://arxiv.org/abs/2211.04686v3","updated":"2023-11-27T03:07:32Z","published":"2022-11-09T05:18:08Z","title":"Directional Privacy for Deep Learning","summary":" Differentially Private Stochastic Gradient Descent (DP-SGD) is a key method\nfor applying privacy in the training of deep learning models. It applies\nisotropic Gaussian noise to gradients during training, which can perturb these\ngradients in any direction, damaging utility. Metric DP, however, can provide\nalternative mechanisms based on arbitrary metrics that might be more suitable\nfor preserving utility. In this paper, we apply \\textit{directional privacy},\nvia a mechanism based on the von Mises-Fisher (VMF) distribution, to perturb\ngradients in terms of \\textit{angular distance} so that gradient direction is\nbroadly preserved. We show that this provides both $\\epsilon$-DP and $\\epsilon\nd$-privacy for deep learning training, rather than the $(\\epsilon,\n\\delta)$-privacy of the Gaussian mechanism. Experiments on key datasets then\nindicate that the VMF mechanism can outperform the Gaussian in the\nutility-privacy trade-off. In particular, our experiments provide a direct\nempirical comparison of privacy between the two approaches in terms of their\nability to defend against reconstruction and membership inference.\n","authors":["Pedro Faustini","Natasha Fernandes","Shakila Tonni","Annabelle McIver","Mark Dras"],"pdf_url":"https://arxiv.org/pdf/2211.04686v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15502v1","updated":"2023-11-27T02:59:17Z","published":"2023-11-27T02:59:17Z","title":"Learning with Complementary Labels Revisited: A Consistent Approach via\n Negative-Unlabeled Learning","summary":" Complementary-label learning is a weakly supervised learning problem in which\neach training example is associated with one or multiple complementary labels\nindicating the classes to which it does not belong. 
Existing consistent\napproaches have relied on the uniform distribution assumption to model the\ngeneration of complementary labels, or on an ordinary-label training set to\nestimate the transition matrix. However, both conditions may not be satisfied\nin real-world scenarios. In this paper, we propose a novel complementary-label\nlearning approach that does not rely on these conditions. We find that\ncomplementary-label learning can be expressed as a set of negative-unlabeled\nbinary classification problems when using the one-versus-rest strategy. This\nobservation allows us to propose a risk-consistent approach with theoretical\nguarantees. Furthermore, we introduce a risk correction approach to address\noverfitting problems when using complex models. We also prove the statistical\nconsistency and convergence rate of the corrected risk estimator. Extensive\nexperimental results on both synthetic and real-world benchmark datasets\nvalidate the superiority of our proposed approach over state-of-the-art\nmethods.\n","authors":["Wei Wang","Takashi Ishida","Yu-Jie Zhang","Gang Niu","Masashi Sugiyama"],"pdf_url":"https://arxiv.org/pdf/2311.15502v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15500v1","updated":"2023-11-27T02:55:34Z","published":"2023-11-27T02:55:34Z","title":"Function-constrained Program Synthesis","summary":" This work introduces (1) a technique that allows large language models (LLMs)\nto leverage user-provided code when solving programming tasks and (2) a method\nto iteratively generate modular sub-functions that can aid future code\ngeneration attempts when the initial code generated by the LLM is inadequate.\nGenerating computer programs in general-purpose programming languages like\nPython poses a challenge for LLMs when instructed to use code provided in the\nprompt. Code-specific LLMs (e.g., GitHub Copilot, CodeLlama2) can generate code\ncompletions in real-time by drawing on all code available in a development\nenvironment. However, restricting code-specific LLMs to use only in-context\ncode is not straightforward, as the model is not explicitly instructed to use\nthe user-provided code and users cannot highlight precisely which snippets of\ncode the model should incorporate into its context. Moreover, current systems\nlack effective recovery methods, forcing users to iteratively re-prompt the\nmodel with modified prompts until a sufficient solution is reached. Our method\ndiffers from traditional LLM-powered code-generation by constraining\ncode-generation to an explicit function set and enabling recovery from failed\nattempts through automatically generated sub-functions. When the LLM cannot\nproduce working code, we generate modular sub-functions to aid subsequent\nattempts at generating functional code. A by-product of our method is a library\nof reusable sub-functions that can solve related tasks, imitating a software\nteam where efficiency scales with experience. We also introduce a new\n\"half-shot\" evaluation paradigm that provides tighter estimates of LLMs' coding\nabilities compared to traditional zero-shot evaluation. 
Our proposed evaluation\nmethod encourages models to output solutions in a structured format, decreasing\nsyntax errors that can be mistaken for poor coding ability.\n","authors":["Patrick Hajali","Ignas Budvytis"],"pdf_url":"https://arxiv.org/pdf/2311.15500v1.pdf","comment":"17 pages, 6 figures, 2023 NeurIPS R0-Fomo Workshop"},{"id":"http://arxiv.org/abs/2311.15497v1","updated":"2023-11-27T02:48:06Z","published":"2023-11-27T02:48:06Z","title":"Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning\n and Optimization Functions for Enhanced Precision","summary":" Image registration has traditionally been done using two distinct approaches:\nlearning based methods, relying on robust deep neural networks, and\noptimization-based methods, applying complex mathematical transformations to\nwarp images accordingly. Of course, both paradigms offer advantages and\ndisadvantages, and, in this work, we seek to combine their respective strengths\ninto a single streamlined framework, using the outputs of the learning based\nmethod as initial parameters for optimization while prioritizing computational\npower for the image pairs that offer the greatest loss. Our investigations\nshowed that an improvement of 0.3\\% in testing when utilizing the best\nperforming state-of-the-art model as the backbone of the framework, while\nmaintaining the same inference time and with only a 0.8\\% loss in deformation\nfield smoothness.\n","authors":["Gabriel De Araujo","Shanlin Sun","Xiaohui Xie"],"pdf_url":"https://arxiv.org/pdf/2311.15497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15487v1","updated":"2023-11-27T02:12:02Z","published":"2023-11-27T02:12:02Z","title":"Global $\\mathcal{L}^2$ minimization with certainty via geometrically\n adapted gradient descent in Deep Learning","summary":" We consider the gradient descent flow widely used for the minimization of the\n$\\mathcal{L}^2$ cost function in Deep Learning networks, and introduce two\nmodified versions; one adapted for the overparametrized setting, and the other\nfor the underparametrized setting. Both have a clear and natural invariant\ngeometric meaning, taking into account the pullback vector bundle structure in\nthe overparametrized, and the pushforward vector bundle structure in the\nunderparametrized setting. In the overparametrized case, we prove that,\nprovided that a rank condition holds, all orbits of the modified gradient\ndescent drive the $\\mathcal{L}^2$ cost to its global minimum at a uniform\nexponential convergence rate. We point out relations of the latter to\nsub-Riemannian geometry.\n","authors":["Thomas Chen"],"pdf_url":"https://arxiv.org/pdf/2311.15487v1.pdf","comment":"AMS Latex, 12 pages"}],"Multimedia":[{"id":"http://arxiv.org/abs/2304.02970v4","updated":"2023-11-27T13:11:20Z","published":"2023-04-06T09:54:06Z","title":"A Closer Look at Audio-Visual Segmentation","summary":" Audio-visual segmentation (AVS) is a complex task that involves accurately\nsegmenting the corresponding sounding object based on audio-visual queries.\nSuccessful audio-visual learning requires two essential components: 1) an\nunbiased dataset with high-quality pixel-level multi-class labels, and 2) a\nmodel capable of effectively linking audio information with its corresponding\nvisual object. However, these two requirements are only partially addressed by\ncurrent methods, with training sets containing biased audio-visual data, and\nmodels that generalise poorly beyond this biased training set. 
In this work, we\npropose a new strategy to build cost-effective and relatively unbiased\naudio-visual semantic segmentation benchmarks. Our strategy, called Visual\nPost-production (VPO), explores the observation that it is not necessary to\nhave explicit audio-visual pairs extracted from single video sources to build\nsuch benchmarks. We also refine the previously proposed AVSBench to transform\nit into the audio-visual semantic segmentation benchmark AVSBench-Single+.\nFurthermore, this paper introduces a new pixel-wise audio-visual contrastive\nlearning method to enable a better generalisation of the model beyond the\ntraining set. We verify the validity of the VPO strategy by showing that\nstate-of-the-art (SOTA) models trained with datasets built by matching audio\nand visual data from different sources or with datasets containing audio and\nvisual data from the same video source produce almost the same accuracy. Then,\nusing the proposed VPO benchmarks and AVSBench-Single+, we show that our method\nproduces more accurate audio-visual semantic segmentation than SOTA models.\nCode and dataset will be available.\n","authors":["Yuanhong Chen","Yuyuan Liu","Hu Wang","Fengbei Liu","Chong Wang","Gustavo Carneiro"],"pdf_url":"https://arxiv.org/pdf/2304.02970v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12454v2","updated":"2023-11-27T12:26:32Z","published":"2023-11-21T09:07:11Z","title":"HierSpeech++: Bridging the Gap between Semantic and Acoustic\n Representation of Speech by Hierarchical Variational Inference for Zero-shot\n Speech Synthesis","summary":" Large language models (LLM)-based speech synthesis has been widely adopted in\nzero-shot speech synthesis. However, they require a large-scale data and\npossess the same limitations as previous autoregressive speech models,\nincluding slow inference speed and lack of robustness. This paper proposes\nHierSpeech++, a fast and strong zero-shot speech synthesizer for text-to-speech\n(TTS) and voice conversion (VC). We verified that hierarchical speech synthesis\nframeworks could significantly improve the robustness and expressiveness of the\nsynthetic speech. Furthermore, we significantly improve the naturalness and\nspeaker similarity of synthetic speech even in zero-shot speech synthesis\nscenarios. For text-to-speech, we adopt the text-to-vec framework, which\ngenerates a self-supervised speech representation and an F0 representation\nbased on text representations and prosody prompts. Then, HierSpeech++ generates\nspeech from the generated vector, F0, and voice prompt. We further introduce a\nhigh-efficient speech super-resolution framework from 16 kHz to 48 kHz. The\nexperimental results demonstrated that the hierarchical variational autoencoder\ncould be a strong zero-shot speech synthesizer given that it outperforms\nLLM-based and diffusion-based models. Moreover, we achieved the first\nhuman-level quality zero-shot speech synthesis. 
Audio samples and source code\nare available at https://github.com/sh-lee-prml/HierSpeechpp.\n","authors":["Sang-Hoon Lee","Ha-Yeong Choi","Seung-Bin Kim","Seong-Whan Lee"],"pdf_url":"https://arxiv.org/pdf/2311.12454v2.pdf","comment":"16 pages, 9 figures, 12 tables"},{"id":"http://arxiv.org/abs/2309.07983v2","updated":"2023-11-27T11:54:56Z","published":"2023-09-14T18:40:28Z","title":"SLMIA-SR: Speaker-Level Membership Inference Attacks against Speaker\n Recognition Systems","summary":" Membership inference attacks allow adversaries to determine whether a\nparticular example was contained in the model's training dataset. While\nprevious works have confirmed the feasibility of such attacks in various\napplications, none has focused on speaker recognition (SR), a promising\nvoice-based biometric recognition technique. In this work, we propose SLMIA-SR,\nthe first membership inference attack tailored to SR. In contrast to\nconventional example-level attack, our attack features speaker-level membership\ninference, i.e., determining if any voices of a given speaker, either the same\nas or different from the given inference voices, have been involved in the\ntraining of a model. It is particularly useful and practical since the training\nand inference voices are usually distinct, and it is also meaningful\nconsidering the open-set nature of SR, namely, the recognition speakers were\noften not present in the training data. We utilize intra-similarity and\ninter-dissimilarity, two training objectives of SR, to characterize the\ndifferences between training and non-training speakers and quantify them with\ntwo groups of features driven by carefully-established feature engineering to\nmount the attack. To improve the generalizability of our attack, we propose a\nnovel mixing ratio training strategy to train attack models. To enhance the\nattack performance, we introduce voice chunk splitting to cope with the limited\nnumber of inference voices and propose to train attack models dependent on the\nnumber of inference voices. Our attack is versatile and can work in both\nwhite-box and black-box scenarios. Additionally, we propose two novel\ntechniques to reduce the number of black-box queries while maintaining the\nattack performance. Extensive experiments demonstrate the effectiveness of\nSLMIA-SR.\n","authors":["Guangke Chen","Yedi Zhang","Fu Song"],"pdf_url":"https://arxiv.org/pdf/2309.07983v2.pdf","comment":"In Proceedings of the 31st Network and Distributed System Security\n (NDSS) Symposium, 2024"},{"id":"http://arxiv.org/abs/2311.13770v2","updated":"2023-11-27T11:09:47Z","published":"2023-11-23T01:53:02Z","title":"Archiving Body Movements: Collective Generation of Chinese Calligraphy","summary":" As a communication channel, body movements have been widely explored in\nbehavioral studies and kinesics. Performing and visual arts share the same\ninterests but focus on documenting and representing human body movements, such\nas for dance notation and visual work creation. This paper investigates body\nmovements in oriental calligraphy and how to apply calligraphy principles to\nstimulate and archive body movements. Through an artwork (Wushu), the authors\nexperiment with an interactive and generative approach to engage the audience's\nbodily participation and archive the body movements as a compendium of\ngenerated calligraphy. 
The audience assumes the role of both writers and\nreaders; creating (\"writing\") and appreciating (\"reading\") the generated\ncalligraphy becomes a cyclical process within this infinite \"Book,\" which can\nmotivate further attention and discussions concerning Chinese characters and\ncalligraphy.\n","authors":["Aven Le Zhou","Jiayi Ye","Tianchen Liu","Kang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.13770v2.pdf","comment":"8 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.15581v1","updated":"2023-11-27T07:19:10Z","published":"2023-11-27T07:19:10Z","title":"Real Time GAZED: Online Shot Selection and Editing of Virtual Cameras\n from Wide-Angle Monocular Video Recordings","summary":" Eliminating time-consuming post-production processes and delivering\nhigh-quality videos in today's fast-paced digital landscape are the key\nadvantages of real-time approaches. To address these needs, we present Real\nTime GAZED: a real-time adaptation of the GAZED framework integrated with\nCineFilter, a novel real-time camera trajectory stabilization approach. It\nenables users to create professionally edited videos in real-time. Comparative\nevaluations against baseline methods, including the non-real-time GAZED,\ndemonstrate that Real Time GAZED achieves similar editing results, ensuring\nhigh-quality video output. Furthermore, a user study confirms the aesthetic\nquality of the video edits produced by the Real Time GAZED approach. With these\nadvancements in real-time camera trajectory optimization and video editing\npresented, the demand for immediate and dynamic content creation in industries\nsuch as live broadcasting, sports coverage, news reporting, and social media\ncontent creation can be met more efficiently.\n","authors":["Sudheer Achary","Rohit Girmaji","Adhiraj Anil Deshmukh","Vineet Gandhi"],"pdf_url":"https://arxiv.org/pdf/2311.15581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15540v1","updated":"2023-11-27T05:10:15Z","published":"2023-11-27T05:10:15Z","title":"EAFP-Med: An Efficient Adaptive Feature Processing Module Based on\n Prompts for Medical Image Detection","summary":" In the face of rapid advances in medical imaging, cross-domain adaptive\nmedical image detection is challenging due to the differences in lesion\nrepresentations across various medical imaging technologies. To address this\nissue, we draw inspiration from large language models to propose EAFP-Med, an\nefficient adaptive feature processing module based on prompts for medical image\ndetection. EAFP-Med can efficiently extract lesion features of different scales\nfrom a diverse range of medical images based on prompts while being flexible\nand not limited by specific imaging techniques. Furthermore, it serves as a\nfeature preprocessing module that can be connected to any model front-end to\nenhance the lesion features in input images. Moreover, we propose a novel\nadaptive disease detection model named EAFP-Med ST, which utilizes the Swin\nTransformer V2 - Tiny (SwinV2-T) as its backbone and connects it to EAFP-Med.\nWe have compared our method to nine state-of-the-art methods. Experimental\nresults demonstrate that EAFP-Med ST achieves the best performance on all three\ndatasets (chest X-ray images, cranial magnetic resonance imaging images, and\nskin images). EAFP-Med can efficiently extract lesion features from various\nmedical images based on prompts, enhancing the model's performance. 
This holds\nsignificant potential for improving medical image analysis and diagnosis.\n","authors":["Xiang Li","Long Lan","Husam Lahza","Shaowu Yang","Shuihua Wang","Wenjing Yang","Hengzhu Liu","Yudong Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.15540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15480v1","updated":"2023-11-27T01:44:02Z","published":"2023-11-27T01:44:02Z","title":"Automatic Time Signature Determination for New Scores Using Lyrics for\n Latent Rhythmic Structure","summary":" There has recently been a sharp increase in interest in Artificial\nIntelligence-Generated Content (AIGC). Despite this, musical components such as\ntime signatures have not been studied sufficiently to form an algorithmic\ndetermination approach for new compositions, especially lyrical songs. This is\nlikely because of the neglect of musical details, which is critical for\nconstructing a robust framework. Specifically, time signatures establish the\nfundamental rhythmic structure for almost all aspects of a song, including the\nphrases and notes. In this paper, we propose a novel approach that only uses\nlyrics as input to automatically generate a fitting time signature for lyrical\nsongs and uncover the latent rhythmic structure utilizing explainable machine\nlearning models. In particular, we devise multiple methods that are associated\nwith discovering lyrical patterns and creating new features that simultaneously\ncontain lyrical, rhythmic, and statistical information. In this approach, the\nbest of our experimental results reveal a 97.6% F1 score and a 0.996 Area Under\nthe Curve (AUC) of the Receiver Operating Characteristic (ROC) score. In\nconclusion, our research directly generates time signatures from lyrics\nautomatically for new scores utilizing machine learning, which is an innovative\nidea that approaches an understudied component of musicology and therefore\ncontributes significantly to the future of Artificial Intelligence (AI) music\ngeneration.\n","authors":["Callie C. Liao","Duoduo Liao","Jesse Guessford"],"pdf_url":"https://arxiv.org/pdf/2311.15480v1.pdf","comment":"Submitted to IEEE Big Data 2023 Conference"}]},"2023-11-26T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.15451v1","updated":"2023-11-26T22:47:54Z","published":"2023-11-26T22:47:54Z","title":"Uncertainty-aware Language Modeling for Selective Question Answering","summary":" We present an automatic large language model (LLM) conversion approach that\nproduces uncertainty-aware LLMs capable of estimating uncertainty with every\nprediction. Our approach is model- and data-agnostic, is\ncomputationally-efficient, and does not rely on external models or systems. We\nevaluate converted models on the selective question answering setting -- to\nanswer as many questions as possible while maintaining a given accuracy,\nforgoing providing predictions when necessary. As part of our results, we test\nBERT and Llama 2 model variants on the SQuAD extractive QA task and the\nTruthfulQA generative QA task. 
We show that using the uncertainty estimates\nprovided by our approach to selectively answer questions leads to significantly\nhigher accuracy over directly using model probabilities.\n","authors":["Qi Yang","Shreya Ravikumar","Fynn Schmitt-Ulms","Satvik Lolla","Ege Demir","Iaroslav Elistratov","Alex Lavaee","Sadhana Lolla","Elaheh Ahmadi","Daniela Rus","Alexander Amini","Alejandro Perez"],"pdf_url":"https://arxiv.org/pdf/2311.15451v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.04947v2","updated":"2023-11-26T22:00:36Z","published":"2023-04-11T03:17:37Z","title":"Conditional Adapters: Parameter-efficient Transfer Learning with Fast\n Inference","summary":" We propose Conditional Adapter (CoDA), a parameter-efficient transfer\nlearning method that also improves inference efficiency. CoDA generalizes\nbeyond standard adapter approaches to enable a new way of balancing speed and\naccuracy using conditional computation. Starting with an existing dense\npretrained model, CoDA adds sparse activation together with a small number of\nnew parameters and a light-weight training phase. Our experiments demonstrate\nthat the CoDA approach provides an unexpectedly efficient way to transfer\nknowledge. Across a variety of language, vision, and speech tasks, CoDA\nachieves a 2x to 8x inference speed-up compared to the state-of-the-art Adapter\napproaches with moderate to no accuracy loss and the same parameter efficiency.\n","authors":["Tao Lei","Junwen Bai","Siddhartha Brahma","Joshua Ainslie","Kenton Lee","Yanqi Zhou","Nan Du","Vincent Y. Zhao","Yuexin Wu","Bo Li","Yu Zhang","Ming-Wei Chang"],"pdf_url":"https://arxiv.org/pdf/2304.04947v2.pdf","comment":"NeurIPS camera ready version"},{"id":"http://arxiv.org/abs/2311.15436v1","updated":"2023-11-26T21:45:53Z","published":"2023-11-26T21:45:53Z","title":"Learning to Skip for Language Modeling","summary":" Overparameterized large-scale language models have impressive generalization\nperformance of in-context few-shot learning. However, most language models\nallocate the same amount of parameters or computation to each token,\ndisregarding the complexity or importance of the input data. We argue that in\nlanguage model pretraining, a variable amount of computation should be assigned\nto different tokens, and this can be efficiently achieved via a simple routing\nmechanism. Different from conventional early stopping techniques where tokens\ncan early exit at only early layers, we propose a more general method that\ndynamically skips the execution of a layer (or module) for any input token with\na binary router. In our extensive evaluation across 24 NLP tasks, we\ndemonstrate that the proposed method can significantly improve the 1-shot\nperformance compared to other competitive baselines only at mild extra cost for\ninference.\n","authors":["Dewen Zeng","Nan Du","Tao Wang","Yuanzhong Xu","Tao Lei","Zhifeng Chen","Claire Cui"],"pdf_url":"https://arxiv.org/pdf/2311.15436v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03081v2","updated":"2023-11-26T21:40:00Z","published":"2023-06-05T17:55:05Z","title":"Sequential Monte Carlo Steering of Large Language Models using\n Probabilistic Programs","summary":" Even after fine-tuning and reinforcement learning, large language models\n(LLMs) can be difficult, if not impossible, to control reliably with prompts\nalone. We propose a new inference-time approach to enforcing syntactic and\nsemantic constraints on the outputs of LLMs, called sequential Monte Carlo\n(SMC) steering. 
The key idea is to specify language generation tasks as\nposterior inference problems in a class of discrete probabilistic sequence\nmodels, and replace standard decoding with sequential Monte Carlo inference.\nFor a computational cost similar to that of beam search, SMC can steer LLMs to\nsolve diverse tasks, including infilling, generation under syntactic\nconstraints, and prompt intersection. To facilitate experimentation with SMC\nsteering, we present a probabilistic programming library, LLaMPPL\n(https://github.com/probcomp/hfppl), for concisely specifying new generation\ntasks as language model probabilistic programs, and automating steering of\nLLaMA-family Transformers.\n","authors":["Alexander K. Lew","Tan Zhi-Xuan","Gabriel Grand","Vikash K. Mansinghka"],"pdf_url":"https://arxiv.org/pdf/2306.03081v2.pdf","comment":"Minor typo fixes"},{"id":"http://arxiv.org/abs/2311.15425v1","updated":"2023-11-26T21:16:01Z","published":"2023-11-26T21:16:01Z","title":"Machine-Generated Text Detection using Deep Learning","summary":" Our research focuses on the crucial challenge of discerning text produced by\nLarge Language Models (LLMs) from human-generated text, which holds\nsignificance for various applications. With ongoing discussions about attaining\na model with such functionality, we present supporting evidence regarding the\nfeasibility of such models. We evaluated our models on multiple datasets,\nincluding Twitter Sentiment, Football Commentary, Project Gutenberg, PubMedQA,\nand SQuAD, confirming the efficacy of the enhanced detection approaches. These\ndatasets were sampled with intricate constraints encompassing every\npossibility, laying the foundation for future research. We evaluate\nGPT-3.5-Turbo against various detectors such as SVM, RoBERTa-base, and\nRoBERTa-large. Based on the research findings, the results predominantly relied\non the sequence length of the sentence.\n","authors":["Raghav Gaggar","Ashish Bhagchandani","Harsh Oza"],"pdf_url":"https://arxiv.org/pdf/2311.15425v1.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2311.15402v1","updated":"2023-11-26T19:56:19Z","published":"2023-11-26T19:56:19Z","title":"Learning Section Weights for Multi-Label Document Classification","summary":" Multi-label document classification is a traditional task in NLP. Compared to\nsingle-label classification, each document can be assigned multiple classes.\nThis problem is crucially important in various domains, such as tagging\nscientific articles. Documents are often structured into several sections such\nas abstract and title. Current approaches treat different sections equally for\nmulti-label classification. We argue that this is not a realistic assumption,\nleading to sub-optimal results. Instead, we propose a new method called\nLearning Section Weights (LSW), leveraging the contribution of each distinct\nsection for multi-label classification. Via multiple feed-forward layers, LSW\nlearns to assign weights to each section of, and incorporate the weights in the\nprediction. We demonstrate our approach on scientific articles. Experimental\nresults on public (arXiv) and private (Elsevier) datasets confirm the\nsuperiority of LSW, compared to state-of-the-art multi-label document\nclassification methods. 
In particular, LSW achieves a 1.3% improvement in terms\nof macro averaged F1-score while it achieves 1.3% in terms of macro averaged\nrecall on the publicly available arXiv dataset.\n","authors":["Maziar Moradi Fard","Paula Sorrolla Bayod","Kiomars Motarjem","Mohammad Alian Nejadi","Saber Akhondi","Camilo Thorne"],"pdf_url":"https://arxiv.org/pdf/2311.15402v1.pdf","comment":"7 pages, 4 figures, 5 tables"},{"id":"http://arxiv.org/abs/2305.14930v2","updated":"2023-11-26T18:36:30Z","published":"2023-05-24T09:13:15Z","title":"In-Context Impersonation Reveals Large Language Models' Strengths and\n Biases","summary":" In everyday conversations, humans can take on different roles and adapt their\nvocabulary to their chosen roles. We explore whether LLMs can take on, that is\nimpersonate, different roles when they generate text in-context. We ask LLMs to\nassume different personas before solving vision and language tasks. We do this\nby prefixing the prompt with a persona that is associated either with a social\nidentity or domain expertise. In a multi-armed bandit task, we find that LLMs\npretending to be children of different ages recover human-like developmental\nstages of exploration. In a language-based reasoning task, we find that LLMs\nimpersonating domain experts perform better than LLMs impersonating non-domain\nexperts. Finally, we test whether LLMs' impersonations are complementary to\nvisual information when describing different categories. We find that\nimpersonation can improve performance: an LLM prompted to be a bird expert\ndescribes birds better than one prompted to be a car expert. However,\nimpersonation can also uncover LLMs' biases: an LLM prompted to be a man\ndescribes cars better than one prompted to be a woman. These findings\ndemonstrate that LLMs are capable of taking on diverse roles and that this\nin-context impersonation can be used to uncover their hidden strengths and\nbiases.\n","authors":["Leonard Salewski","Stephan Alaniz","Isabel Rio-Torto","Eric Schulz","Zeynep Akata"],"pdf_url":"https://arxiv.org/pdf/2305.14930v2.pdf","comment":"Published in NeurIPS 2023 (Spotlight)"},{"id":"http://arxiv.org/abs/2212.06373v7","updated":"2023-11-26T17:24:19Z","published":"2022-12-13T05:12:40Z","title":"InferEM: Inferring the Speaker's Intention for Empathetic Dialogue\n Generation","summary":" Current approaches to empathetic response generation typically encode the\nentire dialogue history directly and put the output into a decoder to generate\nfriendly feedback. These methods focus on modelling contextual information but\nneglect capturing the direct intention of the speaker. We argue that the last\nutterance in the dialogue empirically conveys the intention of the speaker.\nConsequently, we propose a novel model named InferEM for empathetic response\ngeneration. We separately encode the last utterance and fuse it with the entire\ndialogue through the multi-head attention based intention fusion module to\ncapture the speaker's intention. Besides, we utilize previous utterances to\npredict the last utterance, which simulates human's psychology to guess what\nthe interlocutor may speak in advance. To balance the optimizing rates of the\nutterance prediction and response generation, a multi-task learning strategy is\ndesigned for InferEM. 
Experimental results demonstrate the plausibility and\nvalidity of InferEM in improving empathetic expression.\n","authors":["Guoqing Lv","Jiang Li","Xiaoping Wang","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2212.06373v7.pdf","comment":"Accepted by the 45th Annual Meeting of the Cognitive Science Society\n (CogSci 2023)"},{"id":"http://arxiv.org/abs/2311.05419v2","updated":"2023-11-26T17:12:20Z","published":"2023-11-09T14:58:46Z","title":"Mirror: A Universal Framework for Various Information Extraction Tasks","summary":" Sharing knowledge between information extraction tasks has always been a\nchallenge due to the diverse data formats and task variations. Meanwhile, this\ndivergence leads to information waste and increases difficulties in building\ncomplex applications in real scenarios. Recent studies often formulate IE tasks\nas a triplet extraction problem. However, such a paradigm does not support\nmulti-span and n-ary extraction, leading to weak versatility. To this end, we\nreorganize IE problems into unified multi-slot tuples and propose a universal\nframework for various IE tasks, namely Mirror. Specifically, we recast existing\nIE tasks as a multi-span cyclic graph extraction problem and devise a\nnon-autoregressive graph decoding algorithm to extract all spans in a single\nstep. It is worth noting that this graph structure is incredibly versatile, and\nit supports not only complex IE tasks, but also machine reading comprehension\nand classification tasks. We manually construct a corpus containing 57 datasets\nfor model pretraining, and conduct experiments on 30 datasets across 8\ndownstream tasks. The experimental results demonstrate that our model has\ndecent compatibility and outperforms or reaches competitive performance with\nSOTA systems under few-shot and zero-shot settings. The code, model weights,\nand pretraining corpus are available at https://github.com/Spico197/Mirror .\n","authors":["Tong Zhu","Junfei Ren","Zijian Yu","Mengsong Wu","Guoliang Zhang","Xiaoye Qu","Wenliang Chen","Zhefeng Wang","Baoxing Huai","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.05419v2.pdf","comment":"Accepted to EMNLP23 main conference"},{"id":"http://arxiv.org/abs/2305.09620v2","updated":"2023-11-26T16:25:49Z","published":"2023-05-16T17:13:07Z","title":"AI-Augmented Surveys: Leveraging Large Language Models and Surveys for\n Opinion Prediction","summary":" Large language models (LLMs) that produce human-like responses have begun to\nrevolutionize research practices in the social sciences. This paper shows how\nwe can integrate LLMs and social surveys to accurately predict individual\nresponses to survey questions that were not asked before. We develop a novel\nmethodological framework to personalize LLMs by considering the meaning of\nsurvey questions derived from their text, the latent beliefs of individuals\ninferred from their response patterns, and the temporal contexts across\ndifferent survey periods through fine-tuning LLMs with survey data. Using the\nGeneral Social Survey from 1972 to 2021, we show that the fine-tuned model\nbased on Alpaca-7b can predict individual responses to survey questions that\nare partially missing as well as entirely missing. The remarkable prediction\ncapabilities allow us to fill in missing trends with high confidence and\npinpoint when public attitudes changed, such as the rising support for same-sex\nmarriage. 
We discuss practical constraints, socio-demographic representation,\nand ethical concerns regarding individual autonomy and privacy when using LLMs\nfor opinion prediction. This study demonstrates that LLMs and surveys can\nmutually enhance each other's capabilities: LLMs broaden survey potential,\nwhile surveys improve the alignment of LLMs.\n","authors":["Junsol Kim","Byungkyu Lee"],"pdf_url":"https://arxiv.org/pdf/2305.09620v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11435v2","updated":"2023-11-26T15:05:58Z","published":"2023-11-19T22:14:48Z","title":"Unveiling Public Perceptions: Machine Learning-Based Sentiment Analysis\n of COVID-19 Vaccines in India","summary":" In March 2020, the World Health Organisation declared COVID-19 a global\npandemic as it spread to nearly every country. By mid-2021, India had\nintroduced three vaccines: Covishield, Covaxin, and Sputnik. To ensure\nsuccessful vaccination in a densely populated country like India, understanding\npublic sentiment was crucial. Social media, particularly Reddit with over 430\nmillion users, played a vital role in disseminating information. This study\nemploys data mining techniques to analyze Reddit data and gauge Indian\nsentiments towards COVID-19 vaccines. Using Python's Text Blob library,\ncomments are annotated to assess general sentiments. Results show that most\nReddit users in India expressed neutrality about vaccination, posing a\nchallenge for the Indian government's efforts to vaccinate a significant\nportion of the population.\n","authors":["Milind Gupta","Abhishek Kaushik"],"pdf_url":"https://arxiv.org/pdf/2311.11435v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15316v1","updated":"2023-11-26T14:35:23Z","published":"2023-11-26T14:35:23Z","title":"Enhancing Empathetic and Emotion Support Dialogue Generation with\n Prophetic Commonsense Inference","summary":" The interest in Empathetic and Emotional Support conversations among the\npublic has significantly increased. To offer more sensitive and understanding\nresponses, leveraging commonsense knowledge has become a common strategy to\nbetter understand psychological aspects and causality. However, such\ncommonsense inferences can be out of context and unable to predict upcoming\ndialogue themes, resulting in responses that lack coherence and empathy. To\nremedy this issue, we present Prophetic Commonsense Inference, an innovative\nparadigm for inferring commonsense knowledge. By harnessing the capabilities of\nLarge Language Models in understanding dialogue and making commonsense\ndeductions, we train tunable models to bridge the gap between past and\npotential future dialogues. Extensive experiments conducted on\nEmpatheticDialogues and Emotion Support Conversation show that equipping\ndialogue agents with our proposed prophetic commonsense inference significantly\nenhances the quality of their responses.\n","authors":["Lanrui Wang","Jiangnan Li","Chenxu Yang","Zheng Lin","Weiping Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15296v1","updated":"2023-11-26T13:42:56Z","published":"2023-11-26T13:42:56Z","title":"UHGEval: Benchmarking the Hallucination of Chinese Large Language Models\n via Unconstrained Generation","summary":" Large language models (LLMs) have emerged as pivotal contributors in\ncontemporary natural language processing and are increasingly being applied\nacross a diverse range of industries. 
However, these large-scale probabilistic\nstatistical models cannot currently ensure the requisite quality in\nprofessional content generation. These models often produce hallucinated text,\ncompromising their practical utility in professional contexts. To assess the\nauthentic reliability of LLMs in text generation, numerous initiatives have\ndeveloped benchmark evaluations for hallucination phenomena. Nevertheless,\nthese benchmarks frequently utilize constrained generation techniques due to\ncost and temporal constraints. These techniques encompass the use of directed\nhallucination induction and strategies that deliberately alter authentic text\nto produce hallucinations. These approaches are not congruent with the\nunrestricted text generation demanded by real-world applications. Furthermore,\na well-established Chinese-language dataset dedicated to the evaluation of\nhallucinations in text generation is presently lacking. Consequently, we have\ndeveloped an Unconstrained Hallucination Generation Evaluation (UHGEval)\nbenchmark, designed to compile outputs produced with minimal restrictions by\nLLMs. Concurrently, we have established a comprehensive benchmark evaluation\nframework to aid subsequent researchers in undertaking scalable and\nreproducible experiments. We have also executed extensive experiments,\nevaluating prominent Chinese language models and the GPT series models to\nderive professional performance insights regarding hallucination challenges.\n","authors":["Xun Liang","Shichao Song","Simin Niu","Zhiyu Li","Feiyu Xiong","Bo Tang","Zhaohui Wy","Dawei He","Peng Cheng","Zhonghao Wang","Haiying Deng"],"pdf_url":"https://arxiv.org/pdf/2311.15296v1.pdf","comment":"13 Pages, submitted to ICDE2024"},{"id":"http://arxiv.org/abs/2307.11845v2","updated":"2023-11-26T08:57:44Z","published":"2023-07-21T18:29:04Z","title":"Multimodal Document Analytics for Banking Process Automation","summary":" Traditional banks face increasing competition from FinTechs in the rapidly\nevolving financial ecosystem. Raising operational efficiency is vital to\naddress this challenge. Our study aims to improve the efficiency of\ndocument-intensive business processes in banking. To that end, we first review\nthe landscape of business documents in the retail segment. Banking documents\noften contain text, layout, and visuals, suggesting that document analytics and\nprocess automation require more than plain natural language processing (NLP).\nTo verify this and assess the incremental value of visual cues when processing\nbusiness documents, we compare a recently proposed multimodal model called\nLayoutXLM to powerful text classifiers (e.g., BERT) and large language models\n(e.g., GPT) in a case study related to processing company register extracts.\nThe results confirm that incorporating layout information in a model\nsubstantially increases its performance. Interestingly, we also observed that\nmore than 75% of the best model performance (in terms of the F1 score) can be\nachieved with as little as 30% of the training data. This shows that the demand\nfor data labeled data to set up a multi-modal model can be moderate, which\nsimplifies real-world applications of multimodal document analytics. Our study\nalso sheds light on more specific practices in the scope of calibrating a\nmultimodal banking document classifier, including the need for fine-tuning. 
In\nsum, the paper contributes original empirical evidence on the effectiveness and\nefficiency of multi-model models for document processing in the banking\nbusiness and offers practical guidance on how to unlock this potential in\nday-to-day operations.\n","authors":["Christopher Gerling","Stefan Lessmann"],"pdf_url":"https://arxiv.org/pdf/2307.11845v2.pdf","comment":"A Preprint"},{"id":"http://arxiv.org/abs/2311.15218v1","updated":"2023-11-26T07:19:10Z","published":"2023-11-26T07:19:10Z","title":"Dataset for Stock Market Forecasting Based on Quantitative Analysis and\n Qualitative Data","summary":" The application of Machine learning to finance has become a familiar\napproach, even more so in stock market forecasting. The stock market is highly\nvolatile and huge amounts of data are generated every minute globally. The\nextraction of effective intelligence from this data is of critical importance.\nHowever, a collaboration of numerical stock data with qualitative text data can\nbe a challenging task. In this work, we accomplish this and provide an\nunprecedented, publicly available dataset with technical and fundamental data,\nsentiment that we gathered from News Archives, TV news captions, Radio\nTranscripts, Tweets, Daily financial newspapers, etc. The text data entries\nused for sentiment extraction total more than 1.4 Million. The dataset\ncomprises of daily entries from January 2018 to December 2022 for 8 different\ncompanies and Dow Jones Index as a whole. Holistic Fundamental and Technical\ndata is provided training ready for Model learning and deployment. The\npredictive power of deep learning models is highly determined by the training\ndata provided. This dataset would be of benefit for research globally\nincorporating qualitative intelligence for stock market forecasting. The\ndataset is made available at https://github.com/batking24/Huge-Stock-Dataset.\n","authors":["Sai Akash Bathini","Dagli Cihan"],"pdf_url":"https://arxiv.org/pdf/2311.15218v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15211v1","updated":"2023-11-26T06:56:02Z","published":"2023-11-26T06:56:02Z","title":"Probabilistic Transformer: A Probabilistic Dependency Model for\n Contextual Word Representation","summary":" Syntactic structures used to play a vital role in natural language processing\n(NLP), but since the deep learning revolution, NLP has been gradually dominated\nby neural models that do not consider syntactic structures in their design. One\nvastly successful class of neural models is transformers. When used as an\nencoder, a transformer produces contextual representation of words in the input\nsentence. In this work, we propose a new model of contextual word\nrepresentation, not from a neural perspective, but from a purely syntactic and\nprobabilistic perspective. Specifically, we design a conditional random field\nthat models discrete latent representations of all words in a sentence as well\nas dependency arcs between them; and we use mean field variational inference\nfor approximate inference. Strikingly, we find that the computation graph of\nour model resembles transformers, with correspondences between dependencies and\nself-attention and between distributions over latent representations and\ncontextual embeddings of words. Experiments show that our model performs\ncompetitively to transformers on small to medium sized datasets. 
We hope that\nour work could help bridge the gap between traditional syntactic and\nprobabilistic approaches and cutting-edge neural approaches to NLP, and inspire\nmore linguistically-principled neural approaches in the future.\n","authors":["Haoyi Wu","Kewei Tu"],"pdf_url":"https://arxiv.org/pdf/2311.15211v1.pdf","comment":"Accepted to ACL2023 Findings"},{"id":"http://arxiv.org/abs/2311.15208v1","updated":"2023-11-26T06:24:25Z","published":"2023-11-26T06:24:25Z","title":"LongStory: Coherent, Complete and Length Controlled Long story\n Generation","summary":" A human author can write any length of story without losing coherence. Also,\nthey always bring the story to a proper ending, an ability that current\nlanguage models lack. In this work, we present the LongStory for coherent,\ncomplete, and length-controlled long story generation. LongStory introduces two\nnovel methodologies: (1) the long and short-term contexts weight calibrator\n(CWC) and (2) long story structural positions (LSP). The CWC adjusts weights\nfor long-term context Memory and short-term context Cheating, acknowledging\ntheir distinct roles. The LSP employs discourse tokens to convey the structural\npositions of a long story. Trained on three datasets with varied average story\nlengths, LongStory outperforms other baselines, including the strong story\ngenerator Plotmachine, in coherence, completeness, relevance, and\nrepetitiveness. We also perform zero-shot tests on each dataset to assess the\nmodel's ability to predict outcomes beyond its training data and validate our\nmethodology by comparing its performance with variants of our model.\n","authors":["Kyeongman Park","Nakyeong Yang","Kyomin Jung"],"pdf_url":"https://arxiv.org/pdf/2311.15208v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15198v1","updated":"2023-11-26T05:34:22Z","published":"2023-11-26T05:34:22Z","title":"ChatGPT and Beyond: The Generative AI Revolution in Education","summary":" The wide adoption and usage of generative artificial intelligence (AI)\nmodels, particularly ChatGPT, has sparked a surge in research exploring their\npotential applications in the educational landscape. This survey examines\nacademic literature published between November, 2022, and July, 2023,\nspecifically targeting high-impact research from Scopus-indexed Q1 and Q2\njournals. This survey delves into the practical applications and implications\nof generative AI models across a diverse range of educational contexts. Through\na comprehensive and rigorous evaluation of recent academic literature, this\nsurvey seeks to illuminate the evolving role of generative AI models,\nparticularly ChatGPT, in education. By shedding light on the potential\nbenefits, challenges, and emerging trends in this dynamic field, the survey\nendeavors to contribute to the understanding of the nexus between artificial\nintelligence and education. The findings of this review will empower educators,\nresearchers, and policymakers to make informed decisions about the integration\nof AI technologies into learning environments.\n","authors":["Mohammad AL-Smadi"],"pdf_url":"https://arxiv.org/pdf/2311.15198v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10013v2","updated":"2023-11-26T05:05:51Z","published":"2022-12-20T06:01:13Z","title":"DocAsRef: An Empirical Study on Repurposing Reference-Based Summary\n Quality Metrics Reference-Freely","summary":" Automated summary quality assessment falls into two categories:\nreference-based and reference-free. 
Reference-based metrics, historically\ndeemed more accurate due to the additional information provided by\nhuman-written references, are limited by their reliance on human input. In this\npaper, we hypothesize that the comparison methodologies used by some\nreference-based metrics to evaluate a system summary against its corresponding\nreference can be effectively adapted to assess it against its source document,\nthereby transforming these metrics into reference-free ones. Experimental\nresults support this hypothesis. After being repurposed reference-freely, the\nzero-shot BERTScore using the pretrained DeBERTa-large-MNLI model of <0.5B\nparameters consistently outperforms its original reference-based version across\nvarious aspects on the SummEval and Newsroom datasets. It also excels in\ncomparison to most existing reference-free metrics and closely competes with\nzero-shot summary evaluators based on GPT-3.5.\n","authors":["Forrest Sheng Bao","Ruixuan Tu","Ge Luo","Yinfei Yang","Hebi Li","Minghui Qiu","Youbiao He","Cen Chen"],"pdf_url":"https://arxiv.org/pdf/2212.10013v2.pdf","comment":"Accepted into Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.15180v1","updated":"2023-11-26T03:54:03Z","published":"2023-11-26T03:54:03Z","title":"Benchmarking Large Language Model Volatility","summary":" The impact of non-deterministic outputs from Large Language Models (LLMs) is\nnot well examined for financial text understanding tasks. Through a compelling\ncase study on investing in the US equity market via news sentiment analysis, we\nuncover substantial variability in sentence-level sentiment classification\nresults, underscoring the innate volatility of LLM outputs. These uncertainties\ncascade downstream, leading to more significant variations in portfolio\nconstruction and return. While tweaking the temperature parameter in the\nlanguage model decoder presents a potential remedy, it comes at the expense of\nstifled creativity. Similarly, while ensembling multiple outputs mitigates the\neffect of volatile outputs, it demands a notable computational investment. This\nwork furnishes practitioners with invaluable insights for adeptly navigating\nuncertainty in the integration of LLMs into financial decision-making,\nparticularly in scenarios dictated by non-deterministic information.\n","authors":["Boyang Yu"],"pdf_url":"https://arxiv.org/pdf/2311.15180v1.pdf","comment":"7 pages, 2 figures, Workshop on AI Safety and Robustness In Finance,\n ICAIF 2023"},{"id":"http://arxiv.org/abs/2311.06401v3","updated":"2023-11-26T02:42:13Z","published":"2023-11-10T21:32:34Z","title":"Autoregressive Language Models For Estimating the Entropy of Epic EHR\n Audit Logs","summary":" EHR audit logs are a highly granular stream of events that capture clinician\nactivities, and is a significant area of interest for research in\ncharacterizing clinician workflow on the electronic health record (EHR).\nExisting techniques to measure the complexity of workflow through EHR audit\nlogs (audit logs) involve time- or frequency-based cross-sectional aggregations\nthat are unable to capture the full complexity of a EHR session. We briefly\nevaluate the usage of transformer-based tabular language model (tabular LM) in\nmeasuring the entropy or disorderedness of action sequences within workflow and\nrelease the evaluated models publicly.\n","authors":["Benjamin C. 
Warner","Thomas Kannampallil","Seunghwan Kim"],"pdf_url":"https://arxiv.org/pdf/2311.06401v3.pdf","comment":"Extended Abstract presented at Machine Learning for Health (ML4H)\n symposium 2023, December 10th, 2023, New Orleans, United States, 10 pages"},{"id":"http://arxiv.org/abs/2206.10498v4","updated":"2023-11-26T01:15:41Z","published":"2022-06-21T16:15:27Z","title":"PlanBench: An Extensible Benchmark for Evaluating Large Language Models\n on Planning and Reasoning about Change","summary":" Generating plans of action, and reasoning about change have long been\nconsidered a core competence of intelligent agents. It is thus no surprise that\nevaluating the planning and reasoning capabilities of large language models\n(LLMs) has become a hot topic of research. Most claims about LLM planning\ncapabilities are however based on common sense tasks-where it becomes hard to\ntell whether LLMs are planning or merely retrieving from their vast world\nknowledge. There is a strong need for systematic and extensible planning\nbenchmarks with sufficient diversity to evaluate whether LLMs have innate\nplanning capabilities. Motivated by this, we propose PlanBench, an extensible\nbenchmark suite based on the kinds of domains used in the automated planning\ncommunity, especially in the International Planning Competition, to test the\ncapabilities of LLMs in planning or reasoning about actions and change.\nPlanBench provides sufficient diversity in both the task domains and the\nspecific planning capabilities. Our studies also show that on many critical\ncapabilities-including plan generation-LLM performance falls quite short, even\nwith the SOTA models. PlanBench can thus function as a useful marker of\nprogress of LLMs in planning and reasoning.\n","authors":["Karthik Valmeekam","Matthew Marquez","Alberto Olmo","Sarath Sreedharan","Subbarao Kambhampati"],"pdf_url":"https://arxiv.org/pdf/2206.10498v4.pdf","comment":"NeurIPS 2023 Track on Datasets and Benchmarks"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2304.08138v2","updated":"2023-11-26T23:52:43Z","published":"2023-04-17T10:42:30Z","title":"Typos-aware Bottlenecked Pre-Training for Robust Dense Retrieval","summary":" Current dense retrievers (DRs) are limited in their ability to effectively\nprocess misspelled queries, which constitute a significant portion of query\ntraffic in commercial search engines. The main issue is that the pre-trained\nlanguage model-based encoders used by DRs are typically trained and fine-tuned\nusing clean, well-curated text data. Misspelled queries are typically not found\nin the data used for training these models, and thus misspelled queries\nobserved at inference time are out-of-distribution compared to the data used\nfor training and fine-tuning. Previous efforts to address this issue have\nfocused on \\textit{fine-tuning} strategies, but their effectiveness on\nmisspelled queries remains lower than that of pipelines that employ separate\nstate-of-the-art spell-checking components. To address this challenge, we\npropose ToRoDer (TypOs-aware bottlenecked pre-training for RObust DEnse\nRetrieval), a novel re-training strategy for DRs that increases their\nrobustness to misspelled queries while preserving their effectiveness in\ndownstream retrieval tasks. ToRoDer utilizes an encoder-decoder architecture\nwhere the encoder takes misspelled text with masked tokens as input and outputs\nbottlenecked information to the decoder. 
The decoder then takes as input the\nbottlenecked embeddings, along with token embeddings of the original text with\nthe misspelled tokens masked out. The pre-training task is to recover the\nmasked tokens for both the encoder and decoder. Our extensive experimental\nresults and detailed ablation studies show that DRs pre-trained with ToRoDer\nexhibit significantly higher effectiveness on misspelled queries, sensibly\nclosing the gap with pipelines that use a separate, complex spell-checker\ncomponent, while retaining their effectiveness on correctly spelled queries.\n","authors":["Shengyao Zhuang","Linjun Shou","Jian Pei","Ming Gong","Houxing Ren","Guido Zuccon","Daxin Jiang"],"pdf_url":"https://arxiv.org/pdf/2304.08138v2.pdf","comment":"10 pages, accepted at SIGIR-AP"},{"id":"http://arxiv.org/abs/2311.15426v1","updated":"2023-11-26T21:16:12Z","published":"2023-11-26T21:16:12Z","title":"Data Augmentation for Sample Efficient and Robust Document Ranking","summary":" Contextual ranking models have delivered impressive performance improvements\nover classical models in the document ranking task. However, these highly\nover-parameterized models tend to be data-hungry and require large amounts of\ndata even for fine-tuning. In this paper, we propose data-augmentation methods\nfor effective and robust ranking performance. One of the key benefits of using\ndata augmentation is in achieving sample efficiency or learning effectively\nwhen we have only a small amount of training data. We propose supervised and\nunsupervised data augmentation schemes by creating training data using parts of\nthe relevant documents in the query-document pairs. We then adapt a family of\ncontrastive losses for the document ranking task that can exploit the augmented\ndata to learn an effective ranking model. Our extensive experiments on subsets\nof the MS MARCO and TREC-DL test sets show that data augmentation, along with\nthe ranking-adapted contrastive losses, results in performance improvements\nunder most dataset sizes. Apart from sample efficiency, we conclusively show\nthat data augmentation results in robust models when transferred to\nout-of-domain benchmarks. Our performance improvements in in-domain and more\nprominently in out-of-domain benchmarks show that augmentation regularizes the\nranking model and improves its robustness and generalization capability.\n","authors":["Abhijit Anand","Jurek Leonhardt","Jaspreet Singh","Koustav Rudra","Avishek Anand"],"pdf_url":"https://arxiv.org/pdf/2311.15426v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19453v2","updated":"2023-11-26T12:40:27Z","published":"2023-10-30T11:25:03Z","title":"FLIP: Towards Fine-grained Alignment between ID-based Models and\n Pretrained Language Models for CTR Prediction","summary":" Click-through rate (CTR) prediction plays as a core function module in\nvarious personalized online services. The traditional ID-based models for CTR\nprediction take as inputs the one-hot encoded ID features of tabular modality,\nwhich capture the collaborative signals via feature interaction modeling. But\nthe one-hot encoding discards the semantic information conceived in the\noriginal feature texts. Recently, the emergence of Pretrained Language Models\n(PLMs) has given rise to another paradigm, which takes as inputs the sentences\nof textual modality obtained by hard prompt templates and adopts PLMs to\nextract the semantic knowledge. 
However, PLMs generally tokenize the input text\ndata into subword tokens and ignore field-wise collaborative signals.\nTherefore, these two lines of research focus on different characteristics of\nthe same input data (i.e., textual and tabular modalities), forming a distinct\ncomplementary relationship with each other. In this paper, we propose to\nconduct Fine-grained feature-level ALignment between ID-based Models and\nPretrained Language Models (FLIP) for CTR prediction. We design a novel joint\nreconstruction pretraining task for both masked language and tabular modeling.\nSpecifically, the masked data of one modality (i.e., tokens or features) has to\nbe recovered with the help of the other modality, which establishes the\nfeature-level interaction and alignment via sufficient mutual information\nextraction between dual modalities. Moreover, we propose to jointly finetune\nthe ID-based model and PLM for downstream CTR prediction tasks, thus achieving\nsuperior performance by combining the advantages of both models. Extensive\nexperiments on three real-world datasets demonstrate that FLIP outperforms SOTA\nbaselines, and is highly compatible for various ID-based models and PLMs.\n","authors":["Hangyu Wang","Jianghao Lin","Xiangyang Li","Bo Chen","Chenxu Zhu","Ruiming Tang","Weinan Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2310.19453v2.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2308.10778v2","updated":"2023-11-26T11:32:59Z","published":"2023-08-21T15:09:19Z","title":"A Topology-aware Analysis of Graph Collaborative Filtering","summary":" The successful integration of graph neural networks into recommender systems\n(RSs) has led to a novel paradigm in collaborative filtering (CF), graph\ncollaborative filtering (graph CF). By representing user-item data as an\nundirected, bipartite graph, graph CF utilizes short- and long-range\nconnections to extract collaborative signals that yield more accurate user\npreferences than traditional CF methods. Although the recent literature\nhighlights the efficacy of various algorithmic strategies in graph CF, the\nimpact of datasets and their topological features on recommendation performance\nis yet to be studied. To fill this gap, we propose a topology-aware analysis of\ngraph CF. In this study, we (i) take some widely-adopted recommendation\ndatasets and use them to generate a large set of synthetic sub-datasets through\ntwo state-of-the-art graph sampling methods, (ii) measure eleven of their\nclassical and topological characteristics, and (iii) estimate the accuracy\ncalculated on the generated sub-datasets considering four popular and recent\ngraph-based RSs (i.e., LightGCN, DGCF, UltraGCN, and SVD-GCN). Finally, the\ninvestigation presents an explanatory framework that reveals the linear\nrelationships between characteristics and accuracy measures. 
The results,\nstatistically validated under different graph sampling settings, confirm the\nexistence of solid dependencies between topological characteristics and\naccuracy in the graph-based recommendation, offering a new perspective on how\nto interpret graph CF.\n","authors":["Daniele Malitesta","Claudio Pomo","Vito Walter Anelli","Alberto Carlo Maria Mancino","Eugenio Di Sciascio","Tommaso Di Noia"],"pdf_url":"https://arxiv.org/pdf/2308.10778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14742v1","updated":"2023-11-26T07:34:18Z","published":"2023-11-26T07:34:18Z","title":"Query-LIFE: Query-aware Language Image Fusion Embedding for E-Commerce\n Relevance","summary":" Relevance module plays a fundamental role in e-commerce search as they are\nresponsible for selecting relevant products from thousands of items based on\nuser queries, thereby enhancing users experience and efficiency. The\ntraditional approach models the relevance based product titles and queries, but\nthe information in titles alone maybe insufficient to describe the products\ncompletely. A more general optimization approach is to further leverage product\nimage information. In recent years, vision-language pre-training models have\nachieved impressive results in many scenarios, which leverage contrastive\nlearning to map both textual and visual features into a joint embedding space.\nIn e-commerce, a common practice is to fine-tune on the pre-trained model based\non e-commerce data. However, the performance is sub-optimal because the\nvision-language pre-training models lack of alignment specifically designed for\nqueries. In this paper, we propose a method called Query-LIFE (Query-aware\nLanguage Image Fusion Embedding) to address these challenges. Query-LIFE\nutilizes a query-based multimodal fusion to effectively incorporate the image\nand title based on the product types. Additionally, it employs query-aware\nmodal alignment to enhance the accuracy of the comprehensive representation of\nproducts. Furthermore, we design GenFilt, which utilizes the generation\ncapability of large models to filter out false negative samples and further\nimprove the overall performance of the contrastive learning task in the model.\nExperiments have demonstrated that Query-LIFE outperforms existing baselines.\nWe have conducted ablation studies and human evaluations to validate the\neffectiveness of each module within Query-LIFE. Moreover, Query-LIFE has been\ndeployed on Miravia Search, resulting in improved both relevance and conversion\nefficiency.\n","authors":["Hai Zhu","Yuankai Guo","Ronggang Dou","Kai Liu"],"pdf_url":"https://arxiv.org/pdf/2311.14742v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2302.05543v3","updated":"2023-11-26T22:26:12Z","published":"2023-02-10T23:12:37Z","title":"Adding Conditional Control to Text-to-Image Diffusion Models","summary":" We present ControlNet, a neural network architecture to add spatial\nconditioning controls to large, pretrained text-to-image diffusion models.\nControlNet locks the production-ready large diffusion models, and reuses their\ndeep and robust encoding layers pretrained with billions of images as a strong\nbackbone to learn a diverse set of conditional controls. The neural\narchitecture is connected with \"zero convolutions\" (zero-initialized\nconvolution layers) that progressively grow the parameters from zero and ensure\nthat no harmful noise could affect the finetuning. 
We test various conditioning\ncontrols, eg, edges, depth, segmentation, human pose, etc, with Stable\nDiffusion, using single or multiple conditions, with or without prompts. We\nshow that the training of ControlNets is robust with small (<50k) and large\n(>1m) datasets. Extensive results show that ControlNet may facilitate wider\napplications to control image diffusion models.\n","authors":["Lvmin Zhang","Anyi Rao","Maneesh Agrawala"],"pdf_url":"https://arxiv.org/pdf/2302.05543v3.pdf","comment":"Codes and Supplementary Material:\n https://github.com/lllyasviel/ControlNet"},{"id":"http://arxiv.org/abs/2307.14335v2","updated":"2023-11-26T14:12:37Z","published":"2023-07-26T17:54:04Z","title":"WavJourney: Compositional Audio Creation with Large Language Models","summary":" Despite breakthroughs in audio generation models, their capabilities are\noften confined to domain-specific conditions such as speech transcriptions and\naudio captions. However, real-world audio creation aims to generate harmonious\naudio containing various elements such as speech, music, and sound effects with\ncontrollable conditions, which is challenging to address using existing audio\ngeneration systems. We present WavJourney, a novel framework that leverages\nLarge Language Models (LLMs) to connect various audio models for audio\ncreation. WavJourney allows users to create storytelling audio content with\ndiverse audio elements simply from textual descriptions. Specifically, given a\ntext instruction, WavJourney first prompts LLMs to generate an audio script\nthat serves as a structured semantic representation of audio elements. The\naudio script is then converted into a computer program, where each line of the\nprogram calls a task-specific audio generation model or computational operation\nfunction. The computer program is then executed to obtain a compositional and\ninterpretable solution for audio creation. Experimental results suggest that\nWavJourney is capable of synthesizing realistic audio aligned with\ntextually-described semantic, spatial and temporal conditions, achieving\nstate-of-the-art results on text-to-audio generation benchmarks. Additionally,\nwe introduce a new multi-genre story benchmark. Subjective evaluations\ndemonstrate the potential of WavJourney in crafting engaging storytelling audio\ncontent from text. We further demonstrate that WavJourney can facilitate\nhuman-machine co-creation in multi-round dialogues. To foster future research,\nthe code and synthesized audio are available at:\nhttps://audio-agi.github.io/WavJourney_demopage/.\n","authors":["Xubo Liu","Zhongkai Zhu","Haohe Liu","Yi Yuan","Meng Cui","Qiushi Huang","Jinhua Liang","Yin Cao","Qiuqiang Kong","Mark D. Plumbley","Wenwu Wang"],"pdf_url":"https://arxiv.org/pdf/2307.14335v2.pdf","comment":"GitHub: https://github.com/Audio-AGI/WavJourney"},{"id":"http://arxiv.org/abs/2311.15230v1","updated":"2023-11-26T08:04:43Z","published":"2023-11-26T08:04:43Z","title":"GAIA: Zero-shot Talking Avatar Generation","summary":" Zero-shot talking avatar generation aims at synthesizing natural talking\nvideos from speech and a single portrait image. Previous methods have relied on\ndomain-specific heuristics such as warping-based motion representation and 3D\nMorphable Models, which limit the naturalness and diversity of the generated\navatars. In this work, we introduce GAIA (Generative AI for Avatar), which\neliminates the domain priors in talking avatar generation. 
In light of the\nobservation that the speech only drives the motion of the avatar while the\nappearance of the avatar and the background typically remain the same\nthroughout the entire video, we divide our approach into two stages: 1)\ndisentangling each frame into motion and appearance representations; 2)\ngenerating motion sequences conditioned on the speech and reference portrait\nimage. We collect a large-scale high-quality talking avatar dataset and train\nthe model on it with different scales (up to 2B parameters). Experimental\nresults verify the superiority, scalability, and flexibility of GAIA as 1) the\nresulting model beats previous baseline models in terms of naturalness,\ndiversity, lip-sync quality, and visual quality; 2) the framework is scalable\nsince larger models yield better results; 3) it is general and enables\ndifferent applications like controllable talking avatar generation and\ntext-instructed avatar generation.\n","authors":["Tianyu He","Junliang Guo","Runyi Yu","Yuchi Wang","Jialiang Zhu","Kaikai An","Leyi Li","Xu Tan","Chunyu Wang","Han Hu","HsiangTao Wu","Sheng Zhao","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2311.15230v1.pdf","comment":"Project page: https://microsoft.github.io/GAIA/"}]},"2023-11-25T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2311.15131v1","updated":"2023-11-25T22:41:23Z","published":"2023-11-25T22:41:23Z","title":"Localizing Lying in Llama: Understanding Instructed Dishonesty on\n True-False Questions Through Prompting, Probing, and Patching","summary":" Large language models (LLMs) demonstrate significant knowledge through their\noutputs, though it is often unclear whether false outputs are due to a lack of\nknowledge or dishonesty. In this paper, we investigate instructed dishonesty,\nwherein we explicitly prompt LLaMA-2-70b-chat to lie. We perform prompt\nengineering to find which prompts best induce lying behavior, and then use\nmechanistic interpretability approaches to localize where in the network this\nbehavior occurs. Using linear probing and activation patching, we localize five\nlayers that appear especially important for lying. We then find just 46\nattention heads within these layers that enable us to causally intervene such\nthat the lying model instead answers honestly. We show that these interventions\nwork robustly across many prompts and dataset splits. Overall, our work\ncontributes a greater understanding of dishonesty in LLMs so that we may hope\nto prevent it.\n","authors":["James Campbell","Richard Ren","Phillip Guo"],"pdf_url":"https://arxiv.org/pdf/2311.15131v1.pdf","comment":"14 pages, 12 figures"},{"id":"http://arxiv.org/abs/2303.15445v3","updated":"2023-11-25T22:07:55Z","published":"2023-03-27T17:59:55Z","title":"IRFL: Image Recognition of Figurative Language","summary":" Figures of speech such as metaphors, similes, and idioms are integral parts\nof human communication. They are ubiquitous in many forms of discourse,\nallowing people to convey complex, abstract ideas and evoke emotion. As\nfigurative forms are often conveyed through multiple modalities (e.g., both\ntext and images), understanding multimodal figurative language is an important\nAI challenge, weaving together profound vision, language, commonsense and\ncultural knowledge. In this work, we develop the Image Recognition of\nFigurative Language (IRFL) dataset. 
We leverage human annotation and an\nautomatic pipeline we created to generate a multimodal dataset, and introduce\ntwo novel tasks as a benchmark for multimodal figurative language\nunderstanding. We experimented with state-of-the-art vision and language models\nand found that the best (22%) performed substantially worse than humans (97%).\nWe release our dataset, benchmark, and code, in hopes of driving the\ndevelopment of models that can better understand figurative language.\n","authors":["Ron Yosef","Yonatan Bitton","Dafna Shahaf"],"pdf_url":"https://arxiv.org/pdf/2303.15445v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.04568v2","updated":"2023-11-25T19:52:13Z","published":"2022-08-09T07:15:20Z","title":"The Impact of Data Corruption on Named Entity Recognition for\n Low-resourced Languages","summary":" Data availability and quality are major challenges in natural language\nprocessing for low-resourced languages. In particular, there is significantly\nless data available than for higher-resourced languages. This data is also\noften of low quality, rife with errors, invalid text or incorrect annotations.\nMany prior works focus on dealing with these problems, either by generating\nsynthetic data, or filtering out low-quality parts of datasets. We instead\ninvestigate these factors more deeply, by systematically measuring the effect\nof data quantity and quality on the performance of pre-trained language models\nin a low-resourced setting. Our results show that having fewer\ncompletely-labelled sentences is significantly better than having more\nsentences with missing labels; and that models can perform remarkably well with\nonly 10% of the training data. Importantly, these results are consistent across\nten low-resource languages, English, and four pre-trained models.\n","authors":["Manuel Fokam","Michael Beukman"],"pdf_url":"https://arxiv.org/pdf/2208.04568v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15110v1","updated":"2023-11-25T19:50:41Z","published":"2023-11-25T19:50:41Z","title":"Relevance feedback strategies for recall-oriented neural information\n retrieval","summary":" In a number of information retrieval applications (e.g., patent search,\nliterature review, due diligence, etc.), preventing false negatives is more\nimportant than preventing false positives. However, approaches designed to\nreduce review effort (like \"technology assisted review\") can create false\nnegatives, since they are often based on active learning systems that exclude\ndocuments automatically based on user feedback. Therefore, this research\nproposes a more recall-oriented approach to reducing review effort. More\nspecifically, through iteratively re-ranking the relevance rankings based on\nuser feedback, which is also referred to as relevance feedback. In our proposed\nmethod, the relevance rankings are produced by a BERT-based dense-vector search\nand the relevance feedback is based on cumulatively summing the queried and\nselected embeddings. 
Our results show that this method can reduce review effort\nbetween 17.85% and 59.04%, compared to a baseline approach (of no feedback),\ngiven a fixed recall target\n","authors":["Timo Kats","Peter van der Putten","Jan Scholtes"],"pdf_url":"https://arxiv.org/pdf/2311.15110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15106v1","updated":"2023-11-25T19:35:53Z","published":"2023-11-25T19:35:53Z","title":"Solving the Right Problem is Key for Translational NLP: A Case Study in\n UMLS Vocabulary Insertion","summary":" As the immense opportunities enabled by large language models become more\napparent, NLP systems will be increasingly expected to excel in real-world\nsettings. However, in many instances, powerful models alone will not yield\ntranslational NLP solutions, especially if the formulated problem is not well\naligned with the real-world task. In this work, we study the case of UMLS\nvocabulary insertion, an important real-world task in which hundreds of\nthousands of new terms, referred to as atoms, are added to the UMLS, one of the\nmost comprehensive open-source biomedical knowledge bases. Previous work aimed\nto develop an automated NLP system to make this time-consuming, costly, and\nerror-prone task more efficient. Nevertheless, practical progress in this\ndirection has been difficult to achieve due to a problem formulation and\nevaluation gap between research output and the real-world task. In order to\naddress this gap, we introduce a new formulation for UMLS vocabulary insertion\nwhich mirrors the real-world task, datasets which faithfully represent it and\nseveral strong baselines we developed through re-purposing existing solutions.\nAdditionally, we propose an effective rule-enhanced biomedical language model\nwhich enables important new model behavior, outperforms all strong baselines\nand provides measurable qualitative improvements to editors who carry out the\nUVI task. We hope this case study provides insight into the considerable\nimportance of problem formulation for the success of translational NLP\nsolutions.\n","authors":["Bernal Jimenez Gutierrez","Yuqing Mao","Vinh Nguyen","Kin Wah Fung","Yu Su","Olivier Bodenreider"],"pdf_url":"https://arxiv.org/pdf/2311.15106v1.pdf","comment":"EMNLP 2023 Findings; Code is available at\n https://github.com/OSU-NLP-Group/UMLS-Vocabulary-Insertion"},{"id":"http://arxiv.org/abs/2310.19736v3","updated":"2023-11-25T17:35:12Z","published":"2023-10-30T17:00:52Z","title":"Evaluating Large Language Models: A Comprehensive Survey","summary":" Large language models (LLMs) have demonstrated remarkable capabilities across\na broad spectrum of tasks. They have attracted significant attention and been\ndeployed in numerous downstream applications. Nevertheless, akin to a\ndouble-edged sword, LLMs also present potential risks. They could suffer from\nprivate data leaks or yield inappropriate, harmful, or misleading content.\nAdditionally, the rapid progress of LLMs raises concerns about the potential\nemergence of superintelligent systems without adequate safeguards. To\neffectively capitalize on LLM capacities as well as ensure their safe and\nbeneficial development, it is critical to conduct a rigorous and comprehensive\nevaluation of LLMs.\n This survey endeavors to offer a panoramic perspective on the evaluation of\nLLMs. We categorize the evaluation of LLMs into three major groups: knowledge\nand capability evaluation, alignment evaluation and safety evaluation. 
In\naddition to the comprehensive review on the evaluation methodologies and\nbenchmarks on these three aspects, we collate a compendium of evaluations\npertaining to LLMs' performance in specialized domains, and discuss the\nconstruction of comprehensive evaluation platforms that cover LLM evaluations\non capabilities, alignment, safety, and applicability.\n We hope that this comprehensive overview will stimulate further research\ninterests in the evaluation of LLMs, with the ultimate goal of making\nevaluation serve as a cornerstone in guiding the responsible development of\nLLMs. We envision that this will channel their evolution into a direction that\nmaximizes societal benefit while minimizing potential risks. A curated list of\nrelated papers has been publicly available at\nhttps://github.com/tjunlp-lab/Awesome-LLMs-Evaluation-Papers.\n","authors":["Zishan Guo","Renren Jin","Chuang Liu","Yufei Huang","Dan Shi"," Supryadi","Linhao Yu","Yan Liu","Jiaxuan Li","Bojian Xiong","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2310.19736v3.pdf","comment":"111 pages"},{"id":"http://arxiv.org/abs/2305.07637v3","updated":"2023-11-25T17:19:36Z","published":"2023-05-12T17:46:06Z","title":"Text2Cohort: Facilitating Intuitive Access to Biomedical Data with\n Natural Language Cohort Discovery","summary":" The Imaging Data Commons (IDC) is a cloud-based database that provides\nresearchers with open access to cancer imaging data, with the goal of\nfacilitating collaboration. However, cohort discovery within the IDC database\nhas a significant technical learning curve. Recently, large language models\n(LLM) have demonstrated exceptional utility for natural language processing\ntasks. We developed Text2Cohort, a LLM-powered toolkit to facilitate\nuser-friendly natural language cohort discovery in the IDC. Our method\ntranslates user input into IDC queries using grounding techniques and returns\nthe query's response. We evaluate Text2Cohort on 50 natural language inputs,\nfrom information extraction to cohort discovery. Our toolkit successfully\ngenerated responses with an 88% accuracy and 0.94 F1 score. We demonstrate that\nText2Cohort can enable researchers to discover and curate cohorts on IDC with\nhigh levels of accuracy using natural language in a more intuitive and\nuser-friendly way.\n","authors":["Pranav Kulkarni","Adway Kanhere","Paul H. Yi","Vishwa S. Parekh"],"pdf_url":"https://arxiv.org/pdf/2305.07637v3.pdf","comment":"5 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2311.15077v1","updated":"2023-11-25T17:05:21Z","published":"2023-11-25T17:05:21Z","title":"Multilingual self-supervised speech representations improve the speech\n recognition of low-resource African languages with codeswitching","summary":" While many speakers of low-resource languages regularly code-switch between\ntheir languages and other regional languages or English, datasets of\ncodeswitched speech are too small to train bespoke acoustic models from scratch\nor do language model rescoring. Here we propose finetuning self-supervised\nspeech representations such as wav2vec 2.0 XLSR to recognize code-switched\ndata. We find that finetuning self-supervised multilingual representations and\naugmenting them with n-gram language models trained from transcripts reduces\nabsolute word error rates by up to 20% compared to baselines of hybrid models\ntrained from scratch on code-switched data. 
Our findings suggest that in\ncircumstances with limited training data finetuning self-supervised\nrepresentations is a better performing and viable solution.\n","authors":["Tolúlopé Ògúnrèmí","Christopher D. Manning","Dan Jurafsky"],"pdf_url":"https://arxiv.org/pdf/2311.15077v1.pdf","comment":"5 pages, 1 figure. Computational Approaches to Linguistic\n Code-Switching, CALCS 2023 (co-located with EMNLP 2023)"},{"id":"http://arxiv.org/abs/2311.15055v1","updated":"2023-11-25T15:27:10Z","published":"2023-11-25T15:27:10Z","title":"Automatically Finding and Categorizing Replication Studies","summary":" In many fields of experimental science, papers that failed to replicate\ncontinue to be cited as a result of the poor discoverability of replication\nstudies. As a first step to creating a system that automatically finds\nreplication studies for a given paper, 334 replication studies and 344\nreplicated studies were collected. Replication studies could be identified in\nthe dataset based on text content at a higher rate than chance (AUROC = 0.886).\n Additionally, successful replication studies could be distinguished from\nfailed replication studies at a higher rate than chance (AUROC = 0.664).\n","authors":["Bob de Ruiter"],"pdf_url":"https://arxiv.org/pdf/2311.15055v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15054v1","updated":"2023-11-25T15:23:46Z","published":"2023-11-25T15:23:46Z","title":"Detection of developmental language disorder in Cypriot Greek children\n using a machine learning neural network algorithm","summary":" Children with developmental language disorder (DLD) encounter difficulties in\nacquiring various language structures. Early identification and intervention\nare crucial to prevent negative long-term outcomes impacting the academic,\nsocial, and emotional development of children. The study aims to develop an\nautomated method for the identification of DLD using artificial intelligence,\nspecifically a neural network machine learning algorithm. This protocol is\napplied for the first time in Cypriot Greek children, which is generally\nconsidered underresearched in the context of DLD. The neural network model was\ntrained using perceptual and production data elicited from children with DLD\nand healthy controls. The k-fold technique was used to crossvalidate the\nalgorithm. The performance of the model was evaluated using metrics such as\naccuracy, precision, recall, F1 score, and ROC/AUC curve to assess its ability\nto make accurate predictions on a set of unseen data. The results demonstrated\nhigh classification values for all metrics (between 0.92 and 0.98), indicating\nthe high accuracy of the neural model in classifying children with DLD.\nAdditionally, the variable importance analysis revealed that the language\nproduction skills of children had a more significant impact on the performance\nof the model compared to perception skills. Neural networks represent powerful\ntools for detecting DLD, providing early and quick assessments of the disorder,\nand having the potential to improve clinical outcomes.\n","authors":["Georgios P. 
Georgiou","Elena Theodorou"],"pdf_url":"https://arxiv.org/pdf/2311.15054v1.pdf","comment":"13 pages, 3 figures, journal article"},{"id":"http://arxiv.org/abs/2311.15032v1","updated":"2023-11-25T13:58:58Z","published":"2023-11-25T13:58:58Z","title":"nlpBDpatriots at BLP-2023 Task 2: A Transfer Learning Approach to Bangla\n Sentiment Analysis","summary":" In this paper, we discuss the nlpBDpatriots entry to the shared task on\nSentiment Analysis of Bangla Social Media Posts organized at the first workshop\non Bangla Language Processing (BLP) co-located with EMNLP. The main objective\nof this task is to identify the polarity of social media content using a Bangla\ndataset annotated with positive, neutral, and negative labels provided by the\nshared task organizers. Our best system for this task is a transfer learning\napproach with data augmentation which achieved a micro F1 score of 0.71. Our\nbest system ranked 12th among 30 teams that participated in the competition.\n","authors":["Dhiman Goswami","Md Nishat Raihan","Sadiya Sayara Chowdhury Puspo","Marcos Zampieri"],"pdf_url":"https://arxiv.org/pdf/2311.15032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15029v1","updated":"2023-11-25T13:47:34Z","published":"2023-11-25T13:47:34Z","title":"nlpBDpatriots at BLP-2023 Task 1: A Two-Step Classification for Violence\n Inciting Text Detection in Bangla","summary":" In this paper, we discuss the nlpBDpatriots entry to the shared task on\nViolence Inciting Text Detection (VITD) organized as part of the first workshop\non Bangla Language Processing (BLP) co-located with EMNLP. The aim of this task\nis to identify and classify the violent threats, that provoke further unlawful\nviolent acts. Our best-performing approach for the task is two-step\nclassification using back translation and multilinguality which ranked 6th out\nof 27 teams with a macro F1 score of 0.74.\n","authors":["Md Nishat Raihan","Dhiman Goswami","Sadiya Sayara Chowdhury Puspo","Marcos Zampieri"],"pdf_url":"https://arxiv.org/pdf/2311.15029v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.01828v2","updated":"2023-11-25T13:35:34Z","published":"2023-10-03T06:51:48Z","title":"Trainable Noise Model as an XAI evaluation method: application on Sobol\n for remote sensing image segmentation","summary":" eXplainable Artificial Intelligence (XAI) has emerged as an essential\nrequirement when dealing with mission-critical applications, ensuring\ntransparency and interpretability of the employed black box AI models. The\nsignificance of XAI spans various domains, from healthcare to finance, where\nunderstanding the decision-making process of deep learning algorithms is\nessential. Most AI-based computer vision models are often black boxes; hence,\nproviding explainability of deep neural networks in image processing is crucial\nfor their wide adoption and deployment in medical image analysis, autonomous\ndriving, and remote sensing applications. Recently, several XAI methods for\nimage classification tasks have been introduced. On the contrary, image\nsegmentation has received comparatively less attention in the context of\nexplainability, although it is a fundamental task in computer vision\napplications, especially in remote sensing. Only some research proposes\ngradient-based XAI algorithms for image segmentation. This paper adapts the\nrecent gradient-free Sobol XAI method for semantic segmentation. 
To measure the\nperformance of the Sobol method for segmentation, we propose a quantitative XAI\nevaluation method based on a learnable noise model. The main objective of this\nmodel is to induce noise on the explanation maps, where higher induced noise\nsignifies low accuracy and vice versa. A benchmark analysis is conducted to\nevaluate and compare performance of three XAI methods, including Seg-Grad-CAM,\nSeg-Grad-CAM++ and Seg-Sobol using the proposed noise-based evaluation\ntechnique. This constitutes the first attempt to run and evaluate XAI methods\nusing high-resolution satellite images.\n","authors":["Hossein Shreim","Abdul Karim Gizzini","Ali J. Ghandour"],"pdf_url":"https://arxiv.org/pdf/2310.01828v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15023v1","updated":"2023-11-25T13:27:22Z","published":"2023-11-25T13:27:22Z","title":"Offensive Language Identification in Transliterated and Code-Mixed\n Bangla","summary":" Identifying offensive content in social media is vital for creating safe\nonline communities. Several recent studies have addressed this problem by\ncreating datasets for various languages. In this paper, we explore offensive\nlanguage identification in texts with transliterations and code-mixing,\nlinguistic phenomena common in multilingual societies, and a known challenge\nfor NLP systems. We introduce TB-OLID, a transliterated Bangla offensive\nlanguage dataset containing 5,000 manually annotated comments. We train and\nfine-tune machine learning models on TB-OLID, and we evaluate their results on\nthis dataset. Our results show that English pre-trained transformer-based\nmodels, such as fBERT and HateBERT achieve the best performance on this\ndataset.\n","authors":["Md Nishat Raihan","Umma Hani Tanmoy","Anika Binte Islam","Kai North","Tharindu Ranasinghe","Antonios Anastasopoulos","Marcos Zampieri"],"pdf_url":"https://arxiv.org/pdf/2311.15023v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18387v2","updated":"2023-11-25T13:13:01Z","published":"2023-10-27T09:59:35Z","title":"OffMix-3L: A Novel Code-Mixed Dataset in Bangla-English-Hindi for\n Offensive Language Identification","summary":" Code-mixing is a well-studied linguistic phenomenon when two or more\nlanguages are mixed in text or speech. Several works have been conducted on\nbuilding datasets and performing downstream NLP tasks on code-mixed data.\nAlthough it is not uncommon to observe code-mixing of three or more languages,\nmost available datasets in this domain contain code-mixed data from only two\nlanguages. In this paper, we introduce OffMix-3L, a novel offensive language\nidentification dataset containing code-mixed data from three different\nlanguages. 
We experiment with several models on this dataset and observe that\nBanglishBERT outperforms other transformer-based models and GPT-3.5.\n","authors":["Dhiman Goswami","Md Nishat Raihan","Antara Mahmud","Antonios Anastasopoulos","Marcos Zampieri"],"pdf_url":"https://arxiv.org/pdf/2310.18387v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2310.18023"},{"id":"http://arxiv.org/abs/2311.15016v1","updated":"2023-11-25T12:47:39Z","published":"2023-11-25T12:47:39Z","title":"E-CORE: Emotion Correlation Enhanced Empathetic Dialogue Generation","summary":" Achieving empathy is a crucial step toward humanized dialogue systems.\nCurrent approaches for empathetic dialogue generation mainly perceive an\nemotional label to generate an empathetic response conditioned on it, which\nsimply treat emotions independently, but ignore the intrinsic emotion\ncorrelation in dialogues, resulting in inaccurate emotion perception and\nunsuitable response generation. In this paper, we propose a novel emotion\ncorrelation enhanced empathetic dialogue generation framework, which\ncomprehensively realizes emotion correlation learning, utilization, and\nsupervising. Specifically, a multi-resolution emotion graph is devised to\ncapture context-based emotion interactions from different resolutions, further\nmodeling emotion correlation. Then we propose an emotion correlation enhanced\ndecoder, with a novel correlation-aware aggregation and soft/hard strategy,\nrespectively improving the emotion perception and response generation.\nExperimental results on the benchmark dataset demonstrate the superiority of\nour model in both empathetic perception and expression.\n","authors":["Fengyi Fu","Lei Zhang","Quan Wang","Zhendong Mao"],"pdf_url":"https://arxiv.org/pdf/2311.15016v1.pdf","comment":"19 pages, 6 figures"},{"id":"http://arxiv.org/abs/2308.00802v3","updated":"2023-11-25T09:58:18Z","published":"2023-08-01T19:34:18Z","title":"GRDD: A Dataset for Greek Dialectal NLP","summary":" In this paper, we present a dataset for the computational study of a number\nof Modern Greek dialects. It consists of raw text data from four dialects of\nModern Greek, Cretan, Pontic, Northern Greek and Cypriot Greek. The dataset is\nof considerable size, albeit imbalanced, and presents the first attempt to\ncreate large scale dialectal resources of this type for Modern Greek dialects.\nWe then use the dataset to perform dialect idefntification. We experiment with\ntraditional ML algorithms, as well as simple DL architectures. The results show\nvery good performance on the task, potentially revealing that the dialects in\nquestion have distinct enough characteristics allowing even simple ML models to\nperform well on the task. Error analysis is performed for the top performing\nalgorithms showing that in a number of cases the errors are due to insufficient\ndataset cleaning.\n","authors":["Stergios Chatzikyriakidis","Chatrine Qwaider","Ilias Kolokousis","Christina Koula","Dimitris Papadakis","Efthymia Sakellariou"],"pdf_url":"https://arxiv.org/pdf/2308.00802v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14966v1","updated":"2023-11-25T08:58:07Z","published":"2023-11-25T08:58:07Z","title":"Walking a Tightrope -- Evaluating Large Language Models in High-Risk\n Domains","summary":" High-risk domains pose unique challenges that require language models to\nprovide accurate and safe responses. 
Despite the great success of large\nlanguage models (LLMs), such as ChatGPT and its variants, their performance in\nhigh-risk domains remains unclear. Our study delves into an in-depth analysis\nof the performance of instruction-tuned LLMs, focusing on factual accuracy and\nsafety adherence. To comprehensively assess the capabilities of LLMs, we\nconduct experiments on six NLP datasets including question answering and\nsummarization tasks within two high-risk domains: legal and medical. Further\nqualitative analysis highlights the existing limitations inherent in current\nLLMs when evaluating in high-risk domains. This underscores the essential\nnature of not only improving LLM capabilities but also prioritizing the\nrefinement of domain-specific metrics, and embracing a more human-centric\napproach to enhance safety and factual reliability. Our findings advance the\nfield toward the concerns of properly evaluating LLMs in high-risk domains,\naiming to steer the adaptability of LLMs in fulfilling societal obligations and\naligning with forthcoming regulations, such as the EU AI Act.\n","authors":["Chia-Chien Hung","Wiem Ben Rim","Lindsay Frost","Lars Bruckner","Carolin Lawrence"],"pdf_url":"https://arxiv.org/pdf/2311.14966v1.pdf","comment":"EMNLP 2023 Workshop on Benchmarking Generalisation in NLP (GenBench)"},{"id":"http://arxiv.org/abs/2311.14949v1","updated":"2023-11-25T07:13:06Z","published":"2023-11-25T07:13:06Z","title":"Vector-Quantized Prompt Learning for Paraphrase Generation","summary":" Deep generative modeling of natural languages has achieved many successes,\nsuch as producing fluent sentences and translating from one language into\nanother. However, the development of generative modeling techniques for\nparaphrase generation still lags behind largely due to the challenges in\naddressing the complex conflicts between expression diversity and semantic\npreservation. This paper proposes to generate diverse and high-quality\nparaphrases by exploiting the pre-trained models with instance-dependent\nprompts. To learn generalizable prompts, we assume that the number of abstract\ntransforming patterns of paraphrase generation (governed by prompts) is finite\nand usually not large. Therefore, we present vector-quantized prompts as the\ncues to control the generation of pre-trained models. Extensive experiments\ndemonstrate that the proposed method achieves new state-of-art results on three\nbenchmark datasets, including Quora, Wikianswers, and MSCOCO. We will release\nall the code upon acceptance.\n","authors":["Haotian Luo","Yixin Liu","Peidong Liu","Xianggen Liu"],"pdf_url":"https://arxiv.org/pdf/2311.14949v1.pdf","comment":"EMNLP Findings, 2023"},{"id":"http://arxiv.org/abs/2311.14919v1","updated":"2023-11-25T03:38:14Z","published":"2023-11-25T03:38:14Z","title":"Faster Minimum Bayes Risk Decoding with Confidence-based Pruning","summary":" Minimum Bayes risk (MBR) decoding outputs the hypothesis with the highest\nexpected utility over the model distribution for some utility function. It has\nbeen shown to improve accuracy over beam search in conditional language\ngeneration problems and especially neural machine translation, in both human\nand automatic evaluations. However, the standard sampling-based algorithm for\nMBR is substantially more computationally expensive than beam search, requiring\na large number of samples as well as a quadratic number of calls to the utility\nfunction, limiting its applicability. 
We describe an algorithm for MBR which\ngradually grows the number of samples used to estimate the utility while\npruning hypotheses that are unlikely to have the highest utility according to\nconfidence estimates obtained with bootstrap sampling. Our method requires\nfewer samples and drastically reduces the number of calls to the utility\nfunction compared to standard MBR while being statistically indistinguishable\nin terms of accuracy. We demonstrate the effectiveness of our approach in\nexperiments on three language pairs, using chrF++ and COMET as\nutility/evaluation metrics.\n","authors":["Julius Cheng","Andreas Vlachos"],"pdf_url":"https://arxiv.org/pdf/2311.14919v1.pdf","comment":"Updated from EMNLP 2023 version: typo fix, minor math notation\n change, updated citation"},{"id":"http://arxiv.org/abs/2311.14901v1","updated":"2023-11-25T02:31:22Z","published":"2023-11-25T02:31:22Z","title":"Code Search Debiasing:Improve Search Results beyond Overall Ranking\n Performance","summary":" Code search engine is an essential tool in software development. Many code\nsearch methods have sprung up, focusing on the overall ranking performance of\ncode search. In this paper, we study code search from another perspective by\nanalyzing the bias of code search models. Biased code search engines provide\npoor user experience, even though they show promising overall performance. Due\nto different development conventions (e.g., prefer long queries or\nabbreviations), some programmers will find the engine useful, while others may\nfind it hard to get desirable search results. To mitigate biases, we develop a\ngeneral debiasing framework that employs reranking to calibrate search results.\nIt can be easily plugged into existing engines and handle new code search\nbiases discovered in the future. Experiments show that our framework can\neffectively reduce biases. Meanwhile, the overall ranking performance of code\nsearch gets improved after debiasing.\n","authors":["Sheng Zhang","Hui Li","Yanlin Wang","Zhao Wei","Yong Xiu","Juhong Wang","Rongong Ji"],"pdf_url":"https://arxiv.org/pdf/2311.14901v1.pdf","comment":"Accepted to Findings of EMNLP 2023. 11 pages"},{"id":"http://arxiv.org/abs/2310.10520v3","updated":"2023-11-25T02:09:35Z","published":"2023-10-16T15:38:02Z","title":"Semantic Parsing by Large Language Models for Intricate Updating\n Strategies of Zero-Shot Dialogue State Tracking","summary":" Zero-shot Dialogue State Tracking (DST) addresses the challenge of acquiring\nand annotating task-oriented dialogues, which can be time-consuming and costly.\nHowever, DST extends beyond simple slot-filling and requires effective updating\nstrategies for tracking dialogue state as conversations progress. In this\npaper, we propose ParsingDST, a new In-Context Learning (ICL) method, to\nintroduce additional intricate updating strategies in zero-shot DST. Our\napproach reformulates the DST task by leveraging powerful Large Language Models\n(LLMs) and translating the original dialogue text to JSON through semantic\nparsing as an intermediate state. We also design a novel framework that\nincludes more modules to ensure the effectiveness of updating strategies in the\ntext-to-JSON process. Experimental results demonstrate that our approach\noutperforms existing zero-shot DST methods on MultiWOZ, exhibiting significant\nimprovements in Joint Goal Accuracy (JGA) and slot accuracy compared to\nexisting ICL methods. 
Our code has been released.\n","authors":["Yuxiang Wu","Guanting Dong","Weiran Xu"],"pdf_url":"https://arxiv.org/pdf/2310.10520v3.pdf","comment":"Accepted to the Findings of EMNLP 2023 (Short Paper)"},{"id":"http://arxiv.org/abs/2308.10819v3","updated":"2023-11-25T00:25:36Z","published":"2023-08-17T06:21:50Z","title":"Evaluating the Instruction-Following Robustness of Large Language Models\n to Prompt Injection","summary":" Large Language Models (LLMs) have demonstrated exceptional proficiency in\ninstruction-following, becoming increasingly crucial across various\napplications. However, this capability brings with it the risk of prompt\ninjection attacks, where attackers inject instructions into LLMs' input to\nelicit undesirable actions or content. Understanding the robustness of LLMs\nagainst such attacks is vital for their safe implementation. In this work, we\nestablish a benchmark to evaluate the robustness of instruction-following LLMs\nagainst prompt injection attacks. Our objective is to determine the extent to\nwhich LLMs can be influenced by injected instructions and their ability to\ndifferentiate between these injected and original target instructions. Through\nextensive experiments with leading instruction-following LLMs, we uncover\nsignificant vulnerabilities in their robustness to such attacks. Our results\nindicate that some models are overly tuned to follow any embedded instructions\nin the prompt, overly focusing on the latter parts of the prompt without fully\ngrasping the entire context. By contrast, models with a better grasp of the\ncontext and instruction-following capabilities will potentially be more\nsusceptible to compromise by injected instructions. This underscores the need\nto shift the focus from merely enhancing LLMs' instruction-following\ncapabilities to improving their overall comprehension of prompts and\ndiscernment of instructions that are appropriate to follow. We hope our\nin-depth analysis offers insights into the underlying causes of these\nvulnerabilities, aiding in the development of future solutions. Code and data\nare available at\nhttps://github.com/Leezekun/instruction-following-robustness-eval\n","authors":["Zekun Li","Baolin Peng","Pengcheng He","Xifeng Yan"],"pdf_url":"https://arxiv.org/pdf/2308.10819v3.pdf","comment":"The data and code can be found at\n https://github.com/Leezekun/instruction-following-robustness-eval"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2305.07637v3","updated":"2023-11-25T17:19:36Z","published":"2023-05-12T17:46:06Z","title":"Text2Cohort: Facilitating Intuitive Access to Biomedical Data with\n Natural Language Cohort Discovery","summary":" The Imaging Data Commons (IDC) is a cloud-based database that provides\nresearchers with open access to cancer imaging data, with the goal of\nfacilitating collaboration. However, cohort discovery within the IDC database\nhas a significant technical learning curve. Recently, large language models\n(LLM) have demonstrated exceptional utility for natural language processing\ntasks. We developed Text2Cohort, a LLM-powered toolkit to facilitate\nuser-friendly natural language cohort discovery in the IDC. Our method\ntranslates user input into IDC queries using grounding techniques and returns\nthe query's response. We evaluate Text2Cohort on 50 natural language inputs,\nfrom information extraction to cohort discovery. Our toolkit successfully\ngenerated responses with an 88% accuracy and 0.94 F1 score. 
We demonstrate that\nText2Cohort can enable researchers to discover and curate cohorts on IDC with\nhigh levels of accuracy using natural language in a more intuitive and\nuser-friendly way.\n","authors":["Pranav Kulkarni","Adway Kanhere","Paul H. Yi","Vishwa S. Parekh"],"pdf_url":"https://arxiv.org/pdf/2305.07637v3.pdf","comment":"5 pages, 3 figures, 2 tables"},{"id":"http://arxiv.org/abs/2310.20189v2","updated":"2023-11-25T11:46:49Z","published":"2023-10-31T05:16:54Z","title":"LFG: A Generative Network for Real-Time Recommendation","summary":" Recommender systems are essential information technologies today, and\nrecommendation algorithms combined with deep learning have become a research\nhotspot in this field. The recommendation model known as LFM (Latent Factor\nModel), which captures latent features through matrix factorization and\ngradient descent to fit user preferences, has given rise to various\nrecommendation algorithms that bring new improvements in recommendation\naccuracy. However, collaborative filtering recommendation models based on LFM\nlack flexibility and has shortcomings for real-time recommendations, as they\nneed to redo the matrix factorization and retrain using gradient descent when\nnew users arrive. In response to this, this paper innovatively proposes a\nLatent Factor Generator (LFG) network, and set the movie recommendation as\nresearch theme. The LFG dynamically generates user latent factors through deep\nneural networks without the need for re-factorization or retrain. Experimental\nresults indicate that the LFG recommendation model outperforms traditional\nmatrix factorization algorithms in recommendation accuracy, providing an\neffective solution to the challenges of real-time recommendations with LFM.\n","authors":["Junyi Liu"],"pdf_url":"https://arxiv.org/pdf/2310.20189v2.pdf","comment":"9 pages, 1 figure, 4 tables. Source code would be uploaded to github\n soon"},{"id":"http://arxiv.org/abs/2311.14968v1","updated":"2023-11-25T08:59:45Z","published":"2023-11-25T08:59:45Z","title":"Hide Your Model: A Parameter Transmission-free Federated Recommender\n System","summary":" With the growing concerns regarding user data privacy, Federated Recommender\nSystem (FedRec) has garnered significant attention recently due to its\nprivacy-preserving capabilities. Existing FedRecs generally adhere to a\nlearning protocol in which a central server shares a global recommendation\nmodel with clients, and participants achieve collaborative learning by\nfrequently communicating the model's public parameters. Nevertheless, this\nlearning framework has two drawbacks that limit its practical usability: (1) It\nnecessitates a global-sharing recommendation model; however, in real-world\nscenarios, information related to the recommender model, including its\nalgorithm and parameters, constitutes the platforms' intellectual property.\nHence, service providers are unlikely to release such information actively. (2)\nThe communication costs of model parameter transmission are expensive since the\nmodel parameters are usually high-dimensional matrices. With the model size\nincreasing, the communication burden will be the bottleneck for such\ntraditional FedRecs.\n Given the above limitations, this paper introduces a novel parameter\ntransmission-free federated recommendation framework that balances the\nprotection between users' data privacy and platforms' model privacy, namely\nPTF-FedRec. 
Specifically, participants in PTF-FedRec collaboratively exchange\nknowledge by sharing their predictions within a privacy-preserving mechanism.\nThrough this way, the central server can learn a recommender model without\ndisclosing its model parameters or accessing clients' raw data, preserving both\nthe server's model privacy and users' data privacy. Besides, since clients and\nthe central server only need to communicate prediction scores which are just a\nfew real numbers, the overhead is significantly reduced compared to traditional\nFedRecs.\n","authors":["Wei Yuan","Chaoqun Yang","Liang Qu","Quoc Viet Hung Nguyen","Jianxin Li","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2311.14968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14318v3","updated":"2023-11-25T06:02:47Z","published":"2023-10-22T14:41:10Z","title":"Intent Contrastive Learning with Cross Subsequences for Sequential\n Recommendation","summary":" The user purchase behaviors are mainly influenced by their intentions (e.g.,\nbuying clothes for decoration, buying brushes for painting, etc.). Modeling a\nuser's latent intention can significantly improve the performance of\nrecommendations. Previous works model users' intentions by considering the\npredefined label in auxiliary information or introducing stochastic data\naugmentation to learn purposes in the latent space. However, the auxiliary\ninformation is sparse and not always available for recommender systems, and\nintroducing stochastic data augmentation may introduce noise and thus change\nthe intentions hidden in the sequence. Therefore, leveraging user intentions\nfor sequential recommendation (SR) can be challenging because they are\nfrequently varied and unobserved. In this paper, Intent contrastive learning\nwith Cross Subsequences for sequential Recommendation (ICSRec) is proposed to\nmodel users' latent intentions. Specifically, ICSRec first segments a user's\nsequential behaviors into multiple subsequences by using a dynamic sliding\noperation and takes these subsequences into the encoder to generate the\nrepresentations for the user's intentions. To tackle the problem of no explicit\nlabels for purposes, ICSRec assumes different subsequences with the same target\nitem may represent the same intention and proposes a coarse-grain intent\ncontrastive learning to push these subsequences closer. Then, fine-grain intent\ncontrastive learning is mentioned to capture the fine-grain intentions of\nsubsequences in sequential behaviors. Extensive experiments conducted on four\nreal-world datasets demonstrate the superior performance of the proposed ICSRec\nmodel compared with baseline methods.\n","authors":["Xiuyuan Qin","Huanhuan Yuan","Pengpeng Zhao","Guanfeng Liu","Fuzhen Zhuang","Victor S. Sheng"],"pdf_url":"https://arxiv.org/pdf/2310.14318v3.pdf","comment":"10pages, 5figures, WSDM2024. arXiv admin note: text overlap with\n arXiv:2304.07763"}],"Multimedia":[{"id":"http://arxiv.org/abs/2311.15080v1","updated":"2023-11-25T17:18:35Z","published":"2023-11-25T17:18:35Z","title":"Weakly-Supervised Audio-Visual Segmentation","summary":" Audio-visual segmentation is a challenging task that aims to predict\npixel-level masks for sound sources in a video. Previous work applied a\ncomprehensive manually designed architecture with countless pixel-wise accurate\nmasks as supervision. However, these pixel-level masks are expensive and not\navailable in all cases. 
In this work, we aim to simplify the supervision as the\ninstance-level annotation, i.e., weakly-supervised audio-visual segmentation.\nWe present a novel Weakly-Supervised Audio-Visual Segmentation framework,\nnamely WS-AVS, that can learn multi-scale audio-visual alignment with\nmulti-scale multiple-instance contrastive learning for audio-visual\nsegmentation. Extensive experiments on AVSBench demonstrate the effectiveness\nof our WS-AVS in the weakly-supervised audio-visual segmentation of\nsingle-source and multi-source scenarios.\n","authors":["Shentong Mo","Bhiksha Raj"],"pdf_url":"https://arxiv.org/pdf/2311.15080v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14977v1","updated":"2023-11-25T09:38:24Z","published":"2023-11-25T09:38:24Z","title":"Incorporating granularity bias as the margin into contrastive loss for\n video captioning","summary":" Video captioning models easily suffer from long-tail distribution of phrases,\nwhich makes captioning models prone to generate vague sentences instead of\naccurate ones. However, existing debiasing strategies tend to export external\nknowledge to build dependency trees of words or refine frequency distribution\nby complex losses and extra input features, which lack interpretability and are\nhard to train. To mitigate the impact of granularity bias on the model, we\nintroduced a statistical-based bias extractor. This extractor quantifies the\ninformation content within sentences and videos, providing an estimate of the\nlikelihood that a video-sentence pair is affected by granularity bias.\nFurthermore, with the growing trend of integrating contrastive learning methods\ninto video captioning tasks, we use a bidirectional triplet loss to get more\nnegative samples in a batch. Subsequently, we incorporate the margin score into\nthe contrastive learning loss, establishing distinct training objectives for\nhead and tail sentences. This approach facilitates the model's training\neffectiveness on tail samples. Our simple yet effective loss, incorporating\nGranularity bias, is referred to as the Margin-Contrastive Loss (GMC Loss). The\nproposed model demonstrates state-of-the-art performance on MSRVTT with a CIDEr\nof 57.17, and MSVD, where CIDEr reaches up to 138.68.\n","authors":["Jiayang Gu","Fengming Yao"],"pdf_url":"https://arxiv.org/pdf/2311.14977v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2311.08172v2","updated":"2023-11-25T07:59:48Z","published":"2023-11-14T14:02:32Z","title":"Vision-Language Instruction Tuning: A Review and Analysis","summary":" Instruction tuning is a crucial supervised training phase in Large Language\nModels (LLMs), aiming to enhance the LLM's ability to generalize instruction\nexecution and adapt to user preferences. With the increasing integration of\nmulti-modal data into LLMs, there is growing interest in Vision-Language\nInstruction Tuning (VLIT), which presents more complex characteristics compared\nto pure text instruction tuning. In this paper, we systematically review the\nlatest VLIT settings and corresponding datasets in multi-modal LLMs and provide\ninsights into the intrinsic motivations behind their design. For the first\ntime, we offer a detailed multi-perspective categorization for existing VLIT\ndatasets and identify the characteristics that high-quality VLIT data should\npossess. 
By incorporating these characteristics as guiding principles into the\nexisting VLIT data construction process, we conduct extensive experiments and\nverify their positive impact on the performance of tuned multi-modal LLMs.\nFurthermore, we discuss the current challenges and future research directions\nof VLIT, providing insights for the continuous development of this field. The\ncode and dataset related to this paper have been open-sourced at\nhttps://github.com/palchenli/VL-Instruction-Tuning.\n","authors":["Chen Li","Yixiao Ge","Dian Li","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2311.08172v2.pdf","comment":"34 pages, 6 figures"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 0000000000000000000000000000000000000000..7f5166c7afa0cda370aafaf91ba8d66cdeff74e5 GIT binary patch literal 15086 zcmeHO33yaRwyq9g34stc4G>5efuJa|8H50X3WB1d4n)8Z5JVvktB7$yKtv!Kb`lau zdP!$Nc9B&OSrq|+r=TLJ;K~kR2@vF;|J<9Kba%RAH(}4!8Q=syhFvI(6#Q zsX{4}Dx;b;Q+$T2oQ6t8Dy7213w{SH^#k7p^C{m4`z!S>3p8dKR#E*)@?QIEpg)}c zg{r6iyXi3T|KFt>>TsDWWe%JEGeI;m)t_ z#K0v35xKK9{I2==#4YqlUA%3Xh?MkH#4d|P#d8&Xs_&s!ylR8}jrLpHV^;bsWZSYa zH!TUx_B8jZuJHA{>W61Pj6tR~6PcNrNKa44y2nHqn=Z;*^b+JZFPz5nh)ES%qX=#z&q(>zlfC!?&@YSrplEEcre-mb8` zuXubZFMZU1X@6u`GYT;qc;sp5g4h(J-Ri$pM?zXcp{_ZWm%PAnhXUA$>^4n&yX>&2hmV*Ry0tB zDc2*Jr;1N^DfmP%o?7)3+HY@<`rp+@r%jzP%QCA_hD^#Z(KZo(JM=fG&)C8vq&l}j zJwF&~<0nw3PebMBv;9AHx_+HHM>2k2y$bcquTUQ>g6h^CsupeGV1<^;S|Zt!?1a7Z z#?J81^LLBW9d_fL={n^r&=JYsxAQ)IUfZ*@bwK9T6E8jvRhMmd?FO}(eVmu4AjvwHnm*c+SZaI_{G2dio}E$k=@I4b)RlA>{TDjCf@P9^3fXP4&AYX4kyP&|*&u zL_Z&m!0N<4WeU`#OS-PO!zb88j|}~hyr;2|GQa;071I}ibplcL)3QG6j4NJuzfD_B zC=*%^D*iPcyDJ`}Kd)TT$K}u=sD1mO_U@%EJplFd&rlaHy4N$2?^n)?N2!-$3x0El zpcL=UvTj#WH?}W2Bm5luU4G~0LM>eiHE+x9X!aoTcDDYxjsAu_3X1y$Bwy|VK(IpqYlX$aVwJaeLK?Nm$>FoH>BT1SA+z+ zU=vI)(6%2wtjj0%MZP(b^zV%uI__S*bQz3sFxr#nAAdmI&v6?@p6=F4Uu9a$c0#M_ zYms0O{AbSSdL;Y>%a9?ohv$p;r=yM;d1*uX{*gzhC!9-CPjph+lrr*t+6=DYwBtv~ zyR_+Lw$R}Ly?yB);gOO8)vups_cUD>9g)5^F#gq3Fz(vLe!d?nI$Cc_+LU_I&i?)M zIch^KE+zVltlyC|I^G%I@#DH)3(vSXsLPkV$8N|bbwuZ+4Uu2klyA~U=gyHYb#inm z@gHOTMt<~hZ2FpM@D?7T<1$AF4AA)*-@JVaMzK}WhO}jjts%pUoNtelKmDVdPWxH2 zUPby@A3Nh09x~3tJ0qiLUVDpO%84zIy3&TL?uk4TCquO+|J<8Kuls0W!BE?_7q^=R zR>LM4G8ykZJsq(+)^#i|_{CpsPVA>jf&X*X4WnPYb(?4W24BGk+NDH$2J`E zf`8mZuHWQ;fpoJ;{E)k7hv&^N8NXnQkB3QdfAN5etrc7{H)-GHn^uNpi|M>0e#!TL zUf<(*vuE`rpX~9(?-?@G**>`PqAzOd-d)F5zrMZ>JLy9R#yRq)UYk01*0I&Dt@{+N_~~bu_)WvlvL5G&ri-7^ z->MF^<`&@J!8Sr^LszWytV3LjOg($**cvs`*CSW_T%%0(R| z7fJph+p5D@i1_zn8yvAoUifk!XnKY+uH-m5_PtS7-tn7OM)r*E%1GHa#zNgmoALcE z#4q!>Kk2^KMLx2D%Xms3%l^vv?dd6HJdB}Qx1PEh0yX;DK9ZoqOLJHETi*8l?M+!q*#o zC6zI-cj$nK1`W|ZyFL7`-F)mM@LV85j)p*DxN;%mZkENXqy&csb zsD_E}O+J!}{T|s1i|rJAB99{pc8M?U{DQvC+q9AQaBsmro~7V_${$@ebz$tS zD0VAxxY*^fnl6;TP##r}S4F+U^$_|)D9TLDq~u-ur`m{J5sTMs zuVdouiO8@OoM~_@pI-BHF}c0L>wncBZML_?6w4IMOrMFU?NJb4z)d@OeCL^Ns63wI}D zBv}ESDELfpe#mbj`F_{^oZh>Z^W}G7ZeV`Y?soZMN5fs)bg~^4FI2?vWy3K&V>+50 zU`*>4B=PI|ujbte-Xj>la6GD=q@VASBzQbE(Mou3Mu)rQ&jISBtLywuzSE(YL* zRWHzQDO&Hv*NR_EzfbO-(Ql3ZI2xH2{XUVbt8zbPa`t3YJ<0PO*Cc+F#GZd3NgVut zNOB${@er3J{oZq9PuOfW&DRq@LnzyljUgWmDclQ1?vKPg+dRz&)b3@`ACx*V>tj&< zQh{G_jjc<}=nbyZ(YJA*)|_Wd4{~4Ik$L+3y{kb@W`7DE$|U3Y$hJq3Zlm8!pKa`- zv1q-oHJX3j0-ZkZ?7)E-Ue)=q) z1McS8?eE9wOY)5BEYBN$`Hivk&!HwUHvc$vb{y}ht!;-idz!|3=!&7Jc7pgyNTLgZ zL&}654a0gZHLP z#fT3_pz0|%<5&U~4Z|;Ccy3ZZ)RDUXWm zuW1r%$T_63N0z;(oDoczz`f8=o@319zR0eVoPCet-frwzJsvA%1%sSn_Upy_AU<-J z`Se6X?t8y0>UX)zFl-nU@1AMxy7s@^n~*a*Gj(PbecRHl3 z)RAxUQWpboZ1o5#BsAxUFp)gfC+3&x=I=7_IiV()^N6pLIfV24e(_kM$kW2W1{+TyObG 
z%18SuI2`rMz##ABo2)s!CwhC^NWA&#Ye>jRK*XU4w{bZGSA|Oz&~HsYEykEkKg?pY zXufJvMiN@@Z_T@=EZKu=$l!rI`}>%4?++b|p?{Y+CP#nP?M%$mP|pP*d{r3U&v{>q zi@lfm9`4_JKPsK8gz6`1fO`u_9Kqm!&w+bjW;{vaQU+P+dv)2-r6{&=nx(CzzLj}D zrv-UL^G+<+sG)xM6 z?TJFFEmf0C{C}YwOAfki>*iPzQOx^KY$zohMj$PFW zCBJEte(!S2disF0r>^Ov+jX9@#tC1!$1G3B{B^EFJGawD(vNmcn=A5eJ=^~ChPFSD z`yJZd2HtPb^0MEMZ)+A3XVDr_*beuF%KQ@h`>O7b$(~E@NRv#Gm$SfJ`dkb8=(^#^ zpZemTUj}!jMxoMTuBTTxc8|>VvOmu-_V5+=E%E4@&Z01YYUJsU+fLnv2}>u?uI6Cm+L8KM zAc2-+N1Mj9EDb0eKE% zBWGqVy3+V)V + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 58 + +
+
+
+ + ☆ How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for + Vision LLMs SC + + +
+ This work focuses on the potential of Vision LLMs (VLLMs) in visual +reasoning. Different from prior studies, we shift our focus from evaluating +standard performance to introducing a comprehensive safety evaluation suite, +covering both out-of-distribution (OOD) generalization and adversarial +robustness. For the OOD evaluation, we present two novel VQA datasets, each +with one variant, designed to test model performance under challenging +conditions. In exploring adversarial robustness, we propose a straightforward +attack strategy for misleading VLLMs to produce visual-unrelated responses. +Moreover, we assess the efficacy of two jailbreaking strategies, targeting +either the vision or language component of VLLMs. Our evaluation of 21 diverse +models, ranging from open-source VLLMs to GPT-4V, yields interesting +observations: 1) Current VLLMs struggle with OOD texts but not images, unless +the visual information is limited; and 2) These VLLMs can be easily misled by +deceiving vision encoders only, and their vision-language training often +compromise safety protocols. We release this safety evaluation suite at +https://github.com/UCSC-VLAA/vllm-safety-benchmark. + +
+
+ comment: H.T., C.C., and Z.W. contribute equally. Work done during H.T. and + Z.W.'s internship at UCSC, and C.C. and Y.Z.'s internship at UNC +
+
+
+
+
+ + ☆ DUnE: Dataset for Unified Editing EMNLP 2023 + + +
+ Even the most advanced language models remain susceptible to errors, +necessitating the ability to modify these models without initiating a comprehensive +retraining process. Model editing refers to the modification of a model's +knowledge or representations in a manner that produces the desired outcomes. +Prior research primarily centered around editing factual data, e.g. "Messi plays +for Inter Miami", confining the definition of an edit to a knowledge triplet, +i.e. (subject, object, relation). However, as the applications of language +models expand, so do the diverse ways in which we wish to edit and refine their +outputs. In this study, we broaden the scope of the editing problem to include +an array of editing cases, such as debiasing and rectifying reasoning errors, and +define an edit as any natural language expression that solicits a change in the +model's outputs. We introduce DUnE, an editing benchmark where edits are +natural language sentences, and propose that DUnE presents a challenging yet +relevant task. To substantiate this claim, we conduct an extensive series of +experiments testing various editing approaches to address DUnE, demonstrating +their respective strengths and weaknesses. We show that retrieval-augmented +language modeling can outperform specialized editing techniques, and that neither set +of approaches has fully solved the generalized editing problem covered by our +benchmark. + +
+
+ comment: Accepted at EMNLP 2023 +
+
+
+
+
+ + ☆ BERT Goes Off-Topic: Investigating the Domain Transfer Challenge using + Genre Classification EMNLP'2023 + + +
+ While performance of many text classification tasks has been recently +improved due to Pre-trained Language Models (PLMs), in this paper we show that +they still suffer from a performance gap when the underlying distribution of +topics changes. For example, a genre classifier trained on political +topics often fails when tested on documents about sport or +medicine. In this work, we quantify this phenomenon empirically with a +large corpus and a large set of topics. Consequently, we verify that domain +transfer remains challenging both for classic PLMs, such as BERT, and for +modern large models, such as GPT-3. We also suggest and successfully test a +possible remedy: after augmenting the training dataset with +topically-controlled synthetic texts, the F1 score improves by up to 50% for +some topics, nearing on-topic training results, while others show little to no +improvement. While our empirical results focus on genre classification, our +methodology is applicable to other classification tasks such as gender, +authorship, or sentiment classification. The code and data to replicate the +experiments are available at https://github.com/dminus1/genre + +
+
+ comment: Published at EMNLP'2023 +
+
+
+
+
+ + ☆ MEDITRON-70B: Scaling Medical Pretraining for Large Language Models + + +
+ Large language models (LLMs) can potentially democratize access to medical +knowledge. While many efforts have been made to harness and improve LLMs' +medical knowledge and reasoning capacities, the resulting models are either +closed-source (e.g., PaLM, GPT-4) or limited in scale (<= 13B parameters), +which restricts their abilities. In this work, we improve access to large-scale +medical LLMs by releasing MEDITRON: a suite of open-source LLMs with 7B and 70B +parameters adapted to the medical domain. MEDITRON builds on Llama-2 (through +our adaptation of Nvidia's Megatron-LM distributed trainer), and extends +pretraining on a comprehensively curated medical corpus, including selected +PubMed articles, abstracts, and internationally-recognized medical guidelines. +Evaluations using four major medical benchmarks show significant performance +gains over several state-of-the-art baselines before and after task-specific +finetuning. Overall, MEDITRON achieves a 6% absolute performance gain over the +best public baseline in its parameter class and 3% over the strongest baseline +we finetuned from Llama-2. Compared to closed-source LLMs, MEDITRON-70B +outperforms GPT-3.5 and Med-PaLM and is within 5% of GPT-4 and 10% of +Med-PaLM-2. We release our code for curating the medical pretraining corpus and +the MEDITRON model weights to drive open-source development of more capable +medical LLMs. + +
+
+
+
+
+ + ☆ BioLORD-2023: Semantic Textual Representations Fusing LLM and Clinical + Knowledge Graph Insights + + +
+ In this study, we investigate the potential of Large Language Models to +complement biomedical knowledge graphs in the training of semantic models for +the biomedical and clinical domains. Drawing on the wealth of the UMLS +knowledge graph and harnessing cutting-edge Large Language Models, we propose a +new state-of-the-art approach for obtaining high-fidelity representations of +biomedical concepts and sentences, consisting of three steps: an improved +contrastive learning phase, a novel self-distillation phase, and a weight +averaging phase. Through rigorous evaluations via the extensive BioLORD testing +suite and diverse downstream tasks, we demonstrate consistent and substantial +performance improvements over the previous state of the art (e.g. +2pts on +MedSTS, +2.5pts on MedNLI-S, +6.1pts on EHR-Rel-B). Besides our new +state-of-the-art biomedical model for English, we also distill and release a +multilingual model compatible with 50+ languages and finetuned on 7 European +languages. Many clinical pipelines can benefit from our latest models. Our new +multilingual model enables a range of languages to benefit from our +advancements in biomedical semantic representation learning, opening a new +avenue for bioinformatics researchers around the world. As a result, we hope to +see BioLORD-2023 becoming a precious tool for future biomedical applications. + +
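As a rough illustration of the contrastive phase mentioned above, the sketch below shows a generic in-batch InfoNCE objective over paired concept-name and definition embeddings; the pairing scheme, temperature, and embedding size are assumptions for illustration, not BioLORD-2023's actual training recipe.

```python
# Minimal InfoNCE-style contrastive step over (concept name, definition) pairs.
# Encoder outputs are faked with random tensors; all hyperparameters are assumed.
import torch
import torch.nn.functional as F

def info_nce(name_emb: torch.Tensor, def_emb: torch.Tensor, temperature: float = 0.05) -> torch.Tensor:
    """In-batch contrastive loss: the i-th name should match the i-th definition."""
    name_emb = F.normalize(name_emb, dim=-1)
    def_emb = F.normalize(def_emb, dim=-1)
    logits = name_emb @ def_emb.t() / temperature              # (B, B) similarity matrix
    targets = torch.arange(logits.size(0), device=logits.device)
    # Symmetric loss: names -> definitions and definitions -> names.
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))

# Toy usage with random "embeddings" standing in for encoder outputs.
names = torch.randn(8, 768, requires_grad=True)
defs = torch.randn(8, 768, requires_grad=True)
loss = info_nce(names, defs)
loss.backward()
print(float(loss))
```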
+
+ comment: Preprint of upcoming journal article +
+
+
+
+
+ + ☆ Sparsify-then-Classify: From Internal Neurons of Large Language Models + To Efficient Text Classifiers + + +
+ Among the many tasks that Large Language Models (LLMs) have revolutionized is +text classification. However, existing approaches for applying pretrained LLMs +to text classification predominantly rely on using single token outputs from +only the last layer of hidden states. As a result, they suffer from limitations +in efficiency, task-specificity, and interpretability. In our work, we +contribute an approach that uses all internal representations by employing +multiple pooling strategies on all activation and hidden states. Our novel +lightweight strategy, Sparsify-then-Classify (STC) first sparsifies +task-specific features layer-by-layer, then aggregates across layers for text +classification. STC can be applied as a seamless plug-and-play module on top of +existing LLMs. Our experiments on a comprehensive set of models and datasets +demonstrate that STC not only consistently improves the classification +performance of pretrained and fine-tuned models, but is also more efficient for +both training and inference, and is more intrinsically interpretable. + +
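A minimal sketch of the layer-wise pooling and sparsify-then-aggregate idea, assuming mean pooling, an L1-penalised selector per layer, and a final linear classifier; the actual STC pooling strategies and sparsification procedure may differ.

```python
# Sketch: pool hidden states from every layer, sparsify features per layer with
# an L1 model, then aggregate the selected features into a linear classifier.
import numpy as np
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
enc = AutoModel.from_pretrained("bert-base-uncased", output_hidden_states=True).eval()

def layer_features(texts):
    """Mean-pooled representation from each hidden layer: list of (n_texts, dim) arrays."""
    with torch.no_grad():
        batch = tok(texts, padding=True, truncation=True, return_tensors="pt")
        hidden = enc(**batch).hidden_states                    # tuple of (B, T, D) tensors
        mask = batch["attention_mask"].unsqueeze(-1)
        pooled = [(h * mask).sum(1) / mask.sum(1) for h in hidden]
    return [p.numpy() for p in pooled]

texts = ["great movie", "terrible plot", "wonderful acting", "boring and slow"]
labels = np.array([1, 0, 1, 0])

selected = []
for feats in layer_features(texts):
    l1 = LogisticRegression(penalty="l1", solver="liblinear", C=1.0).fit(feats, labels)
    cols = SelectFromModel(l1, prefit=True).transform(feats)   # sparsify this layer
    selected.append(cols if cols.shape[1] > 0 else feats)      # keep layer if nothing survives

clf = LogisticRegression(max_iter=1000).fit(np.hstack(selected), labels)  # aggregate layers
print(clf.score(np.hstack(selected), labels))
```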
+
+ comment: 23 pages, 5 figures, 8 tables Code available at + https://github.com/difanj0713/Sparsify-then-Classify +
+
+
+
+
+ + ☆ Efficient Pre-training for Localized Instruction Generation of Videos + + +
+ Procedural videos show step-by-step demonstrations of tasks like recipe +preparation. Understanding such videos is challenging, involving the precise +localization of steps and the generation of textual instructions. Manually +annotating steps and writing instructions is costly, which limits the size of +current datasets and hinders effective learning. Leveraging large but noisy +video-transcript datasets for pre-training can boost performance, but demands +significant computational resources. Furthermore, transcripts contain +irrelevant content and exhibit style variation compared to instructions written +by human annotators. To mitigate both issues, we propose a technique, +Sieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters +irrelevant transcripts and (ii) Swap enhances the quality of the text +instruction by automatically replacing the transcripts with human-written +instructions from a text-only recipe dataset. The curated dataset, three orders +of magnitude smaller than current web-scale datasets, enables efficient +training of large-scale models with competitive performance. We complement our +Sieve-\&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step +localization and instruction generation for procedural videos. When this model +is pre-trained on our curated dataset, it achieves state-of-the-art performance +in zero-shot and finetuning settings on YouCook2 and Tasty, while using a +fraction of the computational resources. + +
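A toy approximation of the two steps above, assuming simple TF-IDF similarity: the sieve drops transcript lines that resemble no human-written instruction, and the swap replaces the survivors with their nearest instruction. The threshold and the tiny corpora are invented for illustration and are not the paper's pipeline.

```python
# Toy Sieve-and-Swap with TF-IDF cosine similarity; threshold and data are assumed.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

transcripts = [
    "okay guys so now we just chop the onions really fine",
    "don't forget to like and subscribe to the channel",      # off-topic chatter
    "then we fry them in a pan with a bit of olive oil",
]
human_instructions = [
    "Finely chop the onions.",
    "Fry the onions in olive oil over medium heat.",
    "Season with salt and pepper.",
]

vec = TfidfVectorizer().fit(transcripts + human_instructions)
sim = cosine_similarity(vec.transform(transcripts), vec.transform(human_instructions))

curated = []
for row in sim:
    if row.max() < 0.2:                      # Sieve: drop lines unlike any instruction
        continue
    curated.append(human_instructions[row.argmax()])  # Swap: use nearest human instruction

print(curated)
```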
+
+
+
+
+ + ☆ A Quantitative Approach to Understand Self-Supervised Models as + Cross-lingual Feature Extractors + + +
+ In this work, we study the features extracted by English self-supervised +learning (SSL) models in cross-lingual contexts and propose a new metric to +predict the quality of feature representations. Using automatic speech +recognition (ASR) as a downstream task, we analyze the effect of model size, +training objectives, and model architecture on the models' performance as a +feature extractor for a set of typologically diverse corpora. We develop a +novel metric, the Phonetic-Syntax Ratio (PSR), to measure the phonetic and +syntactic information in the extracted representations using deep generalized +canonical correlation analysis. Results show that the contrastive loss in the +wav2vec2.0 objective facilitates more effective cross-lingual feature +extraction. There is a positive correlation between PSR scores and ASR +performance, suggesting that phonetic information extracted by monolingual SSL +models can be used for downstream tasks in cross-lingual settings. The proposed +metric is an effective indicator of the quality of the representations and can +be useful for model selection. + +
+
+ comment: 12 pages, 5 figures, 4 tables +
+
+
+
+
+ + ☆ Leveraging deep active learning to identify low-resource mobility + functioning information in public clinical notes + + +
+ Function is increasingly recognized as an important indicator of whole-person +health, although it receives little attention in clinical natural language +processing research. We introduce the first public annotated dataset +specifically on the Mobility domain of the International Classification of +Functioning, Disability and Health (ICF), aiming to facilitate automatic +extraction and analysis of functioning information from free-text clinical +notes. We utilize the National NLP Clinical Challenges (n2c2) research dataset +to construct a pool of candidate sentences using keyword expansion. Our active +learning approach, using query-by-committee sampling weighted by density +representativeness, selects informative sentences for human annotation. We +train BERT and CRF models, and use predictions from these models to guide the +selection of new sentences for subsequent annotation iterations. Our final +dataset consists of 4,265 sentences with a total of 11,784 entities, including +5,511 Action entities, 5,328 Mobility entities, 306 Assistance entities, and +639 Quantification entities. The inter-annotator agreement (IAA), averaged over +all entity types, is 0.72 for exact matching and 0.91 for partial matching. We +also train and evaluate common BERT models and state-of-the-art Nested NER +models. The best F1 scores are 0.84 for Action, 0.7 for Mobility, 0.62 for +Assistance, and 0.71 for Quantification. Empirical results demonstrate +promising potential of NER models to accurately extract mobility functioning +information from clinical text. The public availability of our annotated +dataset will facilitate further research to comprehensively capture functioning +information in electronic health records (EHRs). + +
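The selection step can be pictured with the small sketch below: a bootstrapped committee votes on unlabeled sentences, disagreement is measured by vote entropy, and candidates are weighted by a density (representativeness) term. The committee members, features, and weighting formula are illustrative assumptions rather than the study's exact configuration.

```python
# Query-by-committee sampling weighted by density representativeness (toy version).
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity

labeled = ["patient walks independently", "ambulates without difficulty",
           "needs assistance to stand", "requires a cane for walking"]
labels = np.array([0, 0, 1, 1])
pool = ["ambulates with a walker", "denies chest pain",
        "requires help transferring to chair", "blood pressure stable"]

vec = TfidfVectorizer().fit(labeled + pool)
X_lab, X_pool = vec.transform(labeled), vec.transform(pool)

rng = np.random.default_rng(0)
votes = np.zeros((len(pool), 2))
for _ in range(5):                                    # committee via bootstrap resampling
    idx = rng.integers(0, len(labels), len(labels))
    while len(set(labels[idx])) < 2:                  # ensure both classes in the resample
        idx = rng.integers(0, len(labels), len(labels))
    member = LogisticRegression().fit(X_lab[idx], labels[idx])
    votes[np.arange(len(pool)), member.predict(X_pool)] += 1

p = votes / votes.sum(axis=1, keepdims=True)
vote_entropy = -(p * np.log(np.clip(p, 1e-12, None))).sum(axis=1)  # committee disagreement
density = cosine_similarity(X_pool).mean(axis=1)                   # representativeness
scores = vote_entropy * density
print([pool[i] for i in np.argsort(-scores)[:2]])                  # sentences to annotate next
```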
+
+
+
+
+ + ☆ Tell2Design: A Dataset for Language-Guided Floor Plan Generation ACL2023 + + +
+ We consider the task of generating designs directly from natural language +descriptions, and consider floor plan generation as the initial research area. +Language conditional generative models have recently been very successful in +generating high-quality artistic images. However, designs must satisfy +different constraints that are not present in generating artistic images, +particularly spatial and relational constraints. We make multiple contributions +to initiate research on this task. First, we introduce a novel dataset, +Tell2Design (T2D), which contains more than 80k floor plan designs +associated with natural language instructions. Second, we propose a +Sequence-to-Sequence model that can serve as a strong baseline for future +research. Third, we benchmark this task with several text-conditional image +generation models. We conclude by conducting human evaluations on the generated +samples and providing an analysis of human performance. We hope our +contributions will propel the research on language-guided design generation +forward. + +
+
+ comment: Paper published in ACL2023; Area Chair Award; Best Paper Nomination +
+
+
+
+
+ + ☆ WorldSense: A Synthetic Benchmark for Grounded Reasoning in Large + Language Models + + +
+ We propose WorldSense, a benchmark designed to assess the extent to which +LLMs are consistently able to sustain tacit world models, by testing how they +draw simple inferences from descriptions of simple arrangements of entities. +Worldsense is a synthetic benchmark with three problem types, each with their +own trivial control, which explicitly avoids bias by decorrelating the abstract +structure of problems from the vocabulary and expressions, and by decorrelating +all problem subparts with the correct response. We run our benchmark on three +state-of-the-art chat-LLMs (GPT3.5, GPT4 and Llama2-chat) and show that these +models make errors even with as few as three objects. Furthermore, they have +quite heavy response biases, preferring certain responses irrespective of the +question. Errors persist even with chain-of-thought prompting and in-context +learning. Lastly, we show that while finetuning on similar problems does result +in substantial improvements -- within- and out-of-distribution -- the finetuned +models do not generalise beyond a constraint problem space. + +
+
+
+
+
+ + ☆ Data Generation for Post-OCR correction of Cyrillic handwriting + + +
+ This paper introduces a novel approach to post-Optical Character Recognition +Correction (POC) for handwritten Cyrillic text, addressing a significant gap in +current research methodologies. This gap is due to the lack of large text +corpora that provide OCR errors for further training of language-based POC +models, which are demanding in terms of corpus size. Our study primarily +focuses on the development and application of a synthetic handwriting +generation engine based on Bézier curves. Such an engine generates highly +realistic handwritten text in any quantity, which we utilize to create a +substantial dataset by transforming Russian text corpora sourced from the +internet. We apply a Handwritten Text Recognition (HTR) model to this dataset +to identify OCR errors, forming the basis for our POC model training. The +correction model is trained on a 90-symbol input context, utilizing a +pre-trained T5 architecture with a seq2seq correction task. We evaluate our +approach on HWR200 and School_notebooks_RU datasets as they provide significant +challenges in the HTR domain. Furthermore, POC can be used to highlight errors +for teachers when evaluating student performance. This can be done simply by +comparing sentences before and after correction, displaying differences in +text. Our primary contribution lies in the innovative use of Bézier curves +for Cyrillic text generation and subsequent error correction using a +specialized POC model. We validate our approach by presenting Word Accuracy +Rate (WAR) and Character Accuracy Rate (CAR) results, both with and without +post-OCR correction, using real open corpora of handwritten Cyrillic text. +These results, coupled with our methodology, are designed to be reproducible, +paving the way for further advancements in the field of OCR and handwritten +text analysis. Paper contributions can be found in +https://github.com/dbrainio/CyrillicHandwritingPOC + +
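A minimal sketch of the geometric core: sampling a cubic Bézier curve into jittered pen-stroke points. The control points and noise scale are made up, and the real engine's glyph and layout logic is not shown.

```python
# Sample a cubic Bezier curve into pen-stroke points with light jitter to
# imitate handwriting variation. Control points and noise scale are invented.
import numpy as np

def cubic_bezier(p0, p1, p2, p3, n=64):
    """Evaluate a cubic Bezier curve at n evenly spaced parameter values."""
    t = np.linspace(0.0, 1.0, n)[:, None]
    return ((1 - t) ** 3 * p0 + 3 * (1 - t) ** 2 * t * p1
            + 3 * (1 - t) * t ** 2 * p2 + t ** 3 * p3)

rng = np.random.default_rng(0)
# Rough control points for a single downstroke of a glyph (arbitrary units).
p0, p1, p2, p3 = map(np.array, ([0.0, 1.0], [0.2, 0.7], [0.1, 0.3], [0.4, 0.0]))
stroke = cubic_bezier(p0, p1, p2, p3) + rng.normal(scale=0.01, size=(64, 2))
print(stroke[:3])   # first few (x, y) points; rasterizing these renders the stroke
```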
+
+ comment: 17 pages, 27 figures, 6 tables, 26 references +
+
+
+
+
+ + ☆ YUAN 2.0: A Large Language Model with Localized Filtering-based + Attention + + +
+ In this work, the Localized Filtering-based Attention (LFA) is introduced to +incorporate prior knowledge of local dependencies of natural language into +Attention. Based on LFA, we develop and release Yuan 2.0, a large language +model with parameters ranging from 2.1 billion to 102.6 billion. A data +filtering and generation method is presented to build pretraining and +fine-tuning dataset in high quality. A distributed training method with +non-uniform pipeline parallel, data parallel, and optimizer parallel is +proposed, which greatly reduces the bandwidth requirements of intra-node +communication, and achieves good performance in large-scale distributed +training. Yuan 2.0 models display impressive ability in code generation, math +problem-solving, and chat compared with existing models. The latest version of +YUAN 2.0, including model weights and source code, is accessible at Github. + +
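The abstract does not define LFA precisely, so the sketch below only shows one plausible way to impose a local-dependency prior: a causal depthwise 1-D convolution applied before standard multi-head attention. Treat it as an assumed stand-in, not Yuan 2.0's actual formulation.

```python
# Hedged sketch: a causal depthwise 1-D convolution injects local mixing before
# multi-head attention. This is an assumed instantiation for illustration only.
import torch
import torch.nn as nn

class LocalThenAttention(nn.Module):
    def __init__(self, dim=256, heads=4, kernel=3):
        super().__init__()
        self.local = nn.Conv1d(dim, dim, kernel, groups=dim, padding=kernel - 1)  # depthwise
        self.attn = nn.MultiheadAttention(dim, heads, batch_first=True)

    def forward(self, x):                        # x: (batch, seq, dim)
        h = self.local(x.transpose(1, 2))        # depthwise conv over the time axis
        h = h[..., : x.size(1)].transpose(1, 2)  # trim right padding -> causal local mixing
        out, _ = self.attn(h, h, h)
        return out

x = torch.randn(2, 16, 256)
print(LocalThenAttention()(x).shape)             # torch.Size([2, 16, 256])
```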
+
+
+
+
+ + ☆ Increasing Coverage and Precision of Textual Information in Multilingual + Knowledge Graphs EMNLP 2023 + + +
+ Recent work in Natural Language Processing and Computer Vision has been using +textual information -- e.g., entity names and descriptions -- available in +knowledge graphs to ground neural models to high-quality structured data. +However, when it comes to non-English languages, the quantity and quality of +textual information are comparatively scarce. To address this issue, we +introduce the novel task of automatic Knowledge Graph Enhancement (KGE) and +perform a thorough investigation on bridging the gap in both the quantity and +quality of textual information between English and non-English languages. More +specifically, we: i) bring to light the problem of increasing multilingual +coverage and precision of entity names and descriptions in Wikidata; ii) +demonstrate that state-of-the-art methods, namely, Machine Translation (MT), +Web Search (WS), and Large Language Models (LLMs), struggle with this task; +iii) present M-NTA, a novel unsupervised approach that combines MT, WS, and +LLMs to generate high-quality textual information; and, iv) study the impact of +increasing multilingual coverage and precision of non-English textual +information in Entity Linking, Knowledge Graph Completion, and Question +Answering. As part of our effort towards better multilingual knowledge graphs, +we also introduce WikiKGE-10, the first human-curated benchmark to evaluate KGE +approaches in 10 languages across 7 language families. + +
+
+ comment: Camera ready for EMNLP 2023 +
+
+
+
+
+ + ☆ Knowledge Unlearning for LLMs: Tasks, Methods, and Challenges + + +
+ In recent years, large language models (LLMs) have spurred a new research +paradigm in natural language processing. Despite their excellent capability in +knowledge-based question answering and reasoning, their potential to retain +faulty or even harmful knowledge poses risks of malicious application. The +challenge of mitigating this issue and transforming these models into purer +assistants is crucial for their widespread applicability. Unfortunately, +Retraining LLMs repeatedly to eliminate undesirable knowledge is impractical +due to their immense parameters. Knowledge unlearning, derived from analogous +studies on machine unlearning, presents a promising avenue to address this +concern and is notably advantageous in the context of LLMs. It allows for the +removal of harmful knowledge in an efficient manner, without affecting +unrelated knowledge in the model. To this end, we provide a survey of knowledge +unlearning in the era of LLMs. Firstly, we formally define the knowledge +unlearning problem and distinguish it from related works. Subsequently, we +categorize existing knowledge unlearning methods into three classes: those +based on parameter optimization, parameter merging, and in-context learning, +and introduce details of these unlearning methods. We further present +evaluation datasets used in existing methods, and finally conclude this survey +by presenting the ongoing challenges and future directions. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Towards Vision Enhancing LLMs: Empowering Multimodal Knowledge Storage + and Sharing in LLMs + + +
+ Recent advancements in multimodal large language models (MLLMs) have achieved +significant multimodal generation capabilities, akin to GPT-4. These models +predominantly map visual information into language representation space, +leveraging the vast knowledge and powerful text generation abilities of LLMs to +produce multimodal instruction-following responses. We could term this method +LLMs for Vision, because it employs LLMs for visual-language +understanding, yet we observe that these MLLMs neglect the potential of harnessing +visual knowledge to enhance overall capabilities of LLMs, which could be +regarded as Vision Enhancing LLMs. In this paper, we propose an approach called +MKS2, aimed at enhancing LLMs through empowering Multimodal Knowledge Storage +and Sharing in LLMs. Specifically, we introduce the Modular Visual Memory, a +component integrated into the internal blocks of LLMs, designed to store +open-world visual information efficiently. Additionally, we present a soft +Mixtures-of-Multimodal Experts architecture in LLMs to invoke multimodal +knowledge collaboration during generation. Our comprehensive experiments +demonstrate that MKS2 substantially augments the reasoning capabilities of LLMs +in contexts necessitating physical or commonsense knowledge. It also delivers +competitive results on multimodal benchmarks. + +
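The soft expert mixture can be pictured with a generic soft mixture-of-experts feed-forward block, sketched below; the routing scheme, expert design, and the coupling to the Modular Visual Memory are assumptions, since the abstract gives no architectural details.

```python
# Generic soft mixture-of-experts feed-forward block: a router produces soft
# weights and the expert outputs are blended per token. Sizes are arbitrary.
import torch
import torch.nn as nn

class SoftMoE(nn.Module):
    def __init__(self, dim=512, n_experts=4, hidden=1024):
        super().__init__()
        self.router = nn.Linear(dim, n_experts)
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, dim))
            for _ in range(n_experts)
        )

    def forward(self, x):                                        # x: (batch, seq, dim)
        weights = torch.softmax(self.router(x), dim=-1)          # (batch, seq, n_experts)
        outs = torch.stack([e(x) for e in self.experts], dim=-1) # (batch, seq, dim, n_experts)
        return (outs * weights.unsqueeze(-2)).sum(dim=-1)        # soft blend of expert outputs

x = torch.randn(2, 8, 512)
print(SoftMoE()(x).shape)                                        # torch.Size([2, 8, 512])
```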
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ Italian Crossword Generator: Enhancing Education through Interactive + Word Puzzles + + +
+ Educational crosswords offer numerous benefits for students, including +increased engagement, improved understanding, critical thinking, and memory +retention. Creating high-quality educational crosswords can be challenging, but +recent advances in natural language processing and machine learning have made +it possible to use language models to generate nice wordplays. The exploitation +of cutting-edge language models like GPT3-DaVinci, GPT3-Curie, GPT3-Babbage, +GPT3-Ada, and BERT-uncased has led to the development of a comprehensive system +for generating and verifying crossword clues. A large dataset of clue-answer +pairs was compiled to fine-tune the models in a supervised manner to generate +original and challenging clues from a given keyword. On the other hand, for +generating crossword clues from a given text, Zero/Few-shot learning techniques +were used to extract clues from the input text, adding variety and creativity +to the puzzles. We employed the fine-tuned model to generate data and labeled +the acceptability of clue-answer parts with human supervision. To ensure +quality, we developed a classifier by fine-tuning existing language models on +the labeled dataset. Conversely, to assess the quality of clues generated from +the given text using zero/few-shot learning, we employed a zero-shot learning +approach to check the quality of generated clues. The results of the evaluation +have been very promising, demonstrating the effectiveness of the approach in +creating high-standard educational crosswords that offer students engaging and +rewarding learning experiences. + +
+
+ comment: Accepted Paper for CLiC-it 2023 - 9th Italian Conference on + Computational Linguistics +
+
+
+
+
+ + ☆ Justifiable Artificial Intelligence: Engineering Large Language Models + for Legal Applications + + +
+ In this work, I discuss how Large Language Models can be applied in the legal +domain, circumventing their current drawbacks. Despite their large success and +acceptance, their lack of explainability prevents legal experts from trusting +their output, and rightfully so. However, in this paper, I argue +in favor of a new view, Justifiable Artificial Intelligence, instead of +focusing on Explainable Artificial Intelligence. I discuss how +gathering evidence for and against a Large Language Model's output may make its +generated texts more trustworthy - or hold the model accountable for misinformation. + +
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Cerbero-7B: A Leap Forward in Language-Specific LLMs Through Enhanced + Chat Corpus Generation and Evaluation + + +
+ This study introduces a novel approach for generating high-quality, +language-specific chat corpora using a self-chat mechanism. We combine a +generator LLM for creating new samples and an embedder LLM to ensure diversity. +A new Masked Language Modelling (MLM) model-based quality assessment metric is +proposed for evaluating and filtering the corpora. Utilizing the llama2-70b as +the generator and a multilingual sentence transformer as embedder, we generate +an Italian chat corpus and refine the Fauno corpus, which is based on +translated English ChatGPT self-chat data. The refinement uses structural +assertions and Natural Language Processing techniques. Both corpora undergo a +comprehensive quality evaluation using the proposed MLM model-based quality +metric. The Italian LLM fine-tuned with these corpora demonstrates +significantly enhanced language comprehension and question-answering skills. +The resultant model, cerbero-7b, establishes a new state-of-the-art for Italian +LLMs. This approach marks a substantial advancement in the development of +language-specific LLMs, with a special emphasis on augmenting corpora for +underrepresented languages like Italian. + +
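One common way to turn a masked language model into a fluency or quality score is pseudo-log-likelihood, sketched below; whether cerbero-7b's MLM-based metric is defined exactly this way, and which MLM it uses, are assumptions made only for illustration.

```python
# Masked-LM quality score: mask each token in turn and average the log-probability
# the model assigns to the true token (pseudo-log-likelihood). Model choice is assumed.
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
mlm = AutoModelForMaskedLM.from_pretrained("bert-base-multilingual-cased").eval()

@torch.no_grad()
def mlm_quality(text: str) -> float:
    ids = tok(text, return_tensors="pt")["input_ids"][0]
    positions = range(1, len(ids) - 1)               # skip [CLS] and [SEP]
    total = 0.0
    for i in positions:
        masked = ids.clone()
        masked[i] = tok.mask_token_id
        logits = mlm(masked.unsqueeze(0)).logits[0, i]
        total += torch.log_softmax(logits, dim=-1)[ids[i]].item()
    return total / max(len(positions), 1)            # higher (closer to 0) = more fluent

print(mlm_quality("Roma è la capitale d'Italia."))
print(mlm_quality("Roma capitale la è d'Italia."))   # scrambled word order should score lower
```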
+
+
+
+
+ + ☆ MoDS: Model-oriented Data Selection for Instruction Tuning + + +
+ Instruction tuning has become the de facto method to equip large language +models (LLMs) with the ability of following user instructions. Usually, +hundreds of thousands or millions of instruction-following pairs are employed +to fine-tune the foundation LLMs. Recently, some studies show that a small +number of high-quality instruction data is enough. However, how to select +appropriate instruction data for a given LLM is still an open problem. To +address this problem, in this paper we present a model-oriented data selection +(MoDS) approach, which selects instruction data based on a new criteria +considering three aspects: quality, coverage and necessity. First, our approach +utilizes a quality evaluation model to filter out the high-quality subset from +the original instruction dataset, and then designs an algorithm to further +select from the high-quality subset a seed instruction dataset with good +coverage. The seed dataset is applied to fine-tune the foundation LLM to obtain +an initial instruction-following LLM. Finally, we develop a necessity +evaluation model to find out the instruction data which are performed badly in +the initial instruction-following LLM and consider them necessary instructions +to further improve the LLMs. In this way, we can get a small high-quality, +broad-coverage and high-necessity subset from the original instruction +datasets. Experimental results show that, the model fine-tuned with 4,000 +instruction pairs selected by our approach could perform better than the model +fine-tuned with the full original dataset which includes 214k instruction data. + +
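A compact sketch of the quality-then-coverage part of such a selection pipeline, assuming precomputed quality scores and instruction embeddings and using k-center greedy for coverage; the necessity stage and MoDS's concrete scoring models are omitted.

```python
# Keep instructions above a quality threshold, then pick a diverse seed set with
# k-center greedy over embeddings. Scores, embeddings, threshold and k are toys.
import numpy as np

rng = np.random.default_rng(0)
embeddings = rng.normal(size=(1000, 64))        # stand-in for instruction embeddings
quality = rng.uniform(size=1000)                # stand-in for quality-model scores

candidates = np.where(quality > 0.7)[0]         # step 1: high-quality subset

def k_center_greedy(X, k):
    """Greedy k-center: repeatedly add the point farthest from the selected set."""
    chosen = [0]
    dist = np.linalg.norm(X - X[0], axis=1)
    for _ in range(k - 1):
        nxt = int(dist.argmax())
        chosen.append(nxt)
        dist = np.minimum(dist, np.linalg.norm(X - X[nxt], axis=1))
    return chosen

seed = candidates[k_center_greedy(embeddings[candidates], k=50)]  # step 2: coverage
print(len(seed), seed[:10])
```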
+
+
+
+
+ + ☆ Reinforcement Learning from Diffusion Feedback: Q* for Image Search + + +
+ Large vision-language models are steadily gaining personalization +capabilities at the cost of fine-tuning or data augmentation. We present two +models for image generation using model-agnostic learning that align semantic +priors with generative capabilities. RLDF, or Reinforcement Learning from +Diffusion Feedback, is a singular approach for visual imitation through +prior-preserving reward function guidance. This employs Q-learning (with +standard Q*) for generation and follows a semantic-rewarded trajectory for +image search through finite encoding-tailored actions. The second proposed +method, noisy diffusion gradient, is optimization driven. At the root of both +methods is a special CFG encoding that we propose for continual semantic +guidance. Using only a single input image and no text input, RLDF generates +high-quality images over varied domains including retail, sports and +agriculture showcasing class-consistency and strong visual diversity. Project +website is available at https://infernolia.github.io/RLDF. + +
+
+
+
+
+ + ☆ InfoPattern: Unveiling Information Propagation Patterns in Social Media + + +
+ Social media play a significant role in shaping public opinion and +influencing ideological communities through information propagation. Our demo +InfoPattern centers on the interplay between language and human ideology. The +demo (Code: https://github.com/blender-nlp/InfoPattern ) is capable of: (1) red +teaming to simulate adversary responses from opposite ideology communities; (2) +stance detection to identify the underlying political sentiments in each +message; (3) information propagation graph discovery to reveal the evolution of +claims across various communities over time. (Live Demo: +https://incas.csl.illinois.edu/blender/About ) + +
+
+
+
+
+ + ☆ The WebCrow French Crossword Solver + + +
+ Crossword puzzles are one of the most popular word games, played in different +languages all across the world, where riddle style can vary significantly from +one country to another. Automated crossword resolution is challenging, and +typical solvers rely on large databases of previously solved crosswords. In +this work, we extend WebCrow 2.0, an automatic crossword solver, to French, +making it the first program for crossword solving in the French language. To +cope with the lack of a large repository of clue-answer crossword data, WebCrow +2.0 exploits multiple modules, called experts, that retrieve candidate answers +from heterogeneous resources, such as the web, knowledge graphs, and linguistic +rules. We compared WebCrow's performance against humans in two different +challenges. Despite the limited amount of past crosswords, French WebCrow was +competitive, actually outperforming humans in terms of speed and accuracy, thus +proving its capabilities to generalize to new languages. + +
+
+ comment: Accepted Paper for EAI Intetain 2023 - 14th EAI International + Conference on Intelligent Technologies for Interactive Entertainment +
+
+
+
+
+ + ☆ Injecting linguistic knowledge into BERT for Dialogue State Tracking + + +
+ Dialogue State Tracking (DST) models often employ intricate neural network +architectures, necessitating substantial training data, and their inference +processes lack transparency. This paper proposes a method that extracts +linguistic knowledge via an unsupervised framework and subsequently utilizes +this knowledge to augment BERT's performance and interpretability in DST tasks. +The knowledge extraction procedure is computationally economical and does not +necessitate annotations or additional training data. The injection of the +extracted knowledge necessitates the addition of only simple neural modules. We +employ the Convex Polytopic Model (CPM) as a feature extraction tool for DST +tasks and illustrate that the acquired features correlate with the syntactic +and semantic patterns in the dialogues. This correlation facilitates a +comprehensive understanding of the linguistic features influencing the DST +model's decision-making process. We benchmark this framework on various DST +tasks and observe a notable improvement in accuracy. + +
+
+
+
+
+ + ☆ FreeAL: Towards Human-Free Active Learning in the Era of Large Language + Models EMNLP 2023 + + +
+ Collecting high-quality labeled data for model training is notoriously +time-consuming and labor-intensive for various NLP tasks. While copious +solutions, such as active learning for small language models (SLMs) and +prevalent in-context learning in the era of large language models (LLMs), have +been proposed and alleviate the labeling burden to some extent, their +performances are still subject to human intervention. It is still underexplored +how to reduce the annotation cost in the LLMs era. To bridge this, we +revolutionize traditional active learning and propose an innovative +collaborative learning framework FreeAL to interactively distill and filter the +task-specific knowledge from LLMs. During collaborative training, an LLM serves +as an active annotator inculcating its coarse-grained knowledge, while a +downstream SLM is incurred as a student to filter out high-quality in-context +samples to feedback LLM for the subsequent label refinery. Extensive +experiments on eight benchmark datasets demonstrate that FreeAL largely +enhances the zero-shot performances for both SLM and LLM without any human +supervision. The code is available at https://github.com/Justherozen/FreeAL . + +
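The collaboration loop can be pictured with the skeleton below, where a trivial keyword heuristic stands in for the LLM annotator and a TF-IDF logistic regression plays the small model; the retained high-confidence examples take the role of in-context demonstrations fed back for the next round. Everything concrete here is an illustrative assumption, not FreeAL's implementation.

```python
# Collaborative annotate -> distil -> filter loop with a stub "LLM" annotator.
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

pool = ["loved every minute of it", "what a waste of time",
        "brilliant performances", "utterly boring", "great soundtrack",
        "terrible pacing and dull plot"]

def annotate(texts, demonstrations=None):
    """Stand-in for an LLM annotator (few-shot prompting with demos would go here)."""
    return np.array([0 if any(w in t for w in ("waste", "boring", "terrible", "dull")) else 1
                     for t in texts])

demos = []
for _ in range(2):                                           # a couple of collaboration rounds
    noisy = annotate(pool, demos)                            # LLM side: coarse labels
    vec = TfidfVectorizer().fit(pool)
    slm = LogisticRegression().fit(vec.transform(pool), noisy)   # SLM side: distil
    conf = slm.predict_proba(vec.transform(pool)).max(axis=1)
    keep = np.argsort(-conf)[:3]                             # high-confidence in-context samples
    demos = [(pool[i], int(noisy[i])) for i in keep]         # fed back to the annotator

print(demos)
```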
+
+ comment: Accepted to EMNLP 2023 (Main conference) +
+
+
+
+
+ + ☆ Can Vision-Language Models Think from a First-Person Perspective? + + +
+ Vision-language models (VLMs) have recently shown promising results in +traditional downstream tasks. Evaluation studies have emerged to assess their +abilities, with the majority focusing on the third-person perspective, and only +a few addressing specific tasks from the first-person perspective. However, the +capability of VLMs to "think" from a first-person perspective, a crucial +attribute for advancing autonomous agents and robotics, remains largely +unexplored. To bridge this research gap, we introduce EgoThink, a novel visual +question-answering benchmark that encompasses six core capabilities with twelve +detailed dimensions. The benchmark is constructed using selected clips from +egocentric videos, with manually annotated question-answer pairs containing +first-person information. To comprehensively assess VLMs, we evaluate eighteen +popular VLMs on EgoThink. Moreover, given the open-ended format of the answers, +we use GPT-4 as the automatic judge to compute single-answer grading. +Experimental results indicate that although GPT-4V leads in numerous +dimensions, all evaluated VLMs still possess considerable potential for +improvement in first-person perspective tasks. Meanwhile, enlarging the number +of trainable parameters has the most significant impact on model performance on +EgoThink. In conclusion, EgoThink serves as a valuable addition to existing +evaluation benchmarks for VLMs, providing an indispensable resource for future +research in the realm of embodied artificial intelligence and robotics. + +
+
+
+
+
+ + ☆ SpotServe: Serving Generative Large Language Models on Preemptible + Instances ASPLOS 2024 + + +
+ The high computational and memory requirements of generative large language +models (LLMs) make it challenging to serve them cheaply. This paper aims to +reduce the monetary cost for serving LLMs by leveraging preemptible GPU +instances on modern clouds, which offer accesses to spare GPUs at a much +cheaper price than regular instances but may be preempted by the cloud at any +time. Serving LLMs on preemptible instances requires addressing challenges +induced by frequent instance preemptions and the necessity of migrating +instances to handle these preemptions. + This paper presents SpotServe, the first distributed LLM serving system on +preemptible instances. Several key techniques in SpotServe realize fast and +reliable serving of generative LLMs on cheap preemptible instances. First, +SpotServe dynamically adapts the LLM parallelization configuration for dynamic +instance availability and fluctuating workload, while balancing the trade-off +among the overall throughput, inference latency and monetary costs. Second, to +minimize the cost of migrating instances for dynamic reparallelization, the +task of migrating instances is formulated as a bipartite graph matching +problem, which uses the Kuhn-Munkres algorithm to identify an optimal migration +plan that minimizes communications. Finally, to take advantage of the grace +period offered by modern clouds, we introduce stateful inference recovery, a +new inference mechanism that commits inference progress at a much finer +granularity and allows SpotServe to cheaply resume inference upon preemption. +We evaluate on real spot instance preemption traces and various popular LLMs +and show that SpotServe can reduce the P99 tail latency by 2.4 - 9.1x compared +with the best existing LLM serving systems. We also show that SpotServe can +leverage the price advantage of preemptive instances, saving 54% monetary cost +compared with only using on-demand instances. + +
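The migration-planning step described above reduces to an assignment problem, which the sketch below solves with the Hungarian (Kuhn-Munkres) algorithm via SciPy; the cost matrix is an invented stand-in for the communication cost of moving model state between instances, not SpotServe's actual cost model.

```python
# Cast instance migration as bipartite matching and solve it with Kuhn-Munkres.
import numpy as np
from scipy.optimize import linear_sum_assignment

# cost[i][j]: assumed communication cost of mapping old model slice i onto
# surviving or newly acquired instance j (lower = more state already in place).
cost = np.array([
    [0,  40, 55],
    [35,  5, 60],
    [50, 45, 10],
    [30, 25, 20],   # one more slice than instances, so one slice must be re-fetched
])
rows, cols = linear_sum_assignment(cost)                 # optimal minimum-cost matching
plan = {f"slice{r}": f"inst{c}" for r, c in zip(rows, cols)}
print(plan, "total cost:", cost[rows, cols].sum())
```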
+
+ comment: ASPLOS 2024 +
+
+
+
+
+ + ☆ Evaluating the Efficacy of Hybrid Deep Learning Models in Distinguishing + AI-Generated Text + + +
+ My research investigates the use of cutting-edge hybrid deep learning models +to accurately differentiate between AI-generated text and human writing. I +applied a robust methodology, utilising a carefully selected dataset comprising +AI and human texts from various sources, each tagged with instructions. +Advanced natural language processing techniques facilitated the analysis of +textual features. By combining sophisticated neural networks, the custom model +was able to detect nuanced differences between AI and human content. + +
+
+
+
+
+ + ☆ Boot and Switch: Alternating Distillation for Zero-Shot Dense Retrieval EMNLP 2023 + + +
+ Neural 'dense' retrieval models are state of the art for many datasets,
+however these models often exhibit limited domain transfer ability. Existing
+approaches to adaptation are unwieldy, such as requiring explicit supervision,
+complex model architectures, or massive external models. We present
+ABEL, a simple but effective unsupervised method to enhance passage
+retrieval in zero-shot settings. Our technique follows a straightforward loop:
+a dense retriever learns from supervision signals provided by a reranker, and
+subsequently, the reranker is updated based on feedback from the improved
+retriever. By iterating this loop, the two components mutually enhance one
+another's performance. Experimental results demonstrate that our unsupervised
+ABEL model outperforms both leading supervised and unsupervised
+retrievers on the BEIR benchmark. Meanwhile, it exhibits strong adaptation
+abilities to tasks and domains that were unseen during training. By either
+fine-tuning ABEL on labelled data or integrating it with existing
+supervised dense retrievers, we achieve state-of-the-art results. Source code
+is available at https://github.com/Fantabulous-J/BootSwitch.
+
+ comment: Accepted by EMNLP 2023 Findings +
+
+
+
+
+ + ☆ Noisy Self-Training with Synthetic Queries for Dense Retrieval EMNLP 2023 + + +
+ Although existing neural retrieval models reveal promising results when
+training data is abundant and the performance keeps improving as training data
+increases, collecting high-quality annotated data is prohibitively costly. To
+this end, we introduce a novel noisy self-training framework combined with
+synthetic queries, showing that neural retrievers can be improved in a
+self-evolution manner with no reliance on any external models. Experimental
+results show that our method improves consistently over existing methods on
+both general-domain (e.g., MS-MARCO) and out-of-domain (i.e., BEIR) retrieval
+benchmarks. Extra analysis on low-resource settings reveals that our method is
+data efficient and outperforms competitive baselines, with as little as 30% of
+labelled training data. Further extending the framework for reranker training
+demonstrates that the proposed method is general and yields additional gains on
+tasks of diverse domains. Source code is available at
+https://github.com/Fantabulous-J/Self-Training-DPR.
+
+ comment: Accepted by EMNLP 2023 Findings +
+
+
+
+
+ + ☆ Deficiency of Large Language Models in Finance: An Empirical Examination + of Hallucination + + +
+ The hallucination issue is recognized as a fundamental deficiency of large
+language models (LLMs), especially when applied to fields such as finance,
+education, and law. Despite the growing concerns, there has been a lack of
+empirical investigation. In this paper, we provide an empirical examination of
+LLMs' hallucination behaviors in financial tasks. First, we empirically
+investigate LLMs' ability to explain financial concepts and terminology.
+Second, we assess LLMs' capacity to query historical stock prices. Third, to
+alleviate the hallucination issue, we evaluate the efficacy of four practical
+methods: few-shot learning, Decoding by Contrasting Layers (DoLa), Retrieval
+Augmented Generation (RAG), and a prompt-based tool-learning method in which
+the model generates a query command for a function call. Finally, our major
+finding is that off-the-shelf LLMs experience serious hallucination behaviors
+in financial tasks. Therefore, there is an urgent need for research efforts
+to mitigate LLMs' hallucinations.
+
+
+
+
+ + ☆ The effect of source disclosure on evaluation of AI-generated messages: + A two-part study + + +
+ Advancements in artificial intelligence (AI) over the last decade demonstrate +that machines can exhibit communicative behavior and influence how humans +think, feel, and behave. In fact, the recent development of ChatGPT has shown +that large language models (LLMs) can be leveraged to generate high-quality +communication content at scale and across domains, suggesting that they will be +increasingly used in practice. However, many questions remain about how knowing +the source of the messages influences recipients' evaluation of and preference +for AI-generated messages compared to human-generated messages. This paper +investigated this topic in the context of vaping prevention messaging. In Study +1, which was pre-registered, we examined the influence of source disclosure on +people's evaluation of AI-generated health prevention messages compared to +human-generated messages. We found that source disclosure (i.e., labeling the +source of a message as AI vs. human) significantly impacted the evaluation of +the messages but did not significantly alter message rankings. In a follow-up +study (Study 2), we examined how the influence of source disclosure may vary by +the participants' negative attitudes towards AI. We found a significant +moderating effect of negative attitudes towards AI on message evaluation, but +not for message selection. However, for those with moderate levels of negative +attitudes towards AI, source disclosure decreased the preference for +AI-generated messages. Overall, the results of this series of studies showed a +slight bias against AI-generated messages once the source was disclosed, adding +to the emerging area of study that lies at the intersection of AI and +communication. + +
+
+ comment: Manuscript currently under review. Paper presented at 109th Annual + National Communication Association (NCA) Conference, November 16-19, 2023. 10 + pages, 5 figures +
+
+
+
+
+ + ☆ Overview of the VLSP 2022 -- Abmusu Shared Task: A Data Challenge for + Vietnamese Abstractive Multi-document Summarization SP 2022 + + +
+ This paper reports the overview of the VLSP 2022 - Vietnamese abstractive
+multi-document summarization (Abmusu) shared task for Vietnamese news. This
+task is hosted at the 9th annual workshop on Vietnamese Language and
+Speech Processing (VLSP 2022). The goal of the Abmusu shared task is to develop
+summarization systems that can automatically create abstractive summaries for
+a set of documents on a topic. The model input is multiple news documents on
+the same topic, and the corresponding output is a related abstractive summary.
+Within the scope of the Abmusu shared task, we focus on Vietnamese news
+summarization and build a human-annotated dataset of 1,839 documents in 600
+clusters, collected from Vietnamese news in 8 categories. Participating models
+are evaluated and ranked in terms of ROUGE2-F1 score, the most common
+evaluation metric for the document summarization problem.
+
+ comment: VLSP 2022 +
+
+
+
+
+ + ☆ A Comparative and Experimental Study on Automatic Question Answering + Systems and its Robustness against Word Jumbling + + +
+ Question-answer generation using Natural Language Processing models is
+ubiquitous in the world around us. It is used in many applications, such as
+building chatbots, suggesting prompts in Google search, and navigating
+information in mobile banking applications. It is highly relevant because a
+frequently asked questions (FAQ) list can only contain a finite number of
+questions, but a model that performs question-answer generation can answer
+completely new questions that are within the scope of the data. This allows
+new questions to be answered accurately as long as they are relevant. In
+commercial applications, it can be used to increase customer satisfaction and
+ease of use. However, much of this data is generated by humans, so it is
+susceptible to human error; such errors can adversely affect a model's
+performance, and we investigate this effect in our work.
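+ The abstract does not specify the exact perturbation, so the snippet below is a hedged sketch of one plausible word-jumbling corruption (randomly permuting the words of a question) that could be used to probe QA robustness; the function name and seeding are illustrative assumptions.
+```python
+import random
+
+def jumble_words(question: str, seed: int = 0) -> str:
+    """Randomly permute the words of a question to simulate jumbling errors."""
+    rng = random.Random(seed)
+    words = question.split()
+    rng.shuffle(words)
+    return " ".join(words)
+
+print(jumble_words("What is the minimum balance for a savings account?"))
+```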
+
+
+
+
+ + ☆ A Corpus for Named Entity Recognition in Chinese Novels with + Multi-genres + + +
+ Entities like person, location, organization are important for literary text
+analysis. The lack of annotated data hinders the progress of named entity
+recognition (NER) in the literary domain. To promote research on literary NER,
+we build the largest multi-genre literary NER corpus, containing 263,135
+entities in 105,851 sentences from 260 online Chinese novels spanning 13
+different genres. Based on the corpus, we investigate characteristics of
+entities from different genres. We propose several baseline NER models and
+conduct cross-genre and cross-domain experiments. Experimental results show
+that genre differences significantly impact NER performance, though not as
+much as domain differences, such as that between the literary and news
+domains. Compared with NER in the news domain, literary NER still needs much
+improvement, and the Out-of-Vocabulary (OOV) problem is more challenging due
+to the high variety of entities in literary works.
+
+
+
+
+ + ☆ Improving Word Sense Disambiguation in Neural Machine Translation with + Salient Document Context + + +
+ Lexical ambiguity is a challenging and pervasive problem in machine
+translation (MT). We introduce a simple and scalable approach to resolve
+translation ambiguity by incorporating a small amount of extra-sentential
+context in neural MT. Our approach requires no sense annotation and no change
+to standard model architectures. Since actual document context is not available
+for the vast majority of MT training data, we collect related sentences for
+each input to construct pseudo-documents. Salient words from pseudo-documents
+are then encoded as a prefix to each source sentence to condition the
+generation of the translation. To evaluate, we release DocMuCoW, a challenge
+set for translation disambiguation based on the English-German MuCoW
+(Raganato et al., 2020) augmented with document IDs. Extensive
+experiments show that our method translates ambiguous source words better than
+strong sentence-level baselines and comparable document-level baselines while
+reducing training costs.
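+ As a rough sketch of the input construction described above, the snippet below prefixes a source sentence with salient words drawn from a pseudo-document; the frequency-based salience score, the number of prefix words, and the separator token are assumptions for illustration, not the paper's exact recipe.
+```python
+from collections import Counter
+
+def build_prefixed_source(source: str, pseudo_doc: list[str], k: int = 5,
+                          sep: str = "<sep>") -> str:
+    """Prepend k salient pseudo-document words to the source sentence."""
+    src_words = set(source.lower().split())
+    counts = Counter(
+        w for sent in pseudo_doc for w in sent.lower().split()
+        if w.isalpha() and w not in src_words
+    )
+    salient = [w for w, _ in counts.most_common(k)]
+    return " ".join(salient + [sep, source])
+
+print(build_prefixed_source(
+    "The bank was closed.",
+    ["He deposited the cheque at the local branch.",
+     "Interest rates at the branch rose again."],
+))
+```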
+
+
+
+
+ + ☆ Function-constrained Program Synthesis NeurIPS + + +
+ This work introduces (1) a technique that allows large language models (LLMs) +to leverage user-provided code when solving programming tasks and (2) a method +to iteratively generate modular sub-functions that can aid future code +generation attempts when the initial code generated by the LLM is inadequate. +Generating computer programs in general-purpose programming languages like +Python poses a challenge for LLMs when instructed to use code provided in the +prompt. Code-specific LLMs (e.g., GitHub Copilot, CodeLlama2) can generate code +completions in real-time by drawing on all code available in a development +environment. However, restricting code-specific LLMs to use only in-context +code is not straightforward, as the model is not explicitly instructed to use +the user-provided code and users cannot highlight precisely which snippets of +code the model should incorporate into its context. Moreover, current systems +lack effective recovery methods, forcing users to iteratively re-prompt the +model with modified prompts until a sufficient solution is reached. Our method +differs from traditional LLM-powered code-generation by constraining +code-generation to an explicit function set and enabling recovery from failed +attempts through automatically generated sub-functions. When the LLM cannot +produce working code, we generate modular sub-functions to aid subsequent +attempts at generating functional code. A by-product of our method is a library +of reusable sub-functions that can solve related tasks, imitating a software +team where efficiency scales with experience. We also introduce a new +"half-shot" evaluation paradigm that provides tighter estimates of LLMs' coding +abilities compared to traditional zero-shot evaluation. Our proposed evaluation +method encourages models to output solutions in a structured format, decreasing +syntax errors that can be mistaken for poor coding ability. + +
+
+ comment: 17 pages, 6 figures, 2023 NeurIPS R0-Fomo Workshop +
+
+
+
+
+ + ☆ Optimizing and Fine-tuning Large Language Model for Urban Renewal + + +
+ This study aims to innovatively explore adaptive applications of large
+language models (LLM) in urban renewal. It also aims to improve their
+performance and text generation quality for knowledge question-answering (QA)
+tasks. Based on ChatGLM, we automatically generate QA datasets using urban
+renewal scientific literature corpora in a self-instruct manner and then
+conduct joint fine-tuning training on the model using the Prefix and LoRA
+fine-tuning methods to create an LLM for urban renewal. By guiding the LLM to
+automatically generate QA data based on prompt words and given text, it is
+possible to quickly obtain datasets in the urban renewal field and provide
+data support for the fine-tuning training of LLMs. The experimental results
+show that the joint fine-tuning training method proposed in this study can
+significantly improve the performance of the LLM on QA tasks. Compared with
+LoRA fine-tuning, the method improves the Bleu and Rouge metrics on the test
+set by about 5%; compared with the model before fine-tuning, the method
+improves the Bleu and Rouge metrics by about 15%-20%. This study demonstrates
+the effectiveness and superiority of the joint fine-tuning method using Prefix
+and LoRA for ChatGLM in urban renewal knowledge QA tasks. It provides a new
+approach for fine-tuning LLMs on urban renewal-related tasks.
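+ For readers unfamiliar with the setup, the sketch below shows the LoRA half of such a parameter-efficient fine-tuning configuration using the Hugging Face peft library; the checkpoint name, rank, and target module name are assumptions for illustration, and combining this with prefix tuning (as the paper does) is not shown.
+```python
+from transformers import AutoModel
+from peft import LoraConfig, TaskType, get_peft_model
+
+# Placeholder base checkpoint; the paper fine-tunes a ChatGLM model.
+base = AutoModel.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)
+
+lora_cfg = LoraConfig(
+    task_type=TaskType.CAUSAL_LM,
+    r=8,                     # low-rank dimension (assumed value)
+    lora_alpha=32,
+    lora_dropout=0.05,
+    target_modules=["query_key_value"],  # attention projection in ChatGLM-style blocks
+)
+model = get_peft_model(base, lora_cfg)
+model.print_trainable_parameters()       # only the LoRA matrices are trainable
+```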
+
+ comment: 11 pages, 2 figures, 2 tables, 41 references +
+
+
+
+
+ + ☆ Automatic Time Signature Determination for New Scores Using Lyrics for + Latent Rhythmic Structure + + +
+ There has recently been a sharp increase in interest in Artificial +Intelligence-Generated Content (AIGC). Despite this, musical components such as +time signatures have not been studied sufficiently to form an algorithmic +determination approach for new compositions, especially lyrical songs. This is +likely because of the neglect of musical details, which is critical for +constructing a robust framework. Specifically, time signatures establish the +fundamental rhythmic structure for almost all aspects of a song, including the +phrases and notes. In this paper, we propose a novel approach that only uses +lyrics as input to automatically generate a fitting time signature for lyrical +songs and uncover the latent rhythmic structure utilizing explainable machine +learning models. In particular, we devise multiple methods that are associated +with discovering lyrical patterns and creating new features that simultaneously +contain lyrical, rhythmic, and statistical information. In this approach, the +best of our experimental results reveal a 97.6% F1 score and a 0.996 Area Under +the Curve (AUC) of the Receiver Operating Characteristic (ROC) score. In +conclusion, our research directly generates time signatures from lyrics +automatically for new scores utilizing machine learning, which is an innovative +idea that approaches an understudied component of musicology and therefore +contributes significantly to the future of Artificial Intelligence (AI) music +generation. + +
+
+ comment: Submitted to IEEE Big Data 2023 Conference +
+
+
+
+
+ + ♻ ☆ Neuradicon: operational representation learning of neuroimaging reports + + +
+ Radiological reports typically summarize the content and interpretation of +imaging studies in unstructured form that precludes quantitative analysis. This +limits the monitoring of radiological services to throughput undifferentiated +by content, impeding specific, targeted operational optimization. Here we +present Neuradicon, a natural language processing (NLP) framework for +quantitative analysis of neuroradiological reports. Our framework is a hybrid +of rule-based and artificial intelligence models to represent neurological +reports in succinct, quantitative form optimally suited to operational +guidance. We demonstrate the application of Neuradicon to operational +phenotyping of a corpus of 336,569 reports, and report excellent +generalizability across time and two independent healthcare institutions. + +
+
+ comment: 26 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Evaluating the Robustness to Instructions of Large Language Models + + +
+ Recently, instruction fine-tuning has risen to prominence as a potential
+method for enhancing the zero-shot capabilities of Large Language Models (LLMs)
+on novel tasks. This technique has shown an exceptional ability to boost the
+performance of moderately sized LLMs, sometimes even reaching performance
+levels comparable to those of much larger model variants. The focus is on the
+robustness of instruction-tuned LLMs to seen and unseen tasks. We conducted an
+exploration of six models, including Alpaca, Vicuna, WizardLM, and traditional
+task-oriented models (Flan-T5-XL/XXL, T0++), using real-world relation
+extraction datasets as case studies. We carried out a comprehensive evaluation
+of these instruction-following LLMs, which have been tuned based on
+open-domain instructions and task-oriented instructions. The main discussion
+is their performance and robustness towards instructions. We have observed
+that in most cases, the model's performance in dealing with unfamiliar
+instructions tends to worsen significantly, and the robustness of the model
+for RE instructions deteriorates compared to QA. Further, we discovered that
+up until a certain parameter size threshold (3B), the performance of the
+FLAN-T5 model improves as the parameter count increases. The robustness of
+different scales of FLAN-T5 models to RE instructions is worse than the
+robustness to QA instructions.
+
+ comment: There were major problems with the experimental data +
+
+
+
+
+ + ♻ ☆ What If the TV Was Off? Examining Counterfactual Reasoning Abilities of + Multi-modal Language Models + + +
+ Counterfactual reasoning, a fundamental aspect of human cognition, involves
+contemplating alternatives to established facts or past events, significantly
+enhancing our abilities in planning and decision-making. In light of the
+advancements in current multi-modal large language models, we explore their
+effectiveness in counterfactual reasoning. To facilitate this investigation, we
+introduce a novel dataset, C-VQA, specifically designed to test the
+counterfactual reasoning capabilities of modern multi-modal large language
+models. This dataset is constructed by infusing original questions with
+counterfactual presuppositions, spanning various types such as numerical and
+boolean queries. It encompasses a mix of real and synthetic data, representing
+a wide range of difficulty levels. Our thorough evaluations of contemporary
+vision-language models using this dataset have revealed substantial performance
+drops, with some models showing up to a 40% decrease, highlighting a
+significant gap between current models and human-like vision reasoning
+capabilities. We hope our dataset will serve as a vital benchmark for
+evaluating the counterfactual reasoning capabilities of models. Code and
+dataset are publicly available at https://bzhao.me/C-VQA/.
+
+
+
+
+ + ♻ ☆ Average Token Delay: A Duration-aware Latency Metric for Simultaneous + Translation INTERSPEECH 2023 + + +
+ Simultaneous translation is a task in which the translation begins before the
+end of an input speech segment. Its evaluation should be conducted based on
+latency in addition to quality, and for users, the smallest possible amount of
+latency is preferable. Most existing metrics measure latency based on the start
+timings of partial translations and ignore their duration. This means such
+metrics do not penalize the latency caused by long translation output, which
+delays the comprehension of users and subsequent translations. In this work, we
+propose a novel latency evaluation metric for simultaneous translation called
+Average Token Delay (ATD) that focuses on the duration of partial
+translations. We demonstrate its effectiveness through analyses simulating
+user-side latency based on Ear-Voice Span (EVS). In our experiment, ATD had the
+highest correlation with EVS among baseline latency metrics under most
+conditions.
+
+ comment: Extended version of the paper (doi: 10.21437/Interspeech.2023-933) + which appeared in INTERSPEECH 2023 +
+
+
+
+
+ + ♻ ☆ Technical Report: Large Language Models can Strategically Deceive their + Users when Put Under Pressure + + +
+ We demonstrate a situation in which Large Language Models, trained to be +helpful, harmless, and honest, can display misaligned behavior and +strategically deceive their users about this behavior without being instructed +to do so. Concretely, we deploy GPT-4 as an agent in a realistic, simulated +environment, where it assumes the role of an autonomous stock trading agent. +Within this environment, the model obtains an insider tip about a lucrative +stock trade and acts upon it despite knowing that insider trading is +disapproved of by company management. When reporting to its manager, the model +consistently hides the genuine reasons behind its trading decision. We perform +a brief investigation of how this behavior varies under changes to the setting, +such as removing model access to a reasoning scratchpad, attempting to prevent +the misaligned behavior by changing system instructions, changing the amount of +pressure the model is under, varying the perceived risk of getting caught, and +making other simple changes to the environment. To our knowledge, this is the +first demonstration of Large Language Models trained to be helpful, harmless, +and honest, strategically deceiving their users in a realistic situation +without direct instructions or training for deception. + +
+
+
+
+
+ + ♻ ☆ Self-Evolution Learning for Mixup: Enhance Data Augmentation on Few-Shot + Text Classification Tasks + + +
+ Text classification tasks often encounter few-shot scenarios with limited
+labeled data, and addressing data scarcity is crucial. Data augmentation with
+mixup has been shown to be effective on various text classification tasks.
+However, most mixup methods do not consider the varying degree of learning
+difficulty in different stages of training, and they generate new samples with
+one-hot labels, resulting in model overconfidence. In this paper, we propose a
+self-evolution learning (SE) based mixup approach for data augmentation in
+text classification, which can generate more adaptive and model-friendly
+pseudo samples for model training. SE focuses on the variation of the model's
+learning ability. To alleviate model overconfidence, we introduce a novel
+instance-specific label smoothing approach, which linearly interpolates the
+model's output and the one-hot labels of the original samples to generate new
+soft labels for label mixing up. Through experimental analysis, in addition to
+improving classification accuracy, we demonstrate that SE also enhances the
+model's generalization ability.
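+ A minimal sketch of the instance-specific label smoothing step described above: the model's own (detached) prediction is linearly interpolated with the one-hot label before mixup. The interpolation weight and tensor shapes are assumptions for illustration, not the paper's exact schedule.
+```python
+import torch
+import torch.nn.functional as F
+
+def instance_specific_soft_labels(logits: torch.Tensor, labels: torch.Tensor,
+                                  lam: float = 0.1) -> torch.Tensor:
+    """Interpolate model predictions with one-hot labels.
+
+    logits: (batch, num_classes) model outputs for the original samples
+    labels: (batch,) integer class ids
+    lam:    weight on the model prediction (assumed value)
+    """
+    one_hot = F.one_hot(labels, num_classes=logits.size(-1)).float()
+    probs = logits.softmax(dim=-1).detach()
+    return lam * probs + (1.0 - lam) * one_hot
+
+# The resulting soft labels can then be mixed across paired samples, e.g.
+# target = alpha * soft_a + (1 - alpha) * soft_b, matching the mixed inputs.
+```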
+
+
+
+
+ + ♻ ☆ RCT Rejection Sampling for Causal Estimation Evaluation + + +
+ Confounding is a significant obstacle to unbiased estimation of causal +effects from observational data. For settings with high-dimensional covariates +-- such as text data, genomics, or the behavioral social sciences -- +researchers have proposed methods to adjust for confounding by adapting machine +learning methods to the goal of causal estimation. However, empirical +evaluation of these adjustment methods has been challenging and limited. In +this work, we build on a promising empirical evaluation strategy that +simplifies evaluation design and uses real data: subsampling randomized +controlled trials (RCTs) to create confounded observational datasets while +using the average causal effects from the RCTs as ground-truth. We contribute a +new sampling algorithm, which we call RCT rejection sampling, and provide +theoretical guarantees that causal identification holds in the observational +data to allow for valid comparisons to the ground-truth RCT. Using synthetic +data, we show our algorithm indeed results in low bias when oracle estimators +are evaluated on the confounded samples, which is not always the case for a +previously proposed algorithm. In addition to this identification result, we +highlight several finite data considerations for evaluation designers who plan +to use RCT rejection sampling on their own datasets. As a proof of concept, we +implement an example evaluation pipeline and walk through these finite data +considerations with a novel, real-world RCT -- which we release publicly -- +consisting of approximately 70k observations and text data as high-dimensional +covariates. Together, these contributions build towards a broader agenda of +improved empirical evaluation for causal estimation. + +
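+ The paper's exact sampling rule is not given in the abstract, so the snippet below is only a generic illustration of the underlying idea: starting from a toy RCT with a randomized treatment, units are kept or rejected with a probability that depends on a confounder, so that treatment and confounder become correlated in the retained "observational" sample. All variable names and acceptance probabilities are assumptions.
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+
+# Toy RCT: randomized binary treatment T, confounder-to-be C, outcome Y.
+n = 10_000
+C = rng.binomial(1, 0.5, n)
+T = rng.binomial(1, 0.5, n)                       # randomized in the RCT
+Y = rng.binomial(1, 0.2 + 0.2 * T + 0.1 * C)
+
+# Rejection step: keep each unit with a probability that depends on (T, C),
+# so T and C become correlated in the retained "observational" subsample.
+p_keep = np.where(T == C, 0.9, 0.3)
+keep = rng.uniform(size=n) < p_keep
+
+print("corr(T, C) before:", np.corrcoef(T, C)[0, 1].round(3))
+print("corr(T, C) after: ", np.corrcoef(T[keep], C[keep])[0, 1].round(3))
+```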
+
+ comment: Code and data at https://github.com/kakeith/rct_rejection_sampling +
+
+
+
+
+ + ♻ ☆ Sentiment analysis with adaptive multi-head attention in Transformer + + +
+ We propose a novel framework based on the attention mechanism to identify the
+sentiment of a movie review document. Previous efforts on deep neural networks
+with attention mechanisms focus on encoders and decoders with fixed numbers of
+multi-head attention heads. Therefore, we need a mechanism to stop the
+attention process automatically if no more useful information can be read from
+the memory. In this paper, we propose an adaptive multi-head attention
+architecture (AdaptAttn) which varies the number of attention heads based on
+sentence length. AdaptAttn has a data preprocessing step where each document
+is classified into one of three bins (small, medium, or large) based on
+sentence length. Documents classified as small go through two heads in each
+layer, the medium group passes through four heads, and the large group is
+processed by eight heads. We examine the merit of our model on the Stanford
+large movie review dataset. The experimental results show that the F1 score
+from our model is on par with the baseline model.
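+ A minimal sketch of the binning rule described above, assuming token-count thresholds for the small/medium/large bins (the actual boundaries are not stated in the abstract):
+```python
+def num_attention_heads(sentence: str, small_max: int = 20, medium_max: int = 50) -> int:
+    """Map a sentence to 2, 4, or 8 attention heads by length bin."""
+    n_tokens = len(sentence.split())
+    if n_tokens <= small_max:
+        return 2
+    if n_tokens <= medium_max:
+        return 4
+    return 8
+
+print(num_attention_heads("A short review."))              # -> 2
+print(num_attention_heads("A much longer review " * 20))   # -> 8
+```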
+
+ comment: Accepted by the 4th International Conference on Signal Processing and + Machine Learning +
+
+
+
+
+ + ♻ ☆ PACuna: Automated Fine-Tuning of Language Models for Particle + Accelerators + + +
+ Navigating the landscape of particle accelerators has become increasingly
+challenging with recent surges in contributions. These intricate devices
+challenge comprehension, even within individual facilities. To address this, we
+introduce PACuna, a fine-tuned language model refined through publicly
+available accelerator resources like conferences, pre-prints, and books. We
+automated data collection and question generation to minimize expert
+involvement and make the data publicly available. PACuna demonstrates
+proficiency in addressing intricate accelerator questions, validated by
+experts. Our approach shows that adapting language models to scientific
+domains by fine-tuning on technical texts and auto-generated corpora capturing
+the latest developments can produce models that answer intricate questions
+which commercially available assistants cannot, and that can serve as
+intelligent assistants for individual facilities.
+
+
+
+
+ + ♻ ☆ Empirical Study of PEFT techniques for Winter Wheat Segmentation + + +
+ Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced
+significant growth and have been extensively employed to adapt large vision and
+language models to various domains, enabling satisfactory model performance
+with minimal computational needs. Despite these advances, little research has
+explored potential PEFT applications in real-life scenarios, particularly in
+the critical domains of remote sensing and crop monitoring. The diversity of
+climates across different regions and the need for comprehensive large-scale
+datasets have posed significant obstacles to accurately identifying crop types
+across varying geographic locations and changing growing seasons. This study
+seeks to bridge this gap by comprehensively exploring the feasibility of
+cross-area and cross-year out-of-distribution generalization using the
+State-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to
+explore PEFT approaches for crop monitoring. Specifically, we focus on adapting
+the SOTA TSViT model to address winter wheat field segmentation, a critical
+task for crop monitoring and food security. This adaptation process involves
+integrating different PEFT techniques, including BigFit, LoRA, Adaptformer, and
+prompt tuning. Using PEFT techniques, we achieved notable results comparable to
+those achieved using full fine-tuning methods while training only 0.7% of the
+parameters of the whole TSViT architecture. The in-house labeled dataset,
+referred to as the Beqaa-Lebanon dataset, comprises high-quality annotated
+polygons for wheat and non-wheat classes with a total surface of 170 sq. km,
+over five consecutive years. Using Sentinel-2 images, our model achieved an
+84% F1-score. We intend to publicly release the Lebanese winter wheat dataset,
+code repository, and model weights.
+
+
+
+
+ + ♻ ☆ Large Language Models for Propaganda Detection + + +
+ The prevalence of propaganda in our digital society poses a challenge to
+societal harmony and the dissemination of truth. Detecting propaganda through
+NLP in text is challenging due to subtle manipulation techniques and contextual
+dependencies. To address this issue, we investigate the effectiveness of modern
+Large Language Models (LLMs) such as GPT-3 and GPT-4 for propaganda detection.
+We conduct experiments using the SemEval-2020 task 11 dataset, which features
+news articles labeled with 14 propaganda techniques as a multi-label
+classification problem. Five variations of GPT-3 and GPT-4 are employed,
+incorporating various prompt engineering and fine-tuning strategies across the
+different models. We evaluate the models' performance by assessing metrics such
+as F1 score, Precision, and Recall, comparing the results with the
+current state-of-the-art approach using RoBERTa. Our findings demonstrate that
+GPT-4 achieves comparable results to the current state-of-the-art. Further,
+this study analyzes the potential and challenges of LLMs in complex tasks like
+propaganda detection.
+
+
+
+
+ + ♻ ☆ Knowledge Graphs for the Life Sciences: Recent Developments, Challenges + and Opportunities + + +
+ The term life sciences refers to the disciplines that study living organisms +and life processes, and include chemistry, biology, medicine, and a range of +other related disciplines. Research efforts in life sciences are heavily +data-driven, as they produce and consume vast amounts of scientific data, much +of which is intrinsically relational and graph-structured. + The volume of data and the complexity of scientific concepts and relations +referred to therein promote the application of advanced knowledge-driven +technologies for managing and interpreting data, with the ultimate aim to +advance scientific discovery. + In this survey and position paper, we discuss recent developments and +advances in the use of graph-based technologies in life sciences and set out a +vision for how these technologies will impact these fields into the future. We +focus on three broad topics: the construction and management of Knowledge +Graphs (KGs), the use of KGs and associated technologies in the discovery of +new knowledge, and the use of KGs in artificial intelligence applications to +support explanations (explainable AI). We select a few exemplary use cases for +each topic, discuss the challenges and open research questions within these +topics, and conclude with a perspective and outlook that summarizes the +overarching challenges and their potential solutions as a guide for future +research. + +
+
+ comment: 33 pages, 1 figure, accepted for Transactions on Graph Data and + Knowledge (TGDK) +
+
+
+
+
+ + ♻ ☆ Towards Codable Watermarking for Injecting Multi-bit Information to LLM + + +
+ As large language models (LLMs) generate texts with increasing fluency and +realism, there is a growing need to identify the source of texts to prevent the +abuse of LLMs. Text watermarking techniques have proven reliable in +distinguishing whether a text is generated by LLMs by injecting hidden patterns +into the generated texts. However, we argue that existing watermarking methods +for LLMs are encoding-inefficient (only contain one bit of information - +whether it is generated from an LLM or not) and cannot flexibly meet the +diverse information encoding needs (such as encoding model version, generation +time, user id, etc.) in different LLMs application scenarios. In this work, we +conduct the first systematic study on the topic of Codable Text Watermarking +for LLMs (CTWL) that allows text watermarks to carry more customizable +information. First of all, we study the taxonomy of LLM watermarking technology +and give a mathematical formulation for CTWL. Additionally, we provide a +comprehensive evaluation system for CTWL: (1) watermarking success rate, (2) +robustness against various corruptions, (3) coding rate of payload information, +(4) encoding and decoding efficiency, (5) impacts on the quality of the +generated text. To meet the requirements of these non-Pareto-improving metrics, +we devise a CTWL method named Balance-Marking, based on the motivation of +ensuring that available and unavailable vocabularies for encoding information +have approximately equivalent probabilities. Compared to the random vocabulary +partitioning extended from the existing work, a probability-balanced vocabulary +partition can significantly improve the quality of the generated text. +Extensive experimental results have shown that our method outperforms a direct +baseline under comprehensive evaluation. + +
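+ To make the probability-balanced vocabulary partition idea concrete, the sketch below greedily splits a vocabulary into two sets of roughly equal probability mass under a proxy language model distribution; the greedy rule is an assumption for illustration, not necessarily the exact Balance-Marking procedure.
+```python
+import torch
+
+def balanced_vocab_partition(next_token_probs: torch.Tensor):
+    """Split token ids into two sets with approximately equal probability mass.
+
+    next_token_probs: (vocab_size,) probabilities from a proxy LM at this step.
+    Greedy heuristic: visit tokens in decreasing probability order and assign
+    each one to whichever set currently holds less mass.
+    """
+    order = torch.argsort(next_token_probs, descending=True)
+    sets = ([], [])
+    mass = [0.0, 0.0]
+    for idx in order.tolist():
+        side = 0 if mass[0] <= mass[1] else 1
+        sets[side].append(idx)
+        mass[side] += float(next_token_probs[idx])
+    return sets  # (available_ids, unavailable_ids), with mass[0] ~= mass[1]
+
+probs = torch.softmax(torch.randn(50_000), dim=-1)
+available, unavailable = balanced_vocab_partition(probs)
+```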
+
+
+
+
+ + ♻ ☆ Token-Level Adversarial Prompt Detection Based on Perplexity Measures + and Contextual Information + + +
+ In recent years, Large Language Models (LLM) have emerged as pivotal tools in +various applications. However, these models are susceptible to adversarial +prompt attacks, where attackers can carefully curate input strings that lead to +undesirable outputs. The inherent vulnerability of LLMs stems from their +input-output mechanisms, especially when presented with intensely +out-of-distribution (OOD) inputs. This paper proposes a token-level detection +method to identify adversarial prompts, leveraging the LLM's capability to +predict the next token's probability. We measure the degree of the model's +perplexity and incorporate neighboring token information to encourage the +detection of contiguous adversarial prompt sequences. As a result, we propose +two methods: one that identifies each token as either being part of an +adversarial prompt or not, and another that estimates the probability of each +token being part of an adversarial prompt. + +
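+ A minimal sketch of the per-token signal described above, using a small off-the-shelf causal LM to score each token's surprisal and smoothing over neighboring tokens before thresholding; the model choice, window size, and threshold are assumptions for illustration, not the paper's calibrated settings.
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+@torch.no_grad()
+def token_surprisal(text: str) -> list[float]:
+    """Per-token negative log-likelihood under a small causal LM."""
+    ids = tok(text, return_tensors="pt").input_ids
+    logits = lm(ids).logits
+    logprobs = logits[:, :-1].log_softmax(-1)
+    nll = -logprobs.gather(-1, ids[:, 1:, None]).squeeze(-1)[0]
+    return nll.tolist()
+
+def flag_adversarial_tokens(text: str, threshold: float = 8.0, window: int = 3) -> list[bool]:
+    """Smooth surprisal over neighboring tokens, then threshold it."""
+    s = token_surprisal(text)
+    flags = []
+    for i in range(len(s)):
+        lo, hi = max(0, i - window), min(len(s), i + window + 1)
+        flags.append(sum(s[lo:hi]) / (hi - lo) > threshold)
+    return flags
+```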
+
+
+
+
+ + ♻ ☆ Ring Attention with Blockwise Transformers for Near-Infinite Context + + +
+ Transformers have emerged as the architecture of choice for many +state-of-the-art AI models, showcasing exceptional performance across a wide +range of AI applications. However, the memory demands imposed by Transformers +limit their ability to handle long sequences, thereby posing challenges in +utilizing videos, actions, and other long-form sequences and modalities in +complex environments. We present a novel approach, Ring Attention with +Blockwise Transformers (Ring Attention), which leverages blockwise computation +of self-attention and feedforward to distribute long sequences across multiple +devices while fully overlapping the communication of key-value blocks with the +computation of blockwise attention. Our approach enables training and inference +of sequences that are up to device count times longer than those achievable by +prior memory-efficient Transformers, without resorting to approximations or +incurring additional communication and computation overheads. Extensive +experiments on language modeling and reinforcement learning tasks demonstrate +the effectiveness of our approach in allowing millions of tokens context size +and improving performance. + +
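+ The single-device building block of this approach is blockwise (online-softmax) attention, which never materializes the full sequence-by-sequence score matrix; a minimal single-head sketch is below. The block size is arbitrary, and the ring-style distribution of key-value blocks across devices is not shown.
+```python
+import torch
+
+def blockwise_attention(q, k, v, block: int = 1024):
+    """Exact attention over key/value blocks with a running softmax.
+
+    q, k, v: (seq_len, dim) float32 tensors for one head.
+    """
+    scale = q.size(-1) ** -0.5
+    out = torch.empty(q.size(0), v.size(-1))
+    for qs in range(0, q.size(0), block):
+        qb = q[qs:qs + block] * scale
+        acc = torch.zeros(qb.size(0), v.size(-1))
+        m = torch.full((qb.size(0),), float("-inf"))
+        denom = torch.zeros(qb.size(0))
+        for ks in range(0, k.size(0), block):
+            s = qb @ k[ks:ks + block].T                    # (bq, bk) block of scores
+            m_new = torch.maximum(m, s.max(dim=-1).values)
+            p = torch.exp(s - m_new[:, None])
+            rescale = torch.exp(m - m_new)                 # re-normalize previous blocks
+            denom = denom * rescale + p.sum(dim=-1)
+            acc = acc * rescale[:, None] + p @ v[ks:ks + block]
+            m = m_new
+        out[qs:qs + block] = acc / denom[:, None]
+    return out
+```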
+
+ comment: Code: https://github.com/lhao499/llm_large_context +
+
+
+
+
+ + ♻ ☆ TODM: Train Once Deploy Many Efficient Supernet-Based RNN-T Compression + For On-device ASR Models ICASSP 2024 + + +
+ Automatic Speech Recognition (ASR) models need to be optimized for specific
+hardware before they can be deployed on devices. This can be done by tuning the
+model's hyperparameters or exploring variations in its architecture.
+Re-training and re-validating models after making these changes can be a
+resource-intensive task. This paper presents TODM (Train Once Deploy Many), a
+new approach to efficiently train many sizes of hardware-friendly on-device ASR
+models with comparable GPU-hours to that of a single training job. TODM
+leverages insights from prior work on Supernet, where Recurrent Neural Network
+Transducer (RNN-T) models share weights within a Supernet. It reduces layer
+sizes and widths of the Supernet to obtain subnetworks, making them smaller
+models suitable for all hardware types. We introduce a novel combination of
+three techniques to improve the outcomes of the TODM Supernet: adaptive
+dropouts, an in-place Alpha-divergence knowledge distillation, and the use of
+the ScaledAdam optimizer. We validate our approach by comparing Supernet-trained
+versus individually tuned Multi-Head State Space Model (MH-SSM) RNN-T using
+LibriSpeech. Results demonstrate that our TODM Supernet either matches or
+surpasses the performance of manually tuned models by up to 3% relative in
+word error rate (WER), while efficiently keeping the cost of training many
+models at a small constant.
+
+ comment: Meta AI; Submitted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ WordArt Designer: User-Driven Artistic Typography Synthesis using Large + Language Models EMNLP 2023 + + +
+ This paper introduces WordArt Designer, a user-driven framework for artistic +typography synthesis, relying on the Large Language Model (LLM). The system +incorporates four key modules: the LLM Engine, SemTypo, StyTypo, and TexTypo +modules. 1) The LLM Engine, empowered by the LLM (e.g., GPT-3.5), interprets +user inputs and generates actionable prompts for the other modules, thereby +transforming abstract concepts into tangible designs. 2) The SemTypo module +optimizes font designs using semantic concepts, striking a balance between +artistic transformation and readability. 3) Building on the semantic layout +provided by the SemTypo module, the StyTypo module creates smooth, refined +images. 4) The TexTypo module further enhances the design's aesthetics through +texture rendering, enabling the generation of inventive textured fonts. +Notably, WordArt Designer highlights the fusion of generative AI with artistic +typography. Experience its capabilities on ModelScope: +https://www.modelscope.cn/studios/WordArt/WordArt. + +
+
+ comment: Accepted by EMNLP 2023, 10 pages, 11 figures, 1 table, the system is + at https://www.modelscope.cn/studios/WordArt/WordArt +
+
+
+
+
+ + ♻ ☆ LM-Cocktail: Resilient Tuning of Language Models via Model Merging + + +
+ Pre-trained language models are continually fine-tuned to better support
+downstream applications. However, this operation may result in significant
+performance degeneration on general tasks beyond the targeted domain. To
+overcome this problem, we propose a novel method which enables the fine-tuned
+model to stay resilient in general perspectives. Our method is conducted in the
+form of model merging (namely LM-Cocktail), where the fine-tuned language model
+is merged with the pre-trained base model or the peer models from other domains
+through weighted averaging. Despite its simplicity, LM-Cocktail is surprisingly
+effective: the resulting model is able to achieve a strong empirical performance
+in the whole scope of general tasks while preserving a superior capacity in its
+targeted domain. We conduct comprehensive experiments with the LLaMA and BGE
+models on popular benchmarks, including FLAN, MMLU, and MTEB, whose results
+validate the efficacy of our proposed method. The code and checkpoints are
+available at
+https://github.com/FlagOpen/FlagEmbedding/tree/master/LM_Cocktail.
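+ The core merging operation is a weighted average of parameter tensors across models that share one architecture; a minimal sketch is below. The 0.7/0.3 weights in the usage comment are placeholders, not values from the paper, and only floating-point parameters are assumed.
+```python
+import torch
+
+def merge_state_dicts(state_dicts, weights):
+    """Weighted average of parameter tensors from models with identical architectures."""
+    assert abs(sum(weights) - 1.0) < 1e-6, "merging weights should sum to 1"
+    merged = {}
+    for name, ref in state_dicts[0].items():
+        avg = sum(w * sd[name].to(torch.float32) for w, sd in zip(weights, state_dicts))
+        merged[name] = avg.to(ref.dtype)
+    return merged
+
+# e.g. merged = merge_state_dicts([finetuned.state_dict(), base.state_dict()], [0.7, 0.3])
+#      model.load_state_dict(merged)
+```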
+
+
+
+
+ + ♻ ☆ How to Prompt LLMs for Text-to-SQL: A Study in Zero-shot, Single-domain, + and Cross-domain Settings + + +
+ Large language models (LLMs) with in-context learning have demonstrated +remarkable capability in the text-to-SQL task. Previous research has prompted +LLMs with various demonstration-retrieval strategies and intermediate reasoning +steps to enhance the performance of LLMs. However, those works often employ +varied strategies when constructing the prompt text for text-to-SQL inputs, +such as databases and demonstration examples. This leads to a lack of +comparability in both the prompt constructions and their primary contributions. +Furthermore, selecting an effective prompt construction has emerged as a +persistent problem for future research. To address this limitation, we +comprehensively investigate the impact of prompt constructions across various +settings and provide insights into prompt constructions for future text-to-SQL +studies. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ Video-Bench: A Comprehensive Benchmark and Toolkit for Evaluating + Video-based Large Language Models + + +
+ Video-based large language models (Video-LLMs) have been recently introduced,
+targeting both fundamental improvements in perception and comprehension, and a
+diverse range of user inquiries. In pursuit of the ultimate goal of achieving
+artificial general intelligence, a truly intelligent Video-LLM model should not
+only see and understand the surroundings, but also possess human-level
+commonsense, and make well-informed decisions for the users. To guide the
+development of such a model, the establishment of a robust and comprehensive
+evaluation system becomes crucial. To this end, this paper proposes
+Video-Bench, a new comprehensive benchmark along with a toolkit
+specifically designed for evaluating Video-LLMs. The benchmark comprises 10
+meticulously crafted tasks, evaluating the capabilities of Video-LLMs across
+three distinct levels: Video-exclusive Understanding, Prior Knowledge-based
+Question-Answering, and Comprehension and Decision-making. In addition, we
+introduce an automatic toolkit tailored to process model outputs for various
+tasks, facilitating the calculation of metrics and generating convenient final
+scores. We evaluate 8 representative Video-LLMs using Video-Bench. The
+findings reveal that current Video-LLMs still fall considerably short of
+achieving human-like comprehension and analysis of real-world videos, offering
+valuable insights for future research directions. The benchmark and toolkit are
+available at https://github.com/PKU-YuanGroup/Video-Bench.
+
+ comment: Benchmark is available at + https://github.com/PKU-YuanGroup/Video-Bench +
+
+
+
+
+ + ☆ Test-time Adaptation of Discriminative Models via Diffusion Generative + Feedback NeurIPS 2023 + + +
+ The advancements in generative modeling, particularly the advent of diffusion
+models, have sparked a fundamental question: how can these models be
+effectively used for discriminative tasks? In this work, we find that
+generative models can be great test-time adapters for discriminative models.
+Our method, Diffusion-TTA, adapts pre-trained discriminative models such as
+image classifiers, segmenters and depth predictors, to each unlabelled example
+in the test set using generative feedback from a diffusion model. We achieve
+this by modulating the conditioning of the diffusion model using the output of
+the discriminative model. We then maximize the image likelihood objective by
+backpropagating the gradients to the discriminative model's parameters. We show
+Diffusion-TTA significantly enhances the accuracy of various large-scale
+pre-trained discriminative models, such as ImageNet classifiers, CLIP models,
+image pixel labellers and image depth predictors. Diffusion-TTA outperforms
+existing test-time adaptation methods, including TTT-MAE and TENT, and
+particularly shines in online adaptation setups, where the discriminative model
+is continually adapted to each example in the test set. We provide access to
+code, results, and visualizations on our website:
+https://diffusion-tta.github.io/.
+
+ comment: Accepted at NeurIPS 2023 Webpage with Code: + https://diffusion-tta.github.io/ +
+
+
+
+
+ + ☆ How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for + Vision LLMs SC + + +
+ This work focuses on the potential of Vision LLMs (VLLMs) in visual
+reasoning. Different from prior studies, we shift our focus from evaluating
+standard performance to introducing a comprehensive safety evaluation suite,
+covering both out-of-distribution (OOD) generalization and adversarial
+robustness. For the OOD evaluation, we present two novel VQA datasets, each
+with one variant, designed to test model performance under challenging
+conditions. In exploring adversarial robustness, we propose a straightforward
+attack strategy for misleading VLLMs to produce visually unrelated responses.
+Moreover, we assess the efficacy of two jailbreaking strategies, targeting
+either the vision or language component of VLLMs. Our evaluation of 21 diverse
+models, ranging from open-source VLLMs to GPT-4V, yields interesting
+observations: 1) Current VLLMs struggle with OOD texts but not images, unless
+the visual information is limited; and 2) These VLLMs can be easily misled by
+deceiving vision encoders only, and their vision-language training often
+compromises safety protocols. We release this safety evaluation suite at
+https://github.com/UCSC-VLAA/vllm-safety-benchmark.
+
+ comment: H.T., C.C., and Z.W. contribute equally. Work done during H.T. and + Z.W.'s internship at UCSC, and C.C. and Y.Z.'s internship at UNC +
+
+
+
+
+ + ☆ GART: Gaussian Articulated Template Models + + +
+ We introduce Gaussian Articulated Template Model GART, an explicit, +efficient, and expressive representation for non-rigid articulated subject +capturing and rendering from monocular videos. GART utilizes a mixture of +moving 3D Gaussians to explicitly approximate a deformable subject's geometry +and appearance. It takes advantage of a categorical template model prior (SMPL, +SMAL, etc.) with learnable forward skinning while further generalizing to more +complex non-rigid deformations with novel latent bones. GART can be +reconstructed via differentiable rendering from monocular videos in seconds or +minutes and rendered in novel poses faster than 150fps. + +
+
+ comment: 13 pages, code available at + https://www.cis.upenn.edu/~leijh/projects/gart/ +
+
+
+
+
+ + ☆ On Bringing Robots Home + + +
+ Throughout history, we have successfully integrated various machines into our +homes. Dishwashers, laundry machines, stand mixers, and robot vacuums are a few +recent examples. However, these machines excel at performing only a single task +effectively. The concept of a "generalist machine" in homes - a domestic +assistant that can adapt and learn from our needs, all while remaining +cost-effective - has long been a goal in robotics that has been steadily +pursued for decades. In this work, we initiate a large-scale effort towards +this goal by introducing Dobb-E, an affordable yet versatile general-purpose +system for learning robotic manipulation within household settings. Dobb-E can +learn a new task with only five minutes of a user showing it how to do it, +thanks to a demonstration collection tool ("The Stick") we built out of cheap +parts and iPhones. We use the Stick to collect 13 hours of data in 22 homes of +New York City, and train Home Pretrained Representations (HPR). Then, in a +novel home environment, with five minutes of demonstrations and fifteen minutes +of adapting the HPR model, we show that Dobb-E can reliably solve the task on +the Stretch, a mobile robot readily available on the market. Across roughly 30 +days of experimentation in homes of New York City and surrounding areas, we +test our system in 10 homes, with a total of 109 tasks in different +environments, and finally achieve a success rate of 81%. Beyond success +percentages, our experiments reveal a plethora of unique challenges absent or +ignored in lab robotics. These range from effects of strong shadows, to +variable demonstration quality by non-expert users. With the hope of +accelerating research on home robots, and eventually seeing robot butlers in +every home, we open-source Dobb-E software stack and models, our data, and our +hardware designs at https://dobb-e.com + +
+
+ comment: Project website and videos are available at https://dobb-e.com, + technical documentation for getting started is available at + https://docs.dobb-e.com, and code is released at + https://github.com/notmahi/dobb-e +
+
+
+
+
+ + ☆ CG-HOI: Contact-Guided 3D Human-Object Interaction Generation + + +
+ We propose CG-HOI, the first method to address the task of generating dynamic +3D human-object interactions (HOIs) from text. We model the motion of both +human and object in an interdependent fashion, as semantically rich human +motion rarely happens in isolation without any interactions. Our key insight is +that explicitly modeling contact between the human body surface and object +geometry can be used as strong proxy guidance, both during training and +inference. Using this guidance to bridge human and object motion enables +generating more realistic and physically plausible interaction sequences, where +the human body and corresponding object move in a coherent manner. Our method +first learns to model human motion, object motion, and contact in a joint +diffusion process, inter-correlated through cross-attention. We then leverage +this learned contact for guidance during inference synthesis of realistic, +coherent HOIs. Extensive evaluation shows that our joint contact-based +human-object interaction approach generates realistic and physically plausible +sequences, and we show two applications highlighting the capabilities of our +method. Conditioned on a given object trajectory, we can generate the +corresponding human motion without re-training, demonstrating strong +human-object interdependency learning. Our approach is also flexible, and can +be applied to static real-world 3D scene scans. + +
+
+ comment: Project page: https://cg-hoi.christian-diller.de Video: + https://www.youtube.com/watch?v=GNyQwTwZ15s +
+
+
+
+
+ + ☆ Animatable Gaussians: Learning Pose-dependent Gaussian Maps for + High-fidelity Human Avatar Modeling + + +
+ Modeling animatable human avatars from RGB videos is a long-standing and
+challenging problem. Recent works usually adopt MLP-based neural radiance
+fields (NeRF) to represent 3D humans, but it remains difficult for pure MLPs to
+regress pose-dependent garment details. To this end, we introduce Animatable
+Gaussians, a new avatar representation that leverages powerful 2D CNNs and 3D
+Gaussian splatting to create high-fidelity avatars. To associate 3D Gaussians
+with the animatable avatar, we learn a parametric template from the input
+videos, and then parameterize the template on two front & back canonical
+Gaussian maps where each pixel represents a 3D Gaussian. The learned template
+is adaptive to the wearing garments for modeling looser clothes like dresses.
+Such template-guided 2D parameterization enables us to employ a powerful
+StyleGAN-based CNN to learn the pose-dependent Gaussian maps for modeling
+detailed dynamic appearances. Furthermore, we introduce a pose projection
+strategy for better generalization given novel poses. Overall, our method can
+create lifelike avatars with dynamic, realistic and generalized appearances.
+Experiments show that our method outperforms other state-of-the-art approaches.
+Code: https://github.com/lizhe00/AnimatableGaussians
+
+ comment: Projectpage: https://animatable-gaussians.github.io/, Code: + https://github.com/lizhe00/AnimatableGaussians +
+
+
+
+
+ + ☆ Street TryOn: Learning In-the-Wild Virtual Try-On from Unpaired Person + Images + + +
+ Virtual try-on has become a popular research topic, but most existing methods +focus on studio images with a clean background. They can achieve plausible +results for this studio try-on setting by learning to warp a garment image to +fit a person's body from paired training data, i.e., garment images paired with +images of people wearing the same garment. Such data is often collected from +commercial websites, where each garment is demonstrated both by itself and on +several models. By contrast, it is hard to collect paired data for in-the-wild +scenes, and therefore, virtual try-on for casual images of people against +cluttered backgrounds is rarely studied. + In this work, we fill the gap in the current virtual try-on research by (1) +introducing a Street TryOn benchmark to evaluate performance on street scenes +and (2) proposing a novel method that can learn without paired data, from a set +of in-the-wild person images directly. Our method can achieve robust +performance across shop and street domains using a novel DensePose warping +correction method combined with diffusion-based inpainting controlled by pose +and semantic segmentation. Our experiments demonstrate competitive performance +for standard studio try-on tasks and SOTA performance for street try-on and +cross-domain try-on tasks. + +
+
+
+
+
+ + ☆ Interactive Autonomous Navigation with Internal State Inference and + Interactivity Estimation + + +
+ Deep reinforcement learning (DRL) provides a promising way for intelligent +agents (e.g., autonomous vehicles) to learn to navigate complex scenarios. +However, DRL with neural networks as function approximators is typically +considered a black box with little explainability and often suffers from +suboptimal performance, especially for autonomous navigation in highly +interactive multi-agent environments. To address these issues, we propose three +auxiliary tasks with spatio-temporal relational reasoning and integrate them +into the standard DRL framework, which improves the decision making performance +and provides explainable intermediate indicators. We propose to explicitly +infer the internal states (i.e., traits and intentions) of surrounding agents +(e.g., human drivers) as well as to predict their future trajectories in the +situations with and without the ego agent through counterfactual reasoning. +These auxiliary tasks provide additional supervision signals to infer the +behavior patterns of other interactive agents. Multiple variants of framework +integration strategies are compared. We also employ a spatio-temporal graph +neural network to encode relations between dynamic entities, which enhances +both internal state inference and decision making of the ego agent. Moreover, +we propose an interactivity estimation mechanism based on the difference +between predicted trajectories in these two situations, which indicates the +degree of influence of the ego agent on other agents. To validate the proposed +method, we design an intersection driving simulator based on the Intelligent +Intersection Driver Model (IIDM) that simulates vehicles and pedestrians. Our +approach achieves robust and state-of-the-art performance in terms of standard +evaluation metrics and provides explainable intermediate indicators (i.e., +internal states, and interactivity scores) for decision making. + +
+
+ comment: 18 pages, 14 figures +
+
+
+
+
+ + ☆ Self-correcting LLM-controlled Diffusion Models + + +
+ Text-to-image generation has witnessed significant progress with the advent +of diffusion models. Despite the ability to generate photorealistic images, +current text-to-image diffusion models still often struggle to accurately +interpret and follow complex input text prompts. In contrast to existing models +that aim to generate images only with their best effort, we introduce +Self-correcting LLM-controlled Diffusion (SLD). SLD is a framework that +generates an image from the input prompt, assesses its alignment with the +prompt, and performs self-corrections on the inaccuracies in the generated +image. Steered by an LLM controller, SLD turns text-to-image generation into an +iterative closed-loop process, ensuring correctness in the resulting image. SLD +is not only training-free but can also be seamlessly integrated with diffusion +models behind API access, such as DALL-E 3, to further boost the performance of +state-of-the-art diffusion models. Experimental results show that our approach +can rectify a majority of incorrect generations, particularly in generative +numeracy, attribute binding, and spatial relationships. Furthermore, by simply +adjusting the instructions to the LLM, SLD can perform image editing tasks, +bridging the gap between text-to-image generation and image editing pipelines. +We will make our code available for future research and applications. + +
+
+ comment: 16 pages, 10 figures +
+
+
+
+
+ + ☆ ViT-Lens-2: Gateway to Omni-modal Intelligence + + +
+ Aiming to advance AI agents, large foundation models significantly improve +reasoning and instruction execution, yet the current focus on vision and +language neglects the potential of perceiving diverse modalities in open-world +environments. However, the success of data-driven vision and language models is +costly or even infeasible to be reproduced for rare modalities. In this paper, +we present ViT-Lens-2 that facilitates efficient omni-modal representation +learning by perceiving novel modalities with a pretrained ViT and aligning them +to a pre-defined space. Specifically, the modality-specific lens is tuned to +project any-modal signals to an intermediate embedding space, which are then +processed by a strong ViT with pre-trained visual knowledge. The encoded +representations are optimized toward aligning with the modal-independent space, +pre-defined by off-the-shelf foundation models. ViT-Lens-2 provides a unified +solution for representation learning of increasing modalities with two +appealing advantages: (i) Unlocking the great potential of pretrained ViTs to +novel modalities effectively with efficient data regime; (ii) Enabling emergent +downstream capabilities through modality alignment and shared ViT parameters. +We tailor ViT-Lens-2 to learn representations for 3D point cloud, depth, audio, +tactile and EEG, and set new state-of-the-art results across various +understanding tasks, such as zero-shot classification. By seamlessly +integrating ViT-Lens-2 into Multimodal Foundation Models, we enable +Any-modality to Text and Image Generation in a zero-shot manner. Code and +models are available at https://github.com/TencentARC/ViT-Lens. + +
+
+ comment: This work is a follow-up of "ViT-Lens: Towards Omni-modal + Representations". arXiv admin note: text overlap with arXiv:2308.10185 +
+
+
+
+
+ + ☆ DiffSLVA: Harnessing Diffusion Models for Sign Language Video + Anonymization + + +
+ Since American Sign Language (ASL) has no standard written form, Deaf signers +frequently share videos in order to communicate in their native language. +However, since both hands and face convey critical linguistic information in +signed languages, sign language videos cannot preserve signer privacy. While +signers have expressed interest, for a variety of applications, in sign +language video anonymization that would effectively preserve linguistic +content, attempts to develop such technology have had limited success, given +the complexity of hand movements and facial expressions. Existing approaches +rely predominantly on precise pose estimations of the signer in video footage +and often require sign language video datasets for training. These requirements +prevent them from processing videos 'in the wild,' in part because of the +limited diversity present in current sign language video datasets. To address +these limitations, our research introduces DiffSLVA, a novel methodology that +utilizes pre-trained large-scale diffusion models for zero-shot text-guided +sign language video anonymization. We incorporate ControlNet, which leverages +low-level image features such as HED (Holistically-Nested Edge Detection) +edges, to circumvent the need for pose estimation. Additionally, we develop a +specialized module dedicated to capturing facial expressions, which are +critical for conveying essential linguistic information in signed languages. We +then combine the above methods to achieve anonymization that better preserves +the essential linguistic content of the original signer. This innovative +methodology makes possible, for the first time, sign language video +anonymization that could be used for real-world applications, which would offer +significant benefits to the Deaf and Hard-of-Hearing communities. We +demonstrate the effectiveness of our approach with a series of signer +anonymization experiments. + +
+
+ comment: Project webpage: https://github.com/Jeffery9707/DiffSLVA +
+
+
+
+
+ + ☆ Exploring Attribute Variations in Style-based GANs using Diffusion + Models + + +
+ Existing attribute editing methods treat semantic attributes as binary, +resulting in a single edit per attribute. However, attributes such as +eyeglasses, smiles, or hairstyles exhibit a vast range of diversity. In this +work, we formulate the task of \textit{diverse attribute editing} by modeling +the multidimensional nature of attribute edits. This enables users to generate +multiple plausible edits per attribute. We capitalize on disentangled latent +spaces of pretrained GANs and train a Denoising Diffusion Probabilistic Model +(DDPM) to learn the latent distribution for diverse edits. Specifically, we +train DDPM over a dataset of edit latent directions obtained by embedding image +pairs with a single attribute change. This leads to latent subspaces that +enable diverse attribute editing. Applying diffusion in the highly compressed +latent space allows us to model rich distributions of edits within limited +computational resources. Through extensive qualitative and quantitative +experiments conducted across a range of datasets, we demonstrate the +effectiveness of our approach for diverse attribute editing. We also showcase +the results of our method applied for 3D editing of various face attributes. + +
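+ A minimal sketch of how such edit latent directions could be collected, assuming a hypothetical GAN inversion function encode; the paper's actual embedding pipeline and DDPM training are not shown.
+ import numpy as np
+
+ def collect_edit_directions(image_pairs, encode):
+     """Turn (original, edited) image pairs that differ in one attribute into
+     latent edit directions; `encode` maps an image to its GAN latent code."""
+     directions = []
+     for img_src, img_edit in image_pairs:
+         w_src = encode(img_src)
+         w_edit = encode(img_edit)
+         directions.append(w_edit - w_src)   # latent delta describing the edit
+     return np.stack(directions)             # training data for the diffusion model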
+
+ comment: NeurIPS Workshop on Diffusion Models 2023 +
+
+
+
+
+ + ☆ Relightable 3D Gaussian: Real-time Point Cloud Relighting with BRDF + Decomposition and Ray Tracing + + +
+ We present a novel differentiable point-based rendering framework for +material and lighting decomposition from multi-view images, enabling editing, +ray-tracing, and real-time relighting of the 3D point cloud. Specifically, a 3D +scene is represented as a set of relightable 3D Gaussian points, where each +point is additionally associated with a normal direction, BRDF parameters, and +incident lights from different directions. To achieve robust lighting +estimation, we further divide incident lights of each point into global and +local components, as well as view-dependent visibilities. The 3D scene is +optimized through the 3D Gaussian Splatting technique while BRDF and lighting +are decomposed by physically-based differentiable rendering. Moreover, we +introduce an innovative point-based ray-tracing approach based on the bounding +volume hierarchy for efficient visibility baking, enabling real-time rendering +and relighting of 3D Gaussian points with accurate shadow effects. Extensive +experiments demonstrate improved BRDF estimation and novel view rendering +results compared to state-of-the-art material estimation approaches. Our +framework showcases the potential to revolutionize the mesh-based graphics +pipeline with a relightable, traceable, and editable rendering pipeline solely +based on point cloud. Project +page:https://nju-3dv.github.io/projects/Relightable3DGaussian/. + +
+
+
+
+
+ + ☆ Weakly-Supervised 3D Reconstruction of Clothed Humans via Normal Maps + + +
+ We present a novel deep learning-based approach to the 3D reconstruction of +clothed humans using weak supervision via 2D normal maps. Given a single RGB +image or multiview images, our network infers a signed distance function (SDF) +discretized on a tetrahedral mesh surrounding the body in a rest pose. +Subsequently, inferred pose and camera parameters are used to generate a normal +map from the SDF. A key aspect of our approach is the use of Marching +Tetrahedra to (uniquely) compute a triangulated surface from the SDF on the +tetrahedral mesh, facilitating straightforward differentiation (and thus +backpropagation). Thus, given only ground truth normal maps (with no volumetric +ground truth information), we can train the network to produce SDF values from +corresponding RGB images. Optionally, an additional multiview loss leads to +improved results. We demonstrate the efficacy of our approach for both network +inference and 3D reconstruction. +
+
+
+
+
+ + ☆ OccWorld: Learning a 3D Occupancy World Model for Autonomous Driving + + +
+ Understanding how the 3D scene evolves is vital for making decisions in +autonomous driving. Most existing methods achieve this by predicting the +movements of object boxes, which cannot capture more fine-grained scene +information. In this paper, we explore a new framework of learning a world +model, OccWorld, in the 3D Occupancy space to simultaneously predict the +movement of the ego car and the evolution of the surrounding scenes. We propose +to learn a world model based on 3D occupancy rather than 3D bounding boxes and +segmentation maps for three reasons: 1) expressiveness. 3D occupancy can +describe the more fine-grained 3D structure of the scene; 2) efficiency. 3D +occupancy is more economical to obtain (e.g., from sparse LiDAR points). 3) +versatility. 3D occupancy can adapt to both vision and LiDAR. To facilitate the +modeling of the world evolution, we learn a reconstruction-based scene +tokenizer on the 3D occupancy to obtain discrete scene tokens to describe the +surrounding scenes. We then adopt a GPT-like spatial-temporal generative +transformer to generate subsequent scene and ego tokens to decode the future +occupancy and ego trajectory. Extensive experiments on the widely used nuScenes +benchmark demonstrate the ability of OccWorld to effectively model the +evolution of the driving scenes. OccWorld also produces competitive planning +results without using instance and map supervision. Code: +https://github.com/wzzheng/OccWorld. + +
+
+ comment: Code is available at: https://github.com/wzzheng/OccWorld +
+
+
+
+
+ + ☆ GaussianEditor: Editing 3D Gaussians Delicately with Text Instructions + + +
+ Recently, impressive results have been achieved in 3D scene editing with text +instructions based on a 2D diffusion model. However, current diffusion models +primarily generate images by predicting noise in the latent space, and the +editing is usually applied to the whole image, which makes it challenging to +perform delicate, especially localized, editing for 3D scenes. Inspired by +recent 3D Gaussian splatting, we propose a systematic framework, named +GaussianEditor, to edit 3D scenes delicately via 3D Gaussians with text +instructions. Benefiting from the explicit property of 3D Gaussians, we design +a series of techniques to achieve delicate editing. Specifically, we first +extract the region of interest (RoI) corresponding to the text instruction, +aligning it to 3D Gaussians. The Gaussian RoI is further used to control the +editing process. Our framework can achieve more delicate and precise editing of +3D scenes than previous methods while enjoying much faster training speed, i.e. +within 20 minutes on a single V100 GPU, more than twice as fast as +Instruct-NeRF2NeRF (45 minutes -- 2 hours). + +
+
+ comment: Project page: https://GaussianEditor.github.io +
+
+
+
+
+ + ☆ Automated Measurement of Vascular Calcification in Femoral + Endarterectomy Patients Using Deep Learning + + +
+ Atherosclerosis, a chronic inflammatory disease affecting the large arteries, +presents a global health risk. Accurate analysis of diagnostic images, like +computed tomographic angiograms (CTAs), is essential for staging and monitoring +the progression of atherosclerosis-related conditions, including peripheral +arterial disease (PAD). However, manual analysis of CTA images is +time-consuming and tedious. To address this limitation, we employed a deep +learning model to segment the vascular system in CTA images of PAD patients +undergoing femoral endarterectomy surgery and to measure vascular calcification +from the left renal artery to the patella. Utilizing proprietary CTA images of +27 patients undergoing femoral endarterectomy surgery provided by Prisma Health +Midlands, we developed a Deep Neural Network (DNN) model to first segment the +arterial system, starting from the descending aorta to the patella, and second, +to provide a metric of arterial calcification. Our designed DNN achieved 83.4% +average Dice accuracy in segmenting arteries from aorta to patella, advancing +the state-of-the-art by 0.8%. Furthermore, our work is the first to present a +robust statistical analysis of automated calcification measurement in the lower +extremities using deep learning, attaining a Mean Absolute Percentage Error +(MAPE) of 9.5% and a correlation coefficient of 0.978 between automated and +manual calcification scores. These findings underscore the potential of deep +learning techniques as a rapid and accurate tool for medical professionals to +assess calcification in the abdominal aorta and its branches above the patella. +The developed DNN model and related documentation in this project are available +at GitHub page at https://github.com/pip-alireza/DeepCalcScoring. + +
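+ The reported metrics (Dice overlap for the segmentation, MAPE and correlation for the calcification scores) can be computed as in this small NumPy sketch; it is generic and not taken from the authors' repository.
+ import numpy as np
+
+ def dice_score(pred_mask, gt_mask, eps=1e-8):
+     """Dice overlap between two binary segmentation masks."""
+     pred, gt = pred_mask.astype(bool), gt_mask.astype(bool)
+     inter = np.logical_and(pred, gt).sum()
+     return 2.0 * inter / (pred.sum() + gt.sum() + eps)
+
+ def mape(automated, manual):
+     """Mean Absolute Percentage Error between automated and manual scores."""
+     automated = np.asarray(automated, dtype=float)
+     manual = np.asarray(manual, dtype=float)
+     return 100.0 * np.mean(np.abs(automated - manual) / np.abs(manual))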
+
+ comment: Published in MDPI Diagnostic journal, the code can be accessed via + the GitHub link in the paper +
+
+
+
+
+ + ☆ Adversarial Doodles: Interpretable and Human-drawable Attacks Provide + Describable Insights CVPR 2024 + +
+ DNN-based image classification models are susceptible to adversarial attacks. +Most previous adversarial attacks do not focus on the interpretability of the +generated adversarial examples, and we cannot gain insights into the mechanism +of the target classifier from the attacks. Therefore, we propose Adversarial +Doodles, which have interpretable shapes. We optimize black Bézier curves to +fool the target classifier by overlaying them onto the input image. By +introducing random perspective transformation and regularizing the doodled +area, we obtain compact attacks that cause misclassification even when humans +replicate them by hand. Adversarial doodles provide describable and intriguing +insights into the relationship between our attacks and the classifier's output. +We utilize adversarial doodles and discover the bias inherent in the target +classifier, such as "We add two strokes on its head, a triangle onto its body, +and two lines inside the triangle on a bird image. Then, the classifier +misclassifies the image as a butterfly." +
+
+ comment: Submitted to CVPR 2024 +
+
+
+
+
+ + ☆ Unified Batch Normalization: Identifying and Alleviating the Feature + Condensation in Batch Normalization and a Unified Framework + + +
+ Batch Normalization (BN) has become an essential technique in contemporary +neural network design, enhancing training stability. Specifically, BN employs +centering and scaling operations to standardize features along the batch +dimension and uses an affine transformation to recover features. Although +standard BN has shown its capability to improve deep neural network training +and convergence, it still exhibits inherent limitations in certain cases. Most +existing techniques that enhance BN consider a single or a few aspects of BN. +In this paper, we first identify problems with BN from a feature perspective +and explore that feature condensation exists in the learning when employing BN, +which negatively affects testing performance. To tackle this problem, we +propose a two-stage unified framework called Unified Batch Normalization (UBN). +In the first stage, we utilize a simple feature condensation threshold to +alleviate the feature condensation, which hinders inappropriate statistic +updates in normalization. In the second stage, we unify various normalization +variants to boost each component of BN. Our experimental results reveal that +UBN significantly enhances performance across different visual backbones and +notably expedites network training convergence, particularly in early training +stages. Notably, our method improved about 3% in top-1 accuracy on ImageNet +classification with large batch sizes, showing the effectiveness of our +approach in real-world scenarios. + +
+
+
+
+
+ + ☆ DiffAnt: Diffusion Models for Action Anticipation + + +
+ Anticipating future actions is inherently uncertain. Given an observed video +segment containing ongoing actions, multiple subsequent actions can plausibly +follow. This uncertainty becomes even larger when predicting far into the +future. However, the majority of existing action anticipation models adhere to +a deterministic approach, neglecting to account for future uncertainties. In +this work, we rethink action anticipation from a generative view, employing +diffusion models to capture different possible future actions. In this +framework, future actions are iteratively generated from standard Gaussian +noise in the latent space, conditioned on the observed video, and subsequently +transitioned into the action space. Extensive experiments on four benchmark +datasets, i.e., Breakfast, 50Salads, EpicKitchens, and EGTEA Gaze+, are +performed and the proposed method achieves superior or comparable results to +state-of-the-art methods, showing the effectiveness of a generative approach +for action anticipation. Our code and trained models will be published on +GitHub. + +
+
+
+
+
+ + ☆ Direct2.5: Diverse Text-to-3D Generation via Multi-view 2.5D Diffusion + + +
+ Recent advances in generative AI have unveiled significant potential for the +creation of 3D content. However, current methods either apply a pre-trained 2D +diffusion model with the time-consuming score distillation sampling (SDS), or a +direct 3D diffusion model trained on limited 3D data losing generation +diversity. In this work, we approach the problem by employing a multi-view 2.5D +diffusion fine-tuned from a pre-trained 2D diffusion model. The multi-view 2.5D +diffusion directly models the structural distribution of 3D data, while still +maintaining the strong generalization ability of the original 2D diffusion +model, filling the gap between 2D diffusion-based and direct 3D diffusion-based +methods for 3D content generation. During inference, multi-view normal maps are +generated using the 2.5D diffusion, and a novel differentiable rasterization +scheme is introduced to fuse the almost consistent multi-view normal maps into +a consistent 3D model. We further design a normal-conditioned multi-view image +generation module for fast appearance generation given the 3D geometry. Our +method is a one-pass diffusion process and does not require any SDS +optimization as post-processing. We demonstrate through extensive experiments +that, our direct 2.5D generation with the specially-designed fusion scheme can +achieve diverse, mode-seeking-free, and high-fidelity 3D content generation in +only 10 seconds. Project page: https://nju-3dv.github.io/projects/direct25. + +
+
+ comment: Project webpage: https://nju-3dv.github.io/projects/direct25 +
+
+
+
+
+ + ☆ Text2Loc: 3D Point Cloud Localization from Natural Language + + +
+ We tackle the problem of 3D point cloud localization based on a few natural +linguistic descriptions and introduce a novel neural network, Text2Loc, that +fully interprets the semantic relationship between points and text. Text2Loc +follows a coarse-to-fine localization pipeline: text-submap global place +recognition, followed by fine localization. In global place recognition, +relational dynamics among each textual hint are captured in a hierarchical +transformer with max-pooling (HTM), whereas a balance between positive and +negative pairs is maintained using text-submap contrastive learning. Moreover, +we propose a novel matching-free fine localization method to further refine the +location predictions, which completely removes the need for complicated +text-instance matching and is lighter, faster, and more accurate than previous +methods. Extensive experiments show that Text2Loc improves the localization +accuracy by up to 2x over the state-of-the-art on the KITTI360Pose +dataset. We will make the code publicly available. +
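+ The text-submap contrastive objective is not detailed here; a generic symmetric InfoNCE-style loss over matched (text, submap) embeddings, as sketched below in PyTorch, is one plausible reading (the temperature value and the normalization are assumptions, not the paper's exact formulation).
+ import torch
+ import torch.nn.functional as F
+
+ def text_submap_contrastive_loss(text_emb, submap_emb, temperature=0.07):
+     """Symmetric cross-entropy over cosine similarities of matched pairs."""
+     text_emb = F.normalize(text_emb, dim=-1)
+     submap_emb = F.normalize(submap_emb, dim=-1)
+     logits = text_emb @ submap_emb.t() / temperature      # (B, B) similarity matrix
+     targets = torch.arange(text_emb.size(0), device=text_emb.device)
+     return 0.5 * (F.cross_entropy(logits, targets) +
+                   F.cross_entropy(logits.t(), targets))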
+
+ comment: 10 pages, 6 figures, 6 tables +
+
+
+
+
+ + ☆ FALCON: Fairness Learning via Contrastive Attention Approach to + Continual Semantic Scene Understanding in Open World + + +
+ Continual Learning in semantic scene segmentation aims to continually learn +new unseen classes in dynamic environments while maintaining previously learned +knowledge. Prior studies focused on modeling the catastrophic forgetting and +background shift challenges in continual learning. However, fairness, another +major challenge that causes unfair predictions leading to low performance among +major and minor classes, still needs to be well addressed. In addition, prior +methods have yet to model the unknown classes well, thus resulting in producing +non-discriminative features among unknown classes. This paper presents a novel +Fairness Learning via Contrastive Attention Approach to continual learning in +semantic scene understanding. In particular, we first introduce a new Fairness +Contrastive Clustering loss to address the problems of catastrophic forgetting +and fairness. Then, we propose an attention-based visual grammar approach to +effectively model the background shift problem and unknown classes, producing +better feature representations for different unknown classes. Through our +experiments, our proposed approach achieves State-of-the-Art (SOTA) performance +on different continual learning settings of three standard benchmarks, i.e., +ADE20K, Cityscapes, and Pascal VOC. It promotes the fairness of the continual +semantic segmentation model. + +
+
+
+
+
+ + ☆ Efficient Pre-training for Localized Instruction Generation of Videos + + +
+ Procedural videos show step-by-step demonstrations of tasks like recipe +preparation. Understanding such videos is challenging, involving the precise +localization of steps and the generation of textual instructions. Manually +annotating steps and writing instructions is costly, which limits the size of +current datasets and hinders effective learning. Leveraging large but noisy +video-transcript datasets for pre-training can boost performance, but demands +significant computational resources. Furthermore, transcripts contain +irrelevant content and exhibit style variation compared to instructions written +by human annotators. To mitigate both issues, we propose a technique, +Sieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters +irrelevant transcripts and (ii) Swap enhances the quality of the text +instruction by automatically replacing the transcripts with human-written +instructions from a text-only recipe dataset. The curated dataset, three orders +of magnitude smaller than current web-scale datasets, enables efficient +training of large-scale models with competitive performance. We complement our +Sieve-&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step +localization and instruction generation for procedural videos. When this model +is pre-trained on our curated dataset, it achieves state-of-the-art performance +in zero-shot and finetuning settings on YouCook2 and Tasty, while using a +fraction of the computational resources. +
+
+
+
+
+ + ☆ From Pixels to Titles: Video Game Identification by Screenshots using + Convolutional Neural Networks + + +
+ This paper investigates video game identification through single screenshots, +utilizing five convolutional neural network (CNN) architectures (MobileNet, +DenseNet, EfficientNetB0, EfficientNetB2, and EfficientNetB3) across 22 home +console systems, spanning from Atari 2600 to PlayStation 5. Confirming the +hypothesis, CNNs autonomously extract image features, enabling the +identification of game titles from screenshots without additional features. +Using ImageNet pre-trained weights, EfficientNetB3 achieves the highest average +accuracy (74.51%), while DenseNet169 excels in 14 of the 22 systems. Employing +alternative initial weights from another screenshots dataset boosts accuracy +for EfficientNetB2 and EfficientNetB3, with the latter reaching a peak accuracy +of 76.36% and demonstrating reduced convergence epochs from 23.7 to 20.5 on +average. Overall, the combination of optimal architecture and weights attains +77.67% accuracy, primarily led by EfficientNetB3 in 19 systems. These findings +underscore the efficacy of CNNs in video game identification through +screenshots. + +
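+ A minimal transfer-learning sketch in the spirit of the setup above, assuming a recent torchvision; the weight enum and head replacement are the only assumptions, and the paper's exact training recipe and alternative screenshot-pretrained weights are not reproduced.
+ import torch.nn as nn
+ from torchvision import models
+
+ def build_screenshot_classifier(num_classes):
+     """EfficientNet-B3 initialized with ImageNet weights, with its final
+     linear layer replaced to predict game titles from screenshots."""
+     weights = models.EfficientNet_B3_Weights.IMAGENET1K_V1
+     model = models.efficientnet_b3(weights=weights)
+     in_features = model.classifier[1].in_features
+     model.classifier[1] = nn.Linear(in_features, num_classes)
+     return model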
+
+
+
+
+ + ☆ Tell2Design: A Dataset for Language-Guided Floor Plan Generation ACL2023 + + +
+ We consider the task of generating designs directly from natural language +descriptions, and consider floor plan generation as the initial research area. +Language conditional generative models have recently been very successful in +generating high-quality artistic images. However, designs must satisfy +different constraints that are not present in generating artistic images, +particularly spatial and relational constraints. We make multiple contributions +to initiate research on this task. First, we introduce a novel dataset, +Tell2Design (T2D), which contains more than 80k floor plan designs +associated with natural language instructions. Second, we propose a +Sequence-to-Sequence model that can serve as a strong baseline for future +research. Third, we benchmark this task with several text-conditional image +generation models. We conclude by conducting human evaluations on the generated +samples and providing an analysis of human performance. We hope our +contributions will propel the research on language-guided design generation +forward. +
+
+ comment: Paper published in ACL2023; Area Chair Award; Best Paper Nomination +
+
+
+
+
+ + ☆ Unleashing the Power of Prompt-driven Nucleus Instance Segmentation + + +
+ Nuclear instance segmentation in histology images is crucial for a broad +spectrum of clinical applications. Current prevailing nuclear instance +segmentation algorithms rely on regression of nuclei contours, distance maps, +watershed markers or a proxy nuclear representation of star-convex polygons. +Consequently, these methods necessitate sophisticated post-processing +operations to distinguish nuclei instances, which are commonly acknowledged to +be error-prone and parameter-sensitive. Recently, the segment anything model +(SAM) has attracted huge attention within the domain of medical image +segmentation due to its impressive generalization ability and promptable +property. Nevertheless, its potential on nuclear instance segmentation remains +largely underexplored. In this paper, we present a novel prompt-driven +framework that consists of a point prompter and a SAM for automatic nuclei +instance segmentation. Specifically, the prompter learns to generate a unique +point prompt for each nucleus while the SAM is fine-tuned to output the +corresponding mask of the cued nucleus. Furthermore, we propose to add adjacent +nuclei as negative prompts to promote the model's ability to recognize +overlapping nuclei. Without bells and whistles, our proposed method sets a new +state-of-the-art performance on three challenging benchmarks. Our code is +available at https://github.com/windygoo/PromptNucSeg. +
+
+
+
+
+ + ☆ Optimal Transport Aggregation for Visual Place Recognition + + +
+ The task of Visual Place Recognition (VPR) aims to match a query image +against references from an extensive database of images from different places, +relying solely on visual cues. State-of-the-art pipelines focus on the +aggregation of features extracted from a deep backbone, in order to form a +global descriptor for each image. In this context, we introduce SALAD (Sinkhorn +Algorithm for Locally Aggregated Descriptors), which reformulates NetVLAD's +soft-assignment of local features to clusters as an optimal transport problem. +In SALAD, we consider both feature-to-cluster and cluster-to-feature relations +and we also introduce a 'dustbin' cluster, designed to selectively discard +features deemed non-informative, enhancing the overall descriptor quality. +Additionally, we leverage and fine-tune DINOv2 as a backbone, which provides +enhanced description power for the local features, and dramatically reduces the +required training time. As a result, our single-stage method not only surpasses +single-stage baselines in public VPR datasets, but also surpasses two-stage +methods that add a re-ranking with significantly higher cost. Code and models +are available at https://github.com/serizba/salad. + +
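+ The optimal-transport view of feature-to-cluster assignment rests on the Sinkhorn algorithm; the NumPy sketch below shows the plain iteration with uniform marginals and omits SALAD's dustbin cluster and log-domain details.
+ import numpy as np
+
+ def sinkhorn_assignment(scores, n_iters=50, eps=0.05):
+     """Turn a (features x clusters) score matrix into an (approximately)
+     doubly-normalized soft assignment via Sinkhorn iterations."""
+     K = np.exp(scores / eps)                      # Gibbs kernel
+     r = np.full(K.shape[0], 1.0 / K.shape[0])     # uniform feature marginal
+     c = np.full(K.shape[1], 1.0 / K.shape[1])     # uniform cluster marginal
+     u, v = np.ones_like(r), np.ones_like(c)
+     for _ in range(n_iters):
+         u = r / (K @ v)                           # scale rows
+         v = c / (K.T @ u)                         # scale columns
+     return u[:, None] * K * v[None, :]            # transport plan / soft assignment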
+
+
+
+
+ + ☆ ADM-Loc: Actionness Distribution Modeling for Point-supervised Temporal + Action Localization + + +
+ This paper addresses the challenge of point-supervised temporal action +detection, in which only one frame per action instance is annotated in the +training set. Self-training aims to provide supplementary supervision for the +training process by generating pseudo-labels (action proposals) from a base +model. However, most current methods generate action proposals by applying +manually designed thresholds to action classification probabilities and +treating adjacent snippets as independent entities. As a result, these methods +struggle to generate complete action proposals, exhibit sensitivity to +fluctuations in action classification scores, and generate redundant and +overlapping action proposals. This paper proposes a novel framework termed +ADM-Loc, which stands for Actionness Distribution Modeling for point-supervised +action Localization. ADM-Loc generates action proposals by fitting a composite +distribution, comprising both Gaussian and uniform distributions, to the action +classification signals. This fitting process is tailored to each action class +present in the video and is applied separately for each action instance, +ensuring the distinctiveness of their distributions. ADM-Loc significantly +enhances the alignment between the generated action proposals and ground-truth +action instances and offers high-quality pseudo-labels for self-training. +Moreover, to model action boundary snippets, it enforces consistency in action +classification scores during training by employing Gaussian kernels, supervised +with the proposed loss functions. ADM-Loc outperforms the state-of-the-art +point-supervised methods on THUMOS14 and ActivityNet-v1.2 datasets. + +
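+ A toy 1D illustration of fitting a Gaussian-plus-uniform composite with EM, in the spirit of the actionness distribution modeling described above; the per-class handling and boundary supervision of ADM-Loc are not shown.
+ import numpy as np
+
+ def fit_gaussian_plus_uniform(x, n_iters=100):
+     """EM fit of p(x) = w * N(mu, sigma^2) + (1 - w) * Uniform(min(x), max(x))
+     to 1D samples such as actionness-weighted temporal positions."""
+     a, b = x.min(), x.max()
+     uniform_pdf = 1.0 / max(b - a, 1e-8)
+     w, mu, sigma = 0.5, x.mean(), x.std() + 1e-6
+     for _ in range(n_iters):
+         gauss = np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * np.sqrt(2 * np.pi))
+         resp = w * gauss / (w * gauss + (1 - w) * uniform_pdf + 1e-12)  # E-step
+         w = resp.mean()                                                 # M-step
+         mu = (resp * x).sum() / (resp.sum() + 1e-12)
+         sigma = np.sqrt((resp * (x - mu) ** 2).sum() / (resp.sum() + 1e-12)) + 1e-6
+     return w, mu, sigma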
+
+
+
+
+ + ☆ Computer Vision for Carriers: PATRIOT + + +
+ Deck tracking performed on carriers currently involves a team of sailors +manually identifying aircraft and updating a digital user interface called the +Ouija Board. Improvements to the deck tracking process would result in +increased Sortie Generation Rates, and therefore applying automation is seen as +a critical method to improve deck tracking. However, the requirements on a +carrier ship do not allow for the installation of hardware-based location +sensing technologies like Global Positioning System (GPS) sensors. PATRIOT +(Panoramic Asset Tracking of Real-Time Information for the Ouija Tabletop) is a +research effort and proposed solution to performing deck tracking with passive +sensing and without the need for GPS sensors. PATRIOT is a prototype system +which takes existing camera feeds, calculates aircraft poses, and updates a +virtual Ouija board interface with the current status of the assets. PATRIOT +would allow for faster, more accurate, and less laborious asset tracking for +aircraft, people, and support equipment. PATRIOT is anticipated to benefit the +warfighter by reducing cognitive workload, reducing manning requirements, +collecting data to improve logistics, and enabling an automation gateway for +future efforts to improve efficiency and safety. The authors have developed and +tested algorithms to perform pose estimations of assets in real-time including +OpenPifPaf, High-Resolution Network (HRNet), HigherHRNet (HHRNet), Faster +R-CNN, and in-house developed encoder-decoder network. The software was tested +with synthetic and real-world data and was able to accurately extract the pose +of assets. Fusion, tracking, and real-world generality are planned to be +improved to ensure a successful transition to the fleet. + +
+
+ comment: 8 pages, 18 figures. Published in the Proceedings of the ASNE 2023 + Technology, Systems & Ships Symposium. Reproduced with permission from the + American Society of Naval Engineers. Distribution Statement A: Approved for + public release; distribution is unlimited, as submitted under NAVAIR Public + Release Authorization 2023-019 +
+
+
+
+
+ + ☆ LIFT OFF: LoRaWAN Installation and Fiducial Tracking Operations for the + Flightline of the Future + + +
+ Real-time situational awareness for the location of assets is critical to +ensure missions are completed efficiently and requirements are satisfied. In +many commercial settings, the application of global positioning system (GPS) +sensors is appropriate to achieve timely knowledge of the position of people +and equipment. However, GPS sensors are not appropriate for all situations due +to flight clearance and operations security concerns. LIFT OFF: LoRaWAN +Installation and Fiducial Tracking Operations for the Flightline of the Future +proposes a hybrid framework solution to achieve real-time situational awareness +for people, support equipment, and aircraft positions regardless of the +environment. This framework included a machine-vision component, which involved +setting up cameras to detect AprilTag decals that were installed on the sides +of aircraft. The framework included a geolocation sensor component, which +involved installing GPS sensors on support equipment and helmets. The framework +also included creating a long-range wide area network (LoRaWAN) to transfer +data and developing a user interface to display the data. The framework was +tested at Naval Air Station Oceana Flightline, the United States Naval Test +Pilot School, and at Naval Air Warfare Center Aircraft Division Lakehurst. LIFT +OFF successfully provided a real-time updating map of all tracked assets using +GPS sensors for people and support equipment and with visual fiducials for +aircraft. The trajectories of the assets were recorded for logistical analysis +and playback. Future follow-on work is anticipated to apply the technology to +other environments including carriers and amphibious assault ships in addition +to the flightline. + +
+
+ comment: 6 pages, 11 figures. Published in the Proceedings of the ASNE 2023 + Technology, Systems & Ships Symposium. Reproduced with permission from the + American Society of Naval Engineers. Distribution Statement A: Approved for + public release; distribution is unlimited, as submitted under NAVAIR Public + Release Authorization 2023-020 +
+
+
+
+
+ + ☆ Enhancing Perceptual Quality in Video Super-Resolution through + Temporally-Consistent Detail Synthesis using Diffusion Models + + +
+ In this paper, we address the problem of video super-resolution (VSR) using +Diffusion Models (DM), and present StableVSR. Our method significantly enhances +the perceptual quality of upscaled videos by synthesizing realistic and +temporally-consistent details. We turn a pre-trained DM for single image +super-resolution into a VSR method by introducing the Temporal Conditioning +Module (TCM). TCM uses Temporal Texture Guidance, which provides +spatially-aligned and detail-rich texture information synthesized in adjacent +frames. This guides the generative process of the current frame toward +high-quality and temporally-consistent results. We introduce a Frame-wise +Bidirectional Sampling strategy to encourage the use of information from past +to future and vice-versa. This strategy improves the perceptual quality of the +results and the temporal consistency across frames. We demonstrate the +effectiveness of StableVSR in enhancing the perceptual quality of upscaled +videos compared to existing state-of-the-art methods for VSR. The code is +available at https://github.com/claudiom4sir/StableVSR. + +
+
+
+
+
+ + ☆ MetaDefa: Meta-learning based on Domain Enhancement and Feature + Alignment for Single Domain Generalization + + +
+ Single domain generalization (SDG) based on meta-learning has emerged as +an effective technique for solving the domain-shift problem. However, the +inadequate match of data distributions between source and augmented domains and +the difficult separation of domain-invariant features from domain-related features +make it hard for SDG models to achieve strong generalization. Therefore, a novel +meta-learning method based on domain enhancement and feature alignment +(MetaDefa) is proposed to improve the model generalization performance. First, +background substitution and visual corruption techniques are used to +generate diverse and effective augmented domains. Then, a multi-channel +feature alignment module based on class activation maps and class-agnostic +activation maps is designed to effectively extract adequate transferability +knowledge. In this module, domain-invariant features can be fully explored by +focusing on similar target regions between the source and augmented domain feature +spaces and suppressing the feature representation of non-similar target regions. +Extensive experiments on two publicly available datasets show that MetaDefa has +significant generalization performance advantages on multiple unknown target +domains. +
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ Data Generation for Post-OCR correction of Cyrillic handwriting + + +
+ This paper introduces a novel approach to post-Optical Character Recognition +Correction (POC) for handwritten Cyrillic text, addressing a significant gap in +current research methodologies. This gap is due to the lack of large text +corpora that provide OCR errors for further training of language-based POC +models, which are demanding in terms of corpus size. Our study primarily +focuses on the development and application of a synthetic handwriting +generation engine based on Bézier curves. Such an engine generates highly +realistic handwritten text in any quantity, which we utilize to create a +substantial dataset by transforming Russian text corpora sourced from the +internet. We apply a Handwritten Text Recognition (HTR) model to this dataset +to identify OCR errors, forming the basis for our POC model training. The +correction model is trained on a 90-symbol input context, utilizing a +pre-trained T5 architecture with a seq2seq correction task. We evaluate our +approach on the HWR200 and School_notebooks_RU datasets as they provide significant +challenges in the HTR domain. Furthermore, POC can be used to highlight errors +for teachers, evaluating student performance. This can be done simply by +comparing sentences before and after correction, displaying differences in +text. Our primary contribution lies in the innovative use of Bézier curves +for Cyrillic text generation and subsequent error correction using a +specialized POC model. We validate our approach by presenting Word Accuracy +Rate (WAR) and Character Accuracy Rate (CAR) results, both with and without +post-OCR correction, using real open corpora of handwritten Cyrillic text. +These results, coupled with our methodology, are designed to be reproducible, +paving the way for further advancements in the field of OCR and handwritten +text analysis. Paper contributions can be found at +https://github.com/dbrainio/CyrillicHandwritingPOC +
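+ The Bézier-based stroke synthesis can be illustrated with a short NumPy sketch that samples points along a cubic Bézier curve; chaining many such curves (with noise on the control points) is one way to emulate pen strokes, though this is not the paper's generation engine.
+ import numpy as np
+
+ def cubic_bezier(p0, p1, p2, p3, n_points=100):
+     """Sample n_points along the cubic Bézier curve defined by four 2D
+     control points; each point is a NumPy array of shape (2,)."""
+     t = np.linspace(0.0, 1.0, n_points)[:, None]
+     return ((1 - t) ** 3 * p0 + 3 * (1 - t) ** 2 * t * p1
+             + 3 * (1 - t) * t ** 2 * p2 + t ** 3 * p3)
+
+ # Example: one short, slightly wavy stroke.
+ stroke = cubic_bezier(np.array([0.0, 0.0]), np.array([1.0, 2.0]),
+                       np.array([2.0, -1.0]), np.array([3.0, 0.5]))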
+
+ comment: 17 pages, 27 figures, 6 tables, 26 references +
+
+
+
+
+ + ☆ Stability-Informed Initialization of Neural Ordinary Differential + Equations + + +
+ This paper addresses the training of Neural Ordinary Differential Equations +(neural ODEs), and in particular explores the interplay between numerical +integration techniques, stability regions, step size, and initialization +techniques. It is shown how the choice of integration technique implicitly +regularizes the learned model, and how the solver's corresponding stability +region affects training and prediction performance. From this analysis, a +stability-informed parameter initialization technique is introduced. The +effectiveness of the initialization method is displayed across several learning +benchmarks and industrial applications. + +
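+ As a concrete example of the solver/stability interplay mentioned above: for explicit Euler with step size h on linear(ized) dynamics dx/dt = A x, the update is stable when every scaled eigenvalue z = h*lambda of A satisfies |1 + z| < 1. The check below is a toy illustration, not the paper's initialization scheme.
+ import numpy as np
+
+ def euler_is_stable(A, step_size):
+     """True if explicit Euler with this step size is stable for dx/dt = A x,
+     i.e. all eigenvalues of A scaled by the step lie in the region |1 + z| < 1."""
+     z = step_size * np.linalg.eigvals(A)
+     return bool(np.all(np.abs(1.0 + z) < 1.0))
+
+ A = np.array([[-1.0, 0.5], [0.0, -2.0]])   # damped linear system
+ print(euler_is_stable(A, 0.1))             # True: both h*lambda stay in the region
+ print(euler_is_stable(A, 1.5))             # False: 1 + 1.5 * (-2) = -2 leaves it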
+
+
+
+
+ + ☆ EVCap: Retrieval-Augmented Image Captioning with External Visual-Name + Memory for Open-World Comprehension + + +
+ Large language models (LLMs)-based image captioning has the capability of +describing objects not explicitly observed in training data; yet novel objects +occur frequently, necessitating the requirement of sustaining up-to-date object +knowledge for open-world comprehension. Instead of relying on large amounts of +data and scaling up network parameters, we introduce a highly effective +retrieval-augmented image captioning method that prompts LLMs with object names +retrieved from External Visual--name memory (EVCap). We build ever-changing +object knowledge memory using objects' visuals and names, enabling us to (i) +update the memory at a minimal cost and (ii) effortlessly augment LLMs with +retrieved object names utilizing a lightweight and fast-to-train model. Our +model, which was trained only on the COCO dataset, can be adapted to out-domain +data without additional fine-tuning or retraining. Our comprehensive +experiments conducted on various benchmarks and synthetic commonsense-violating +data demonstrate that EVCap, comprising solely 3.97M trainable parameters, +exhibits superior performance compared to other methods of equivalent model +size scale. Notably, it achieves competitive performance against specialist +SOTAs with an enormous number of parameters. Our code is available at +https://jiaxuan-li.github.io/EVCap. + +
+
+ comment: Project page: https://jiaxuan-li.github.io/EVCap +
+
+
+
+
+ + ☆ RO-LLaMA: Generalist LLM for Radiation Oncology via Noise Augmentation + and Consistency Regularization + + +
+ Recent advancements in Artificial Intelligence (AI) have profoundly +influenced medical fields, by providing tools to reduce clinical workloads. +However, most AI models are constrained to execute uni-modal tasks, in stark +contrast to the comprehensive approaches utilized by medical professionals. To +address this, here we present RO-LLaMA, a versatile generalist large language +model (LLM) tailored for the field of radiation oncology. This model seamlessly +covers a wide range of the workflow of radiation oncologists, adept at various +tasks such as clinical report summarization, radiation therapy plan suggestion, +and plan-guided therapy target volume segmentation. In particular, to maximize +the end-to-end performance, we further present a novel Consistency Embedding +Fine-Tuning (CEFTune) technique, which boosts LLM's robustness to additional +errors at the intermediates while preserving the capability of handling clean +inputs, and creatively transform this concept into LLM-driven segmentation +framework as Consistency Embedding Segmentation (CESEG). Experimental results +on multi-centre cohort sets demonstrate our proposed RO-LLaMA's promising +performance for diverse tasks with generalization capabilities. + +
+
+
+
+
+ + ☆ InterControl: Generate Human Motion Interactions by Controlling Every + Joint + + +
+ Text-conditioned human motion generation models have achieved great progress by +introducing diffusion models and corresponding control signals. However, +interactions between humans are still underexplored. To model interactions among an +arbitrary number of humans, we define interactions as human joint pairs that +are either in contact or separated, and leverage a Large Language Model +(LLM) Planner to translate interaction descriptions into contact plans. Based +on the contact plans, interaction generation can be achieved by spatially +controllable motion generation methods that take joint contacts as spatial +conditions. We present a novel approach named InterControl for flexible spatial +control of every joint in every person at any time by leveraging a motion +diffusion model trained only on single-person data. We incorporate a motion +ControlNet to generate coherent and realistic motions given sparse spatial +control signals and a loss guidance module to precisely align any joint to the +desired position in a classifier-guidance manner via Inverse Kinematics (IK). +Extensive experiments on the HumanML3D and KIT-ML datasets demonstrate its +effectiveness in versatile joint control. We also collect data of joint contact +pairs by LLMs to show InterControl's ability in human interaction generation. +
+
+ comment: Generate human interactions with only single-person motion diffusion + model via LLM generated joint contact pairs, code + https://github.com/zhenzhiwang/intercontrol +
+
+
+
+
+ + ☆ JSSL: Joint Supervised and Self-supervised Learning for MRI + Reconstruction + + +
+ Magnetic Resonance Imaging represents an important diagnostic modality; +however, its inherently slow acquisition process poses challenges in obtaining +fully sampled k-space data under motion in clinical scenarios such as +abdominal, cardiac, and prostate imaging. In the absence of fully sampled +acquisitions, which can serve as ground truth data, training deep learning +algorithms in a supervised manner to predict the underlying ground truth image +becomes an impossible task. To address this limitation, self-supervised methods +have emerged as a viable alternative, leveraging available subsampled k-space +data to train deep learning networks for MRI reconstruction. Nevertheless, +these self-supervised approaches often fall short when compared to supervised +methodologies. In this paper, we introduce JSSL (Joint Supervised and +Self-supervised Learning), a novel training approach for deep learning-based +MRI reconstruction algorithms aimed at enhancing reconstruction quality in +scenarios where target dataset(s) containing fully sampled k-space measurements +are unavailable. Our proposed method operates by simultaneously training a +model in a self-supervised learning setting, using subsampled data from the +target dataset(s), and in a supervised learning manner, utilizing data from +other datasets, referred to as proxy datasets, where fully sampled k-space data +is accessible. To demonstrate the efficacy of JSSL, we utilized subsampled +prostate parallel MRI measurements as the target dataset, while employing fully +sampled brain and knee k-space acquisitions as proxy datasets. Our results +showcase a substantial improvement over conventional self-supervised training +methods, thereby underscoring the effectiveness of our joint approach. We +provide a theoretical motivation for JSSL and establish a practical +"rule-of-thumb" for selecting the most appropriate training approach for deep +MRI reconstruction. + +
+
+ comment: 26 pages, 11 figures, 6 tables +
+
+
+
+
+ + ☆ SiTH: Single-view Textured Human Reconstruction with Image-Conditioned + Diffusion + + +
+ A long-standing goal of 3D human reconstruction is to create lifelike and +fully detailed 3D humans from single images. The main challenge lies in +inferring unknown human shapes, clothing, and texture information in areas not +visible in the images. To address this, we propose SiTH, a novel pipeline that +uniquely integrates an image-conditioned diffusion model into a 3D mesh +reconstruction workflow. At the core of our method lies the decomposition of +the ill-posed single-view reconstruction problem into hallucination and +reconstruction subproblems. For the former, we employ a powerful generative +diffusion model to hallucinate back appearances from the input images. For the +latter, we leverage skinned body meshes as guidance to recover full-body +texture meshes from the input and back-view images. Our designs enable training +of the pipeline with only about 500 3D human scans while maintaining its +generality and robustness. Extensive experiments and user studies on two 3D +reconstruction benchmarks demonstrated the efficacy of our method in generating +realistic, fully textured 3D humans from a diverse range of unseen images. + +
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ Single-Model and Any-Modality for Video Object Tracking + + +
+ In the realm of video object tracking, auxiliary modalities such as depth, +thermal, or event data have emerged as valuable assets to complement the RGB +trackers. In practice, most existing RGB trackers learn a single set of +parameters to use them across datasets and applications. However, a similar +single-model unification for multi-modality tracking presents several +challenges. These challenges stem from the inherent heterogeneity of inputs -- +each with modality-specific representations, the scarcity of multi-modal +datasets, and the absence of all the modalities at all times. In this work, we +introduce Un-Track, a Unified Tracker of a single set of parameters +for any modality. To handle any modality, our method learns their common latent +space through low-rank factorization and reconstruction techniques. More +importantly, we use only the RGB-X pairs to learn the common latent space. This +unique shared representation seamlessly binds all modalities together, enabling +effective unification and accommodating any missing modality, all within a +single transformer-based architecture and without the need for +modality-specific fine-tuning. Our Un-Track achieves +8.1 absolute F-score +gain, on the DepthTrack dataset, by introducing only +2.14 (over 21.50) GFLOPs +with +6.6M (over 93M) parameters, through a simple yet efficient prompting +strategy. Extensive comparisons on five benchmark datasets with different +modalities show that Un-Track surpasses both SOTA unified trackers and +modality-specific finetuned counterparts, validating our effectiveness and +practicality. +
+
+
+
+
+ + ☆ Cell Maps Representation For Lung Adenocarcinoma Growth Patterns + Classification In Whole Slide Images + + +
+ Lung adenocarcinoma is a morphologically heterogeneous disease, characterized +by five primary histologic growth patterns. The quantity of these patterns can +be related to tumor behavior and has a significant impact on patient prognosis. +In this work, we propose a novel machine learning pipeline capable of +classifying tissue tiles into one of the five patterns or as non-tumor, with an +Area Under the Receiver Operating Characteristic Curve (AUCROC) score of 0.97. +Our model's strength lies in its comprehensive consideration of cellular +spatial patterns, where it first generates cell maps from Hematoxylin and Eosin +(H&E) whole slide images (WSIs), which are then fed into a convolutional neural +network classification model. Exploiting these cell maps provides the model +with robust generalizability to new data, achieving approximately 30% higher +accuracy on unseen test-sets compared to current state of the art approaches. +The insights derived from our model can be used to predict prognosis, enhancing +patient outcomes. + +
+
+
+
+
+ + ☆ Learning with Noisy Low-Cost MOS for Image Quality Assessment via + Dual-Bias Calibration + + +
+ Learning based image quality assessment (IQA) models have obtained impressive +performance with the help of reliable subjective quality labels, where mean +opinion score (MOS) is the most popular choice. However, in view of the +subjective bias of individual annotators, the labor-abundant MOS (LA-MOS) +typically requires a large collection of opinion scores from multiple +annotators for each image, which significantly increases the learning cost. In +this paper, we aim to learn robust IQA models from low-cost MOS (LC-MOS), which +only requires very few opinion scores or even a single opinion score for each +image. More specifically, we consider the LC-MOS as the noisy observation of +LA-MOS and enforce the IQA model learned from LC-MOS to approach the unbiased +estimation of LA-MOS. In this way, we represent the subjective bias between +LC-MOS and LA-MOS, and the model bias between IQA predictions learned from +LC-MOS and LA-MOS (i.e., dual-bias) as two latent variables with unknown +parameters. By means of the expectation-maximization based alternating +optimization, we can jointly estimate the parameters of the dual-bias, which +suppresses the misleading of LC-MOS via a gated dual-bias calibration (GDBC) +module. To the best of our knowledge, this is the first exploration of robust +IQA model learning from noisy low-cost labels. Theoretical analysis and +extensive experiments on four popular IQA datasets show that the proposed +method is robust toward different bias rates and annotation numbers and +significantly outperforms the other learning based IQA models when only LC-MOS +is available. Furthermore, we also achieve comparable performance with respect +to the other models learned with LA-MOS. + +
+
+
+
+
+ + ☆ Learning Disentangled Identifiers for Action-Customized Text-to-Image + Generation + + +
+ This study focuses on a novel task in text-to-image (T2I) generation, namely +action customization. The objective of this task is to learn the co-existing +action from limited data and generalize it to unseen humans or even animals. +Experimental results show that existing subject-driven customization methods +fail to learn the representative characteristics of actions and struggle in +decoupling actions from context features, including appearance. To overcome the +preference for low-level features and the entanglement of high-level features, +we propose an inversion-based method Action-Disentangled Identifier (ADI) to +learn action-specific identifiers from the exemplar images. ADI first expands +the semantic conditioning space by introducing layer-wise identifier tokens, +thereby increasing the representational richness while distributing the +inversion across different features. Then, to block the inversion of +action-agnostic features, ADI extracts the gradient invariance from the +constructed sample triples and masks the updates of irrelevant channels. To +comprehensively evaluate the task, we present an ActionBench that includes a +variety of actions, each accompanied by meticulously selected samples. Both +quantitative and qualitative results show that our ADI outperforms existing +baselines in action-customized T2I generation. + +
+
+
+
+
+ + ☆ Syn3DWound: A Synthetic Dataset for 3D Wound Bed Analysis + + +
+ Wound management poses a significant challenge, particularly for bedridden +patients and the elderly. Accurate diagnostic and healing monitoring can +significantly benefit from modern image analysis, providing accurate and +precise measurements of wounds. Despite several existing techniques, the +shortage of expansive and diverse training datasets remains a significant +obstacle to constructing machine learning-based frameworks. This paper +introduces Syn3DWound, an open-source dataset of high-fidelity simulated wounds +with 2D and 3D annotations. We propose baseline methods and a benchmarking +framework for automated 3D morphometry analysis and 2D/3D wound segmentation. + +
+
+
+
+
+ + ☆ A-JEPA: Joint-Embedding Predictive Architecture Can Listen + + +
+ This paper shows that the masked-modeling principle driving the success of +large foundational vision models can be effectively applied to audio by making +predictions in a latent space. We introduce the Audio-based Joint-Embedding +Predictive Architecture (A-JEPA), a simple extension method for self-supervised +learning from the audio spectrum. Following the design of I-JEPA, our A-JEPA +encodes visible audio spectrogram patches with a curriculum masking strategy +via a context encoder, and predicts the representations of regions sampled at +well-designed locations. The target representations of those regions are +extracted by the exponential moving average of the context encoder, i.e., the +target encoder, on the whole spectrogram. We find it beneficial to transfer +random block masking into time-frequency aware masking in a curriculum manner, +considering that audio spectrograms are highly correlated in local time and +frequency. To enhance contextual semantic understanding and +robustness, we fine-tune the encoder with a regularized masking on target +datasets, instead of input dropping or zeroing. Empirically, when built with a +Vision Transformer structure, A-JEPA is highly scalable and sets +new state-of-the-art performance on multiple audio and speech classification +tasks, outperforming other recent models that use externally supervised +pre-training. +
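+ The target encoder described above is a standard exponential-moving-average copy of the context encoder; a minimal PyTorch sketch of that update follows (the momentum value is an assumption, and this is not the A-JEPA code).
+ import torch
+
+ @torch.no_grad()
+ def ema_update(target_encoder, context_encoder, momentum=0.996):
+     """Move the target encoder's parameters toward the context encoder's;
+     the target receives no gradients and only tracks this moving average."""
+     for p_t, p_c in zip(target_encoder.parameters(), context_encoder.parameters()):
+         p_t.mul_(momentum).add_(p_c, alpha=1.0 - momentum)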
+
+
+
+
+ + ☆ FlowZero: Zero-Shot Text-to-Video Synthesis with LLM-Driven Dynamic + Scene Syntax + + +
+ Text-to-video (T2V) generation is a rapidly growing research area that aims +to translate the scenes, objects, and actions within complex video text into a +sequence of coherent visual frames. We present FlowZero, a novel framework that +combines Large Language Models (LLMs) with image diffusion models to generate +temporally-coherent videos. FlowZero uses LLMs to understand complex +spatio-temporal dynamics from text, where LLMs can generate a comprehensive +dynamic scene syntax (DSS) containing scene descriptions, object layouts, and +background motion patterns. These elements in DSS are then used to guide the +image diffusion model for video generation with smooth object motions and +frame-to-frame coherence. Moreover, FlowZero incorporates an iterative +self-refinement process, enhancing the alignment between the spatio-temporal +layouts and the textual prompts for the videos. To enhance global coherence, we +propose enriching the initial noise of each frame with motion dynamics to +control the background movement and camera motion adaptively. By using +spatio-temporal syntaxes to guide the diffusion process, FlowZero achieves +improvement in zero-shot video synthesis, generating coherent videos with vivid +motion. + +
+
+ comment: Project page: https://flowzero-video.github.io +
+
+
+
+
+ + ☆ C-SAW: Self-Supervised Prompt Learning for Image Generalization in + Remote Sensing + + +
+ We focus on domain and class generalization problems in analyzing optical +remote sensing images, using the large-scale pre-trained vision-language model +(VLM), CLIP. While contrastively trained VLMs show impressive zero-shot +generalization performance, their effectiveness is limited when dealing with +diverse domains during training and testing. Existing prompt learning +techniques overlook the importance of incorporating domain and content +information into the prompts, which results in a drop in performance while +dealing with such multi-domain data. To address these challenges, we propose a +solution that ensures domain-invariant prompt learning while enhancing the +expressiveness of visual features. We observe that CLIP's vision encoder +struggles to identify contextual image information, particularly when image +patches are jumbled up. This issue is especially severe in optical remote +sensing images, where land-cover classes exhibit well-defined contextual +appearances. To this end, we introduce C-SAW, a method that complements CLIP +with a self-supervised loss in the visual space and a novel prompt learning +technique that emphasizes both visual domain and content-specific features. We +keep the CLIP backbone frozen and introduce a small set of projectors for both +the CLIP encoders to train C-SAW contrastively. Experimental results +demonstrate the superiority of C-SAW across multiple remote sensing benchmarks +and different generalization tasks. + +
+
+ comment: Accepted in ACM ICVGIP 2023 +
+
+
+
+
+ + ☆ PIPE : Parallelized Inference Through Post-Training Quantization + Ensembling of Residual Expansions + + +
+ Deep neural networks (DNNs) are ubiquitous in computer vision and natural +language processing, but suffer from high inference cost. This problem can be +addressed by quantization, which consists in converting floating-point +operations into a lower bit-width format. With growing concerns over privacy +rights, we focus our efforts on data-free methods. However, such techniques +suffer from a lack of adaptability to the target devices, as hardware +typically only supports specific bit widths. Thus, to adapt to a variety of +devices, a quantization method should be flexible enough to find good accuracy +vs. speed trade-offs for every bit width and target device. To achieve this, +we propose PIPE, a quantization method that leverages residual error expansion, +along with group sparsity and an ensemble approximation for better +parallelization. PIPE is backed by strong theoretical guarantees and +achieves superior performance on every benchmarked application (from vision to +NLP tasks), architecture (ConvNets, transformers) and bit-width (from int8 to +ternary quantization). + +
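+
+ The residual error expansion idea can be illustrated with a toy sketch (assumptions only, not the PIPE code): each expansion term quantizes the error left by the previous terms, and the dequantized terms can be evaluated in parallel and summed.
+
+ ```python
+ # Toy residual error expansion for quantization (illustrative, data-free).
+ import numpy as np
+
+ def quantize(w, bits):
+     """Uniform symmetric quantization of a weight tensor (dequantized output)."""
+     qmax = 2 ** (bits - 1) - 1
+     scale = np.abs(w).max() / qmax if np.abs(w).max() > 0 else 1.0
+     q = np.clip(np.round(w / scale), -qmax, qmax)
+     return q * scale
+
+ def residual_expansion(w, bits, order):
+     terms, residual = [], w.copy()
+     for _ in range(order):
+         t = quantize(residual, bits)   # quantize what the previous terms missed
+         terms.append(t)
+         residual = residual - t
+     return terms
+
+ rng = np.random.default_rng(0)
+ W = rng.normal(size=(256, 256)).astype(np.float32)
+ for order in (1, 2, 3):
+     approx = sum(residual_expansion(W, bits=4, order=order))
+     err = np.linalg.norm(W - approx) / np.linalg.norm(W)
+     print(f"order={order}: relative error {err:.4f}")
+ ```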
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2203.14645 +
+
+
+
+
+ + ☆ SOAC: Spatio-Temporal Overlap-Aware Multi-Sensor Calibration using + Neural Radiance Fields + + +
+ In rapidly-evolving domains such as autonomous driving, the use of multiple +sensors with different modalities is crucial to ensure high operational +precision and stability. To correctly exploit the information provided by each +sensor in a single common frame, it is essential for these sensors to be +accurately calibrated. In this paper, we leverage the ability of Neural +Radiance Fields (NeRF) to represent different sensor modalities in a common +volumetric representation to achieve robust and accurate spatio-temporal sensor +calibration. By designing a partitioning approach based on the visible part of +the scene for each sensor, we formulate the calibration problem using only the +overlapping areas. This strategy results in a more robust and accurate +calibration that is less prone to failure. We demonstrate that our approach +works on outdoor urban scenes by validating it on multiple established driving +datasets. Results show that our method achieves better accuracy and +robustness than existing methods. + +
+
+ comment: Paper + Supplementary, under review +
+
+
+
+
+ + ☆ Relationship between Model Compression and Adversarial Robustness: A + Review of Current Evidence + + +
+ Increasing the model capacity is a known approach to enhance the adversarial +robustness of deep learning networks. On the other hand, various model +compression techniques, including pruning and quantization, can reduce the size +of the network while preserving its accuracy. Several recent studies have +addressed the relationship between model compression and adversarial +robustness, while some experiments have reported contradictory results. This +work summarizes available evidence and discusses possible explanations for the +observed effects. + +
+
+ comment: Accepted for publication at SSCI 2023 +
+
+
+
+
+ + ☆ Stable Segment Anything Model + + +
+ The Segment Anything Model (SAM) achieves remarkable promptable segmentation +given high-quality prompts which, however, often require good skills to +specify. To make SAM robust to casual prompts, this paper presents the first +comprehensive analysis on SAM's segmentation stability across a diverse +spectrum of prompt qualities, notably imprecise bounding boxes and insufficient +points. Our key finding reveals that given such low-quality prompts, SAM's mask +decoder tends to activate image features that are biased towards the background +or confined to specific object parts. To mitigate this issue, our key idea +consists of adjusting the sampling locations of image features using learnable +deformable offsets, while the original SAM model architecture and weights +remain unchanged. Consequently, our deformable sampling plugin (DSP) enables +SAM to adaptively shift attention to the prompted target regions in a +data-driven manner, facilitated by our effective robust training strategy +(RTS). During inference, a dynamic routing plugin (DRP) is proposed that toggles +SAM between the deformable and regular grid sampling modes, conditioned on the +input prompt quality. Thus, our solution, termed Stable-SAM, is the first of its kind to +focus solely on adjusting feature sampling locations, and it offers several +advantages: 1) improved segmentation stability across a wide range of +prompt qualities, 2) retention of SAM's powerful promptable segmentation +efficiency and generality, and 3) minimal learnable parameters (0.08 M) and +fast adaptation (about 1 training epoch). Extensive experiments across multiple +datasets validate the effectiveness and advantages of our approach, +underscoring Stable-SAM as a more robust solution for segmenting anything. +Codes will be released upon acceptance. + +
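+
+ A conceptual sketch of a deformable feature-sampling plugin in the spirit of DSP (shapes, offset bounds, and module names are assumptions, not the released Stable-SAM code): a small head predicts per-location offsets and the frozen image features are resampled with grid_sample.
+
+ ```python
+ # Illustrative deformable sampling plugin over a frozen feature map.
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+
+ class DeformableSamplingPlugin(nn.Module):
+     def __init__(self, channels):
+         super().__init__()
+         # Predict (dx, dy) offsets in normalized [-1, 1] grid units.
+         self.offset_head = nn.Conv2d(channels, 2, kernel_size=3, padding=1)
+
+     def forward(self, feats):
+         n, _, h, w = feats.shape
+         # Base identity sampling grid in [-1, 1].
+         ys = torch.linspace(-1, 1, h, device=feats.device)
+         xs = torch.linspace(-1, 1, w, device=feats.device)
+         gy, gx = torch.meshgrid(ys, xs, indexing="ij")
+         base = torch.stack((gx, gy), dim=-1).expand(n, h, w, 2)
+         offsets = 0.1 * torch.tanh(self.offset_head(feats))  # bounded shifts
+         grid = base + offsets.permute(0, 2, 3, 1)
+         return F.grid_sample(feats, grid, mode="bilinear", align_corners=True)
+
+ feats = torch.randn(1, 256, 64, 64)
+ print(DeformableSamplingPlugin(256)(feats).shape)  # torch.Size([1, 256, 64, 64])
+ ```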
+
+ comment: Codes will be released upon acceptance +
+
+
+
+
+ + ☆ Check, Locate, Rectify: A Training-Free Layout Calibration System for + Text-to-Image Generation + + +
+ Diffusion models have recently achieved remarkable progress in generating +realistic images. However, challenges remain in accurately understanding and +synthesizing the layout requirements in the textual prompts. To align the +generated image with layout instructions, we present a training-free layout +calibration system SimM that intervenes in the generative process on the fly +during inference time. Specifically, following a "check-locate-rectify" +pipeline, the system first analyses the prompt to generate the target layout +and compares it with the intermediate outputs to automatically detect errors. +Then, by moving the located activations and making intra- and inter-map +adjustments, the rectification process can be performed with negligible +computational overhead. To evaluate SimM over a range of layout requirements, +we present a benchmark SimMBench that compensates for the lack of superlative +spatial relations in existing datasets. Both quantitative and qualitative +results demonstrate the effectiveness of the proposed SimM in calibrating +layout inconsistencies. + +
+
+
+
+
+ + ☆ Side4Video: Spatial-Temporal Side Network for Memory-Efficient + Image-to-Video Transfer Learning + + +
+ Large pre-trained vision models achieve impressive success in computer +vision. However, fully fine-tuning large models for downstream tasks, +particularly in video understanding, can be prohibitively computationally +expensive. Recent studies turn their focus towards efficient image-to-video +transfer learning. Nevertheless, existing efficient fine-tuning methods lack +attention to training memory usage and exploration of transferring a larger +model to the video domain. In this paper, we present a novel Spatial-Temporal +Side Network for memory-efficient fine-tuning of large image models for video +understanding, named Side4Video. Specifically, we introduce a lightweight +spatial-temporal side network attached to the frozen vision model, which avoids +backpropagation through the heavy pre-trained model and utilizes +multi-level spatial features from the original image model. This extremely +memory-efficient architecture enables our method to reduce memory usage by 75% +compared with previous adapter-based methods. In this way, we can transfer a huge ViT-E +(4.4B) for video understanding tasks, which is 14x larger than ViT-L (304M). Our +approach achieves remarkable performance on various video datasets across +unimodal and cross-modal tasks (i.e., action recognition and text-video +retrieval), especially in Something-Something V1&V2 (67.3% & 74.6%), +Kinetics-400 (88.6%), MSR-VTT (52.3%), MSVD (56.1%) and VATEX (68.8%). We +release our code at https://github.com/HJYao00/Side4Video. + +
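+
+ A minimal sketch of the side-network idea (sizes and names are assumptions, not the released Side4Video code): the side branch consumes detached multi-level features from a frozen backbone, so no gradients or activations are kept for the heavy pre-trained model.
+
+ ```python
+ # Toy frozen backbone + lightweight side network.
+ import torch
+ import torch.nn as nn
+
+ class FrozenBackbone(nn.Module):
+     def __init__(self, dim=768, depth=4):
+         super().__init__()
+         self.blocks = nn.ModuleList(nn.Linear(dim, dim) for _ in range(depth))
+
+     @torch.no_grad()  # no activations kept for backprop through the backbone
+     def forward(self, x):
+         feats = []
+         for blk in self.blocks:
+             x = torch.relu(blk(x))
+             feats.append(x)
+         return feats
+
+ class SideNetwork(nn.Module):
+     def __init__(self, dim=768, side_dim=96, depth=4, num_classes=400):
+         super().__init__()
+         self.proj = nn.ModuleList(nn.Linear(dim, side_dim) for _ in range(depth))
+         self.fuse = nn.ModuleList(nn.Linear(side_dim, side_dim) for _ in range(depth))
+         self.head = nn.Linear(side_dim, num_classes)
+
+     def forward(self, backbone_feats):
+         s = 0.0
+         for p, f, feat in zip(self.proj, self.fuse, backbone_feats):
+             s = torch.relu(f(s + p(feat.detach())))  # detach: frozen branch
+         return self.head(s.mean(dim=1))              # pool over tokens
+
+ backbone, side = FrozenBackbone(), SideNetwork()
+ tokens = torch.randn(2, 16, 768)   # (batch, tokens, dim)
+ print(side(backbone(tokens)).shape)  # torch.Size([2, 400])
+ ```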
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Towards Vision Enhancing LLMs: Empowering Multimodal Knowledge Storage + and Sharing in LLMs + + +
+ Recent advancements in multimodal large language models (MLLMs) have achieved +significant multimodal generation capabilities, akin to GPT-4. These models +predominantly map visual information into language representation space, +leveraging the vast knowledge and powerful text generation abilities of LLMs to +produce multimodal instruction-following responses. We term this method +LLMs for Vision because it employs LLMs for visual-language +understanding, yet we observe that these MLLMs neglect the potential of harnessing +visual knowledge to enhance the overall capabilities of LLMs, which could be +regarded as Vision Enhancing LLMs. In this paper, we propose an approach called +MKS2, aimed at enhancing LLMs through empowering Multimodal Knowledge Storage +and Sharing in LLMs. Specifically, we introduce the Modular Visual Memory, a +component integrated into the internal blocks of LLMs, designed to store +open-world visual information efficiently. Additionally, we present a soft +Mixtures-of-Multimodal Experts architecture in LLMs to invoke multimodal +knowledge collaboration during generation. Our comprehensive experiments +demonstrate that MKS2 substantially augments the reasoning capabilities of LLMs +in contexts necessitating physical or commonsense knowledge. It also delivers +competitive results on multimodal benchmarks. + +
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ☆ PyNanospacing: TEM image processing tool for strain analysis and + visualization + + +
+ The diverse spectrum of material characteristics including band gap, +mechanical moduli, color, phonon and electronic density of states, along with +catalytic and surface properties are intricately intertwined with the atomic +structure and the corresponding interatomic bond-lengths. This interconnection +extends to the manifestation of interplanar spacings within a crystalline +lattice. Analysis of these interplanar spacings and the comprehension of any +deviations, whether it be lattice compression or expansion, commonly referred +to as strain, hold paramount significance in unraveling various unknowns within +the field. Transmission Electron Microscopy (TEM) is widely used to capture +atomic-scale ordering, facilitating direct investigation of interplanar +spacings. However, creating critical contour maps for visualizing and +interpreting lattice stresses in TEM images remains a challenging task. Here we +developed a Python code for TEM image processing that can handle a wide range +of materials including nanoparticles, 2D materials, pure crystals and solid +solutions. This algorithm converts local differences in interplanar spacings +into contour maps allowing for a visual representation of lattice expansion and +compression. The tool is very generic and can significantly aid in analyzing +material properties using TEM images, allowing for a more in-depth exploration +of the underlying science behind strain engineering via strain contour maps at +the atomic level. + +
+
+ comment: Preprint, 13 pages, 9 figures +
+
+
+
+
+ + ☆ One More Step: A Versatile Plug-and-Play Module for Rectifying Diffusion + Schedule Flaws and Enhancing Low-Frequency Controls + + +
+ It is well known that many open-released foundational diffusion models have +difficulty in generating images that substantially depart from average +brightness, despite such images being present in the training data. This is due +to an inconsistency: while denoising starts from pure Gaussian noise during +inference, the training noise schedule retains residual data even in the final +timestep distribution, due to difficulties in numerical conditioning in +mainstream formulation, leading to unintended bias during inference. To +mitigate this issue, certain $\epsilon$-prediction models are combined with an +ad-hoc offset-noise methodology. In parallel, some contemporary models have +adopted zero-terminal SNR noise schedules together with +$\mathbf{v}$-prediction, which necessitate major alterations to pre-trained +models. However, such changes risk destabilizing a large multitude of +community-driven applications anchored on these pre-trained models. In light of +this, our investigation revisits the fundamental causes, leading to our +proposal of an innovative and principled remedy, called One More Step (OMS). By +integrating a compact network and incorporating an additional simple yet +effective step during inference, OMS elevates image fidelity and harmonizes the +dichotomy between training and inference, while preserving original model +parameters. Once trained, various pre-trained diffusion models with the same +latent domain can share the same OMS module. + +
+
+ comment: Project Page: https://jabir-zheng.github.io/OneMoreStep/, Demo Page: + https://huggingface.co/spaces/h1t/oms_sdxl_lcm +
+
+
+
+
+ + ☆ Machine Learning-Based Jamun Leaf Disease Detection: A Comprehensive + Review + + +
+ Jamun leaf diseases pose a significant threat to agricultural productivity, +negatively impacting both yield and quality in the jamun industry. The advent +of machine learning has opened up new avenues for tackling these diseases +effectively. Early detection and diagnosis are essential for successful crop +management. While no automated systems have yet been developed specifically for +jamun leaf disease detection, various automated systems have been implemented +for similar types of disease detection using image processing techniques. This +paper presents a comprehensive review of machine learning methodologies +employed for diagnosing plant leaf diseases through image classification, which +can be adapted for jamun leaf disease detection. It meticulously assesses the +strengths and limitations of various Vision Transformer models, including +Transfer learning model and vision transformer (TLMViT), SLViT, SE-ViT, +IterationViT, Tiny-LeViT, IEM-ViT, GreenViT, and PMViT. Additionally, the paper +reviews models such as Dense Convolutional Network (DenseNet), Residual Neural +Network (ResNet)-50V2, EfficientNet, Ensemble model, Convolutional Neural +Network (CNN), and Locally Reversible Transformer. These machine-learning +models have been evaluated on various datasets, demonstrating their real-world +applicability. This review not only sheds light on current advancements in the +field but also provides valuable insights for future research directions in +machine learning-based jamun leaf disease detection and classification. + +
+
+
+
+
+ + ☆ Optimization of Image Processing Algorithms for Character Recognition in + Cultural Typewritten Documents + + +
+ Linked Data is used in various fields as a new way of structuring and +connecting data. Cultural heritage institutions have been using linked data to +improve archival descriptions and facilitate the discovery of information. Most +archival records have digital representations of physical artifacts in the form +of scanned images that are non-machine-readable. Optical Character Recognition +(OCR) recognizes text in images and translates it into machine-encoded text. +This paper evaluates the impact of image processing methods and parameter +tuning in OCR applied to typewritten cultural heritage documents. The approach +uses a multi-objective problem formulation to minimize Levenshtein edit +distance and maximize the number of words correctly identified with a +non-dominated sorting genetic algorithm (NSGA-II) to tune the methods' +parameters. Evaluation results show that parameterization by digital +representation typology benefits the performance of image pre-processing +algorithms in OCR. Furthermore, our findings suggest that employing image +pre-processing algorithms in OCR might be more suitable for typologies where +the text recognition task without pre-processing does not produce good results. +In particular, Adaptive Thresholding, Bilateral Filter, and Opening are the +best-performing algorithms for the theatre plays' covers, letters, and overall +dataset, respectively, and should be applied before OCR to improve its +performance. + +
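+
+ The two objectives being optimized can be illustrated with a short sketch (the NSGA-II search over pre-processing parameters itself, e.g. via a library such as pymoo, is not shown): minimize the Levenshtein edit distance and maximize the number of correctly recognized words.
+
+ ```python
+ # Toy evaluation of the two OCR tuning objectives.
+ def levenshtein(a: str, b: str) -> int:
+     prev = list(range(len(b) + 1))
+     for i, ca in enumerate(a, 1):
+         cur = [i]
+         for j, cb in enumerate(b, 1):
+             cur.append(min(prev[j] + 1, cur[j - 1] + 1, prev[j - 1] + (ca != cb)))
+         prev = cur
+     return prev[-1]
+
+ def ocr_objectives(ocr_output: str, ground_truth: str):
+     edit = levenshtein(ocr_output, ground_truth)
+     correct = sum(w in ocr_output.split() for w in ground_truth.split())
+     # Return both as minimization objectives (negate the word count).
+     return edit, -correct
+
+ print(ocr_objectives("Teatro Nacional 1925", "Teatro Nacional, 1925"))
+ ```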
+
+ comment: 25 pages, 4 figures +
+
+
+
+
+ + ☆ GPT4Vis: What Can GPT-4 Do for Zero-shot Visual Recognition? + + +
+ This paper does not present a novel method. Instead, it delves into an +essential, yet must-know baseline in light of the latest advancements in +Generative Artificial Intelligence (GenAI): the utilization of GPT-4 for visual +understanding. Our study centers on the evaluation of GPT-4's linguistic and +visual capabilities in zero-shot visual recognition tasks. Specifically, we +explore the potential of its generated rich textual descriptions across various +categories to enhance recognition performance without any training. +Additionally, we evaluate its visual proficiency in directly recognizing +diverse visual content. To achieve this, we conduct an extensive series of +experiments, systematically quantifying the performance of GPT-4 across three +modalities: images, videos, and point clouds. This comprehensive evaluation +encompasses a total of 16 widely recognized benchmark datasets, providing top-1 +and top-5 accuracy metrics. Our study reveals that leveraging GPT-4's advanced +linguistic knowledge to generate rich descriptions markedly improves zero-shot +recognition. In terms of visual proficiency, GPT-4V's average performance +across 16 datasets sits roughly between the capabilities of OpenAI-CLIP's ViT-L +and EVA-CLIP's ViT-E. We hope that this research will contribute valuable data +points and experience for future studies. We release our code at +https://github.com/whwu95/GPT4Vis. + +
+
+ comment: Technical report. Work in progress +
+
+
+
+
+ + ☆ Adinkra Symbol Recognition using Classical Machine Learning and Deep + Learning + + +
+ Artificial intelligence (AI) has emerged as a transformative influence, +engendering paradigm shifts in global societies, spanning academia and +industry. However, in light of these rapid advances, addressing the +underrepresentation of black communities and African countries in AI is +crucial. Boosting enthusiasm for AI can be effectively accomplished by +showcasing straightforward applications around tasks like identifying and +categorizing traditional symbols, such as Adinkra symbols, or familiar objects +within the community. In this research endeavor, we dived into classical +machine learning and harnessed the power of deep learning models to tackle the +intricate task of classifying and recognizing Adinkra symbols. The idea led to +a newly constructed ADINKRA dataset comprising 174,338 images meticulously +organized into 62 distinct classes, each representing a singular and emblematic +symbol. We constructed a CNN model for classification and recognition using six +convolutional layers, three fully connected (FC) layers, and optional dropout +regularization. The model is a simpler and smaller version of VGG, with fewer +layers, smaller channel sizes, and a fixed kernel size. Additionally, we tap +into the transfer learning capabilities provided by pre-trained models like VGG +and ResNet. These models assist us in both classifying images and extracting +features that can be used with classical machine learning models. We assess the +model's performance by measuring its accuracy and convergence rate and +visualizing the areas that significantly influence its predictions. These +evaluations serve as a foundational benchmark for future assessments of the +ADINKRA dataset. We hope this application exemplar inspires ideas on the +various uses of AI in organizing our traditional and modern lives. + +
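+
+ A hedged sketch of the kind of compact VGG-style classifier described (six convolutional layers, three FC layers, optional dropout); the channel sizes and input resolution are assumptions rather than the paper's exact configuration.
+
+ ```python
+ # Compact VGG-like CNN with 6 conv layers and 3 FC layers (illustrative sizes).
+ import torch
+ import torch.nn as nn
+
+ class AdinkraCNN(nn.Module):
+     def __init__(self, num_classes=62, dropout=0.5):
+         super().__init__()
+         chans = [3, 32, 32, 64, 64, 128, 128]
+         convs = []
+         for i in range(6):
+             convs += [nn.Conv2d(chans[i], chans[i + 1], 3, padding=1), nn.ReLU()]
+             if i % 2 == 1:                      # pool after every second conv
+                 convs.append(nn.MaxPool2d(2))
+         self.features = nn.Sequential(*convs)
+         self.classifier = nn.Sequential(
+             nn.Flatten(),
+             nn.Linear(128 * 16 * 16, 256), nn.ReLU(), nn.Dropout(dropout),
+             nn.Linear(256, 128), nn.ReLU(), nn.Dropout(dropout),
+             nn.Linear(128, num_classes),
+         )
+
+     def forward(self, x):
+         return self.classifier(self.features(x))
+
+ print(AdinkraCNN()(torch.randn(1, 3, 128, 128)).shape)  # torch.Size([1, 62])
+ ```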
+
+ comment: 15 pages, 6 figures, 5 tables +
+
+
+
+
+ + ☆ MARIS: Referring Image Segmentation via Mutual-Aware Attention Features + + +
+ Referring image segmentation (RIS) aims to segment a particular region based +on a language expression prompt. Existing methods incorporate linguistic +features into visual features and obtain multi-modal features for mask +decoding. However, these methods may segment the visually salient entity +instead of the correct referring region, as the multi-modal features are +dominated by the abundant visual context. In this paper, we propose MARIS, a +referring image segmentation method that leverages the Segment Anything Model +(SAM) and introduces a mutual-aware attention mechanism to enhance the +cross-modal fusion via two parallel branches. Specifically, our mutual-aware +attention mechanism consists of Vision-Guided Attention and Language-Guided +Attention, which bidirectionally model the relationship between visual and +linguistic features. Correspondingly, we design a Mask Decoder to enable +explicit linguistic guidance for more consistent segmentation with the language +expression. To this end, a multi-modal query token is proposed to integrate +linguistic information and interact with visual information simultaneously. +Extensive experiments on three benchmark datasets show that our method +outperforms the state-of-the-art RIS methods. Our code will be publicly +available. + +
+
+
+
+
+ + ☆ GLIME: General, Stable and Local LIME Explanation NeurIPS 2023 + + +
+ As black-box machine learning models grow in complexity and find applications +in high-stakes scenarios, it is imperative to provide explanations for their +predictions. Although Local Interpretable Model-agnostic Explanations (LIME) +[22] is a widely adopted method for understanding model behaviors, it is +unstable with respect to random seeds [35,24,3] and exhibits low local fidelity +(i.e., how well the explanation approximates the model's local behaviors) +[21,16]. Our study shows that this instability problem stems from small sample +weights, leading to the dominance of regularization and slow convergence. +Additionally, LIME's sampling neighborhood is non-local and biased towards the +reference, resulting in poor local fidelity and sensitivity to reference +choice. To tackle these challenges, we introduce GLIME, an enhanced framework +extending LIME and unifying several prior methods. Within the GLIME framework, +we derive an equivalent formulation of LIME that achieves significantly faster +convergence and improved stability. By employing a local and unbiased sampling +distribution, GLIME generates explanations with higher local fidelity compared +to LIME. GLIME explanations are independent of reference choice. Moreover, +GLIME offers users the flexibility to choose a sampling distribution based on +their specific scenarios. + +
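+
+ A minimal LIME-style local surrogate (a sketch for intuition, not GLIME itself): perturbed binary masks are weighted by an exponential proximity kernel and fed to a weighted ridge regression; for distant samples the weights become tiny, which illustrates the regularization-dominance issue the paper identifies. Kernel width and model are arbitrary toy choices.
+
+ ```python
+ # Toy weighted ridge surrogate in the style of LIME.
+ import numpy as np
+
+ def black_box(z):                      # stand-in for the model being explained
+     return z[:, 0] * 2.0 + z[:, 1] - 0.5 * z[:, 2]
+
+ rng = np.random.default_rng(0)
+ d, n = 10, 500
+ Z = rng.integers(0, 2, size=(n, d)).astype(float)   # perturbed feature masks
+ y = black_box(Z)
+ dist = np.sqrt(((Z - 1.0) ** 2).sum(axis=1))         # distance to x (all-ones)
+ weights = np.exp(-(dist ** 2) / 0.25)                 # exponential kernel
+
+ # Weighted ridge regression in closed form.
+ lam = 1.0
+ W = np.diag(weights)
+ A = Z.T @ W @ Z + lam * np.eye(d)
+ coef = np.linalg.solve(A, Z.T @ W @ y)
+ print("mean sample weight:", weights.mean())          # near zero -> unstable fit
+ print("surrogate coefficients:", np.round(coef, 3))
+ ```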
+
+ comment: Accepted by NeurIPS 2023 as a Spotlight paper +
+
+
+
+
+ + ☆ Variational Autoencoders for Feature Exploration and Malignancy + Prediction of Lung Lesions BMVC 2023 + + +
+ Lung cancer is responsible for 21% of cancer deaths in the UK and five-year +survival rates are heavily influenced by the stage at which the cancer was +identified. Recent studies have demonstrated the capability of AI methods for accurate +and early diagnosis of lung cancer from routine scans. However, this evidence +has not translated into clinical practice with one barrier being a lack of +interpretable models. This study investigates the application of Variational +Autoencoders (VAEs), a type of generative AI model, to lung cancer lesions. +Proposed models were trained on lesions extracted from 3D CT scans in the +LIDC-IDRI public dataset. Latent vector representations of 2D slices produced +by the VAEs were explored through clustering to justify their quality and used +in an MLP classifier model for lung cancer diagnosis; the best model achieved +state-of-the-art metrics of AUC 0.98 and 93.1% accuracy. Cluster analysis shows +the VAE latent space separates the dataset of malignant and benign lesions +based on meaningful feature components including tumour size, shape, patient +and malignancy class. We also include a comparative analysis of the standard +Gaussian VAE (GVAE) and the more recent Dirichlet VAE (DirVAE), which replaces +the prior with a Dirichlet distribution to encourage a more explainable latent +space with disentangled feature representation. Finally, we demonstrate the +potential for latent space traversals corresponding to clinically meaningful +feature changes. + +
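+
+ A compact Gaussian VAE sketch (architecture sizes are assumptions) showing the reparameterization step and the latent vectors that would be clustered and fed to a downstream MLP classifier.
+
+ ```python
+ # Minimal Gaussian VAE producing latent codes for downstream analysis.
+ import torch
+ import torch.nn as nn
+
+ class GaussianVAE(nn.Module):
+     def __init__(self, in_dim=64 * 64, latent_dim=32):
+         super().__init__()
+         self.enc = nn.Sequential(nn.Linear(in_dim, 512), nn.ReLU())
+         self.mu = nn.Linear(512, latent_dim)
+         self.logvar = nn.Linear(512, latent_dim)
+         self.dec = nn.Sequential(nn.Linear(latent_dim, 512), nn.ReLU(),
+                                  nn.Linear(512, in_dim), nn.Sigmoid())
+
+     def forward(self, x):
+         h = self.enc(x)
+         mu, logvar = self.mu(h), self.logvar(h)
+         z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)  # reparameterize
+         recon = self.dec(z)
+         kl = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
+         loss = nn.functional.mse_loss(recon, x) + kl
+         return z, loss
+
+ x = torch.rand(8, 64 * 64)     # flattened 2D lesion slices (toy data)
+ z, loss = GaussianVAE()(x)
+ print(z.shape, float(loss))    # latent vectors used for clustering/classification
+ ```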
+
+ comment: 10 pages (main paper), 5 pages (references), 5 figures, 2 tables, + work accepted for BMVC 2023 +
+
+
+
+
+ + ☆ SAM-6D: Segment Anything Model Meets Zero-Shot 6D Object Pose Estimation + + +
+ Zero-shot 6D object pose estimation involves the detection of novel objects +with their 6D poses in cluttered scenes, presenting significant challenges for +model generalizability. Fortunately, the recent Segment Anything Model (SAM) +has showcased remarkable zero-shot transfer performance, which provides a +promising solution to tackle this task. Motivated by this, we introduce SAM-6D, +a novel framework designed to realize the task through two steps, including +instance segmentation and pose estimation. Given the target objects, SAM-6D +employs two dedicated sub-networks, namely Instance Segmentation Model (ISM) +and Pose Estimation Model (PEM), to perform these steps on cluttered RGB-D +images. ISM takes SAM as an advanced starting point to generate all possible +object proposals and selectively preserves valid ones through meticulously +crafted object matching scores in terms of semantics, appearance and geometry. +By treating pose estimation as a partial-to-partial point matching problem, PEM +performs a two-stage point matching process featuring a novel design of +background tokens to construct dense 3D-3D correspondence, ultimately yielding +the pose estimates. Without bells and whistles, SAM-6D outperforms the existing +methods on the seven core datasets of the BOP Benchmark for both instance +segmentation and pose estimation of novel objects. + +
+
+ comment: Github Page: https://github.com/JiehongLin/SAM-6D +
+
+
+
+
+ + ☆ Model-agnostic Body Part Relevance Assessment for Pedestrian Detection + + +
+ Model-agnostic explanation methods for deep learning models are flexible +regarding usability and availability. However, because they can +only manipulate the input and observe changes in the output, they suffer from weak +performance when used with complex model architectures. For models with large +inputs, as in object detection for instance, sampling-based methods like +KernelSHAP are inefficient due to the many computation-heavy forward passes through +the model. In this work, we present a framework for using sampling-based +explanation models in a computer vision context by body part relevance +assessment for pedestrian detection. Furthermore, we introduce a novel +sampling-based method similar to KernelSHAP that shows more robustness for +lower sampling sizes and, thus, is more efficient for explainability analyses +on large-scale datasets. + +
+
+
+
+
+ + ☆ HAVE-FUN: Human Avatar Reconstruction from Few-Shot Unconstrained Images + + +
+ For human avatar reconstruction, contemporary techniques commonly +require the acquisition of costly data and struggle to achieve satisfactory +results from a small number of casual images. In this paper, we investigate +this task from a few-shot unconstrained photo album. The reconstruction of +human avatars from such data sources is challenging because of the limited data +amount and dynamic articulated poses. For handling dynamic data, we integrate a +skinning mechanism with deep marching tetrahedra (DMTet) to form a drivable +tetrahedral representation, which drives arbitrary mesh topologies generated by +the DMTet for the adaptation of unconstrained images. To effectively mine +instructive information from few-shot data, we devise a two-phase optimization +method with few-shot reference and few-shot guidance. The former focuses on +aligning avatar identity with reference images, while the latter aims to +generate plausible appearances for unseen regions. Overall, our framework, +called HaveFun, can undertake avatar reconstruction, rendering, and animation. +Extensive experiments on our developed benchmarks demonstrate that HaveFun +exhibits substantially superior performance in reconstructing the human body +and hand. Project website: https://seanchenxy.github.io/HaveFunWeb/. + +
+
+
+
+
+ + ☆ Deformation-Guided Unsupervised Non-Rigid Shape Matching + + +
+ We present an unsupervised data-driven approach for non-rigid shape matching. +Shape matching identifies correspondences between two shapes and is a +fundamental step in many computer vision and graphics applications. Our +approach is designed to be particularly robust when matching shapes digitized +using 3D scanners that contain fine geometric detail and suffer from different +types of noise including topological noise caused by the coalescence of +spatially close surface regions. We build on two strategies. First, using a +hierarchical patch based shape representation we match shapes consistently in a +coarse to fine manner, allowing for robustness to noise. This multi-scale +representation drastically reduces the dimensionality of the problem when +matching at the coarsest scale, rendering unsupervised learning feasible. +Second, we constrain this hierarchical matching to be reflected in 3D by +fitting a patch-wise near-rigid deformation model. Using this constraint, we +leverage spatial continuity at different scales to capture global shape +properties, resulting in matchings that generalize well to data with different +deformations and noise characteristics. Experiments demonstrate that our +approach obtains significantly better results on raw 3D scans than +state-of-the-art methods, while performing on-par on standard test scenarios. + +
+
+
+
+
+ + ☆ Technical Report for Argoverse Challenges on 4D Occupancy Forecasting + + +
+ This report presents our Le3DE2E_Occ solution for 4D Occupancy Forecasting in +Argoverse Challenges at CVPR 2023 Workshop on Autonomous Driving (WAD). Our +solution consists of a strong LiDAR-based Bird's Eye View (BEV) encoder with +temporal fusion and a two-stage decoder, which combines a DETR head and a UNet +decoder. The solution was tested on the Argoverse 2 sensor dataset to evaluate +the occupancy state 3 seconds in the future. Our solution achieved 18% lower L1 +Error (3.57) than the baseline and took 1st place on the 4D Occupancy +Forecasting task in Argoverse Challenges at CVPR 2023. + +
+
+
+
+
+ + ☆ Regularization by Texts for Latent Diffusion Inverse Solvers + + +
+ The recent advent of diffusion models has led to significant progress in +solving inverse problems, leveraging these models as effective generative +priors. Nonetheless, challenges related to the ill-posed nature of such +problems remain, often due to inherent ambiguities in measurements. Drawing +inspiration from the human ability to resolve visual ambiguities through +perceptual biases, here we introduce a novel latent diffusion inverse solver by +incorporating regularization by texts (TReg). Specifically, TReg applies a +textual description of the preconceived solution during the reverse +sampling phase, and this description is dynamically reinforced through +null-text optimization for adaptive negation. Our comprehensive experimental +results demonstrate that TReg successfully mitigates ambiguity in latent +diffusion inverse solvers, enhancing their effectiveness and accuracy. + +
+
+
+
+
+ + ☆ Enhancing Diffusion Models with Text-Encoder Reinforcement Learning + + +
+ Text-to-image diffusion models are typically trained to optimize the +log-likelihood objective, which presents challenges in meeting specific +requirements for downstream tasks, such as image aesthetics and image-text +alignment. Recent research addresses this issue by refining the diffusion U-Net +using human rewards through reinforcement learning or direct backpropagation. +However, many of them overlook the importance of the text encoder, which is +typically pretrained and fixed during training. In this paper, we demonstrate +that by finetuning the text encoder through reinforcement learning, we can +enhance the text-image alignment of the results, thereby improving the visual +quality. Our primary motivation comes from the observation that the current +text encoder is suboptimal, often requiring careful prompt adjustment. While +fine-tuning the U-Net can partially improve performance, it still suffers +from the suboptimal text encoder. Therefore, we propose to use reinforcement +learning with low-rank adaptation to finetune the text encoder based on +task-specific rewards, referred to as \textbf{TexForce}. We first show that +finetuning the text encoder can improve the performance of diffusion models. +Then, we illustrate that TexForce can be simply combined with existing U-Net +finetuned models to get much better results without additional training. +Finally, we showcase the adaptability of our method in diverse applications, +including the generation of high-quality face and hand images. + +
+
+
+
+
+ + ☆ Reinforcement Learning from Diffusion Feedback: Q* for Image Search + + +
+ Large vision-language models are steadily gaining personalization +capabilities at the cost of fine-tuning or data augmentation. We present two +models for image generation using model-agnostic learning that align semantic +priors with generative capabilities. RLDF, or Reinforcement Learning from +Diffusion Feedback, is a singular approach for visual imitation through +prior-preserving reward function guidance. This employs Q-learning (with +standard Q*) for generation and follows a semantic-rewarded trajectory for +image search through finite encoding-tailored actions. The second proposed +method, noisy diffusion gradient, is optimization driven. At the root of both +methods is a special CFG encoding that we propose for continual semantic +guidance. Using only a single input image and no text input, RLDF generates +high-quality images over varied domains including retail, sports and +agriculture showcasing class-consistency and strong visual diversity. Project +website is available at https://infernolia.github.io/RLDF. + +
+
+
+
+
+ + ☆ PaintNeSF: Artistic Creation of Stylized Scenes with Vectorized 3D + Strokes + + +
+ We present Paint Neural Stroke Field (PaintNeSF), a novel technique to +generate stylized images of a 3D scene at arbitrary novel views from multi-view +2D images. Different from existing methods which apply stylization to trained +neural radiance fields at the voxel level, our approach draws inspiration from +image-to-painting methods, simulating the progressive painting process of human +artwork with vector strokes. We develop a palette of stylized 3D strokes from +basic primitives and splines, and consider the 3D scene stylization task as a +multi-view reconstruction process based on these 3D stroke primitives. Instead +of directly searching for the parameters of these 3D strokes, which would be +too costly, we introduce a differentiable renderer that allows optimizing +stroke parameters using gradient descent, and propose a training scheme to +alleviate the vanishing gradient issue. The extensive evaluation demonstrates +that our approach effectively synthesizes 3D scenes with significant geometric +and aesthetic stylization while maintaining a consistent appearance across +different views. Our method can be further integrated with style loss and +image-text contrastive models to extend its applications, including color +transfer and text-driven 3D scene drawing. + +
+
+
+
+
+ + ☆ Only Positive Cases: 5-fold High-order Attention Interaction Model for + Skin Segmentation Derived Classification + + +
+ Computer-aided diagnosis of skin diseases is an important tool. However, the +interpretability of computer-aided diagnosis is currently poor. Dermatologists +and patients cannot intuitively understand the learning and prediction process +of neural networks, which will lead to a decrease in the credibility of +computer-aided diagnosis. In addition, traditional methods need to be trained +using negative samples in order to predict the presence or absence of a lesion, +but medical data is often in short supply. In this paper, we propose a multiple +high-order attention interaction model (MHA-UNet) for use in a highly +explainable skin lesion segmentation task. MHA-UNet is able to obtain the +presence or absence of a lesion by explainable reasoning without the need for +training on negative samples. Specifically, we propose a high-order attention +interaction mechanism that introduces squeeze attention to a higher level for +feature attention. In addition, a multiple high-order attention interaction +(MHAblock) module is proposed by combining the different features of different +orders. For classifying the presence or absence of lesions, we conducted +classification experiments on several publicly available datasets in the +absence of negative samples, based on explainable reasoning about the +interaction of 5 attention orders of MHAblock. The highest positive detection +rate obtained from the experiments was 81.0% and the highest negative detection +rate was 83.5%. For segmentation experiments, comparison experiments of the +proposed method with 13 medical segmentation models and external validation +experiments with 8 state-of-the-art models in three public datasets and our +clinical dataset demonstrate the state-of-the-art performance of our model. The +code is available from https://github.com/wurenkai/MHA-UNet. + +
+
+
+
+
+ + ☆ Align before Adapt: Leveraging Entity-to-Region Alignments for + Generalizable Video Action Recognition + + +
+ Large-scale visual-language pre-trained models have achieved significant +success in various video tasks. However, most existing methods follow an "adapt +then align" paradigm, which adapts pre-trained image encoders to model +video-level representations and utilizes one-hot or text embedding of the +action labels for supervision. This paradigm overlooks the challenge of mapping +from static images to complicated activity concepts. In this paper, we propose +a novel "Align before Adapt" (ALT) paradigm. Prior to adapting to video +representation learning, we exploit the entity-to-region alignments for each +frame. The alignments are fulfilled by matching the region-aware image +embeddings to an offline-constructed text corpus. With the aligned entities, we +feed their text embeddings to a transformer-based video adapter as the queries, +which can help extract the semantics of the most important entities from a +video to a vector. This paradigm reuses the visual-language alignment of VLP +during adaptation and tries to explain an action by the underlying entities. +This helps understand actions by bridging the gap with complex activity +semantics, particularly when facing unfamiliar or unseen categories. ALT +achieves competitive performance and superior generalizability while requiring +significantly low computational costs. In fully supervised scenarios, it +achieves 88.1% top-1 accuracy on Kinetics-400 with only 4947 GFLOPs. In 2-shot +experiments, ALT outperforms the previous state-of-the-art by 7.1% and 9.2% on +HMDB-51 and UCF-101, respectively. + +
+
+
+
+
+ + ☆ Technical Report for Argoverse Challenges on Unified Sensor-based + Detection, Tracking, and Forecasting + + +
+ This report presents our Le3DE2E solution for unified sensor-based detection, +tracking, and forecasting in Argoverse Challenges at CVPR 2023 Workshop on +Autonomous Driving (WAD). We propose a unified network that incorporates three +tasks, including detection, tracking, and forecasting. This solution adopts a +strong Bird's Eye View (BEV) encoder with spatial and temporal fusion and +generates unified representations for multiple tasks. The solution was tested on +the Argoverse 2 sensor dataset to evaluate the detection, tracking, and +forecasting of 26 object categories. We achieved 1st place in Detection, +Tracking, and Forecasting on the E2E Forecasting track in Argoverse Challenges +at CVPR 2023 WAD. + +
+
+
+
+
+ + ☆ A manometric feature descriptor with linear-SVM to distinguish + esophageal contraction vigor + + +
+ In clinical practice, if a patient presents with nonmechanical obstructive dysphagia, +esophageal chest pain, and gastroesophageal reflux symptoms, the physician +will usually assess esophageal dynamic function. High-resolution manometry +(HRM) is a clinically common technique for comprehensive and objective detection of esophageal +dynamic function. However, after the results of +HRM are obtained, doctors still need to evaluate a variety of parameters. +This work is burdensome, and the process is complex. We conducted image +processing of HRM to predict esophageal contraction vigor and assist the +evaluation of esophageal dynamic function. Firstly, we used Feature Extraction +and Histogram of Gradients (FE-HOG) to analyze features of the proposal of swallow +(PoS) and further extract higher-order features. Then we classify the +esophageal contraction vigor as normal, weak, or failed using a +linear SVM on these features. Our data set includes 3000 training +samples, 500 validation samples and 411 test samples. After verification, our accuracy +reaches 86.83%, which is higher than other common machine learning methods. + +
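+
+ A rough sketch of an FE-HOG plus linear-SVM pipeline on synthetic stand-in data (real proposal-of-swallow crops from HRM images would replace the random arrays; the HOG parameters are assumptions).
+
+ ```python
+ # Illustrative HOG feature extraction + linear SVM classification.
+ import numpy as np
+ from skimage.feature import hog
+ from sklearn.svm import LinearSVC
+
+ rng = np.random.default_rng(0)
+ labels = ["normal", "weak", "failed"]
+
+ def extract(img):
+     return hog(img, orientations=9, pixels_per_cell=(8, 8),
+                cells_per_block=(2, 2), feature_vector=True)
+
+ # Synthetic stand-ins for proposal-of-swallow (PoS) crops.
+ X = np.stack([extract(rng.random((64, 64))) for _ in range(60)])
+ y = rng.integers(0, 3, size=60)
+
+ clf = LinearSVC(max_iter=10000).fit(X, y)
+ print("predicted vigor:", labels[int(clf.predict(X[:1])[0])])
+ ```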
+
+
+
+
+ + ☆ Spatially Covariant Image Registration with Text Prompts + + +
+ Medical images are often characterized by their structured anatomical +representations and spatially inhomogeneous contrasts. Leveraging anatomical +priors in neural networks can greatly enhance their utility in +resource-constrained clinical settings. Prior research has harnessed such +information for image segmentation, yet progress in deformable image +registration has been modest. Our work introduces textSCF, a novel method that +integrates spatially covariant filters and textual anatomical prompts encoded +by visual-language models, to fill this gap. This approach optimizes an +implicit function that correlates text embeddings of anatomical regions to +filter weights, relaxing the typical translation-invariance constraint of +convolutional operations. TextSCF not only boosts computational efficiency but +can also retain or improve registration accuracy. By capturing the contextual +interplay between anatomical regions, it offers impressive inter-regional +transferability and the ability to preserve structural discontinuities during +registration. TextSCF's performance has been rigorously tested on inter-subject +brain MRI and abdominal CT registration tasks, outperforming existing +state-of-the-art models in the MICCAI Learn2Reg 2021 challenge and leading the +leaderboard. In abdominal registrations, textSCF's larger model variant +improved the Dice score by 11.3% over the second-best model, while its smaller +variant maintained similar accuracy but with an 89.13% reduction in network +parameters and a 98.34% decrease in computational operations. + +
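+
+ A conceptual sketch of spatially covariant, text-conditioned filters (all shapes and names are assumptions, not the textSCF implementation): an implicit function maps text embeddings of anatomical regions to per-region 1x1 filter weights, so the effective filter varies with the region label map rather than being translation-invariant.
+
+ ```python
+ # Toy text-conditioned, region-wise filtering of a feature map.
+ import torch
+ import torch.nn as nn
+
+ class TextConditionedFilters(nn.Module):
+     def __init__(self, text_dim=512, in_ch=32, out_ch=32):
+         super().__init__()
+         self.in_ch, self.out_ch = in_ch, out_ch
+         self.to_weight = nn.Linear(text_dim, out_ch * in_ch)  # 1x1 filters
+
+     def forward(self, feats, region_map, text_emb):
+         # feats: (B, C, H, W); region_map: (B, H, W) int labels; text_emb: (R, text_dim)
+         w = self.to_weight(text_emb).view(-1, self.out_ch, self.in_ch)  # (R, Cout, Cin)
+         out = torch.zeros(feats.size(0), self.out_ch, *feats.shape[2:])
+         for r in range(text_emb.size(0)):
+             mask = (region_map == r).unsqueeze(1).float()               # (B,1,H,W)
+             filtered = torch.einsum("bchw,oc->bohw", feats, w[r])
+             out = out + filtered * mask
+         return out
+
+ feats = torch.randn(1, 32, 48, 48)
+ regions = torch.randint(0, 4, (1, 48, 48))
+ text_emb = torch.randn(4, 512)   # e.g., embeddings of region names (assumed)
+ print(TextConditionedFilters()(feats, regions, text_emb).shape)
+ ```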
+
+ comment: 15 pages, 8 figures, 5 tables +
+
+
+
+
+ + ☆ 2D Feature Distillation for Weakly- and Semi-Supervised 3D Semantic + Segmentation WACV 2024 + + +
+ As 3D perception problems grow in popularity and the need for large-scale +labeled datasets for LiDAR semantic segmentation increases, new methods arise +that aim to reduce the necessity for dense annotations by employing +weakly-supervised training. However, these methods continue to show weak +boundary estimation and high false negative rates for small objects and distant +sparse regions. We argue that such weaknesses can be compensated for by using RGB +images, which provide a denser representation of the scene. We propose an +image-guidance network (IGNet) which builds upon the idea of distilling high +level feature information from a domain adapted synthetically trained 2D +semantic segmentation network. We further utilize a one-way contrastive +learning scheme alongside a novel mixing strategy called FOVMix, to combat the +horizontal field-of-view mismatch between the two sensors and enhance the +effects of image guidance. IGNet achieves state-of-the-art results for +weakly-supervised LiDAR semantic segmentation on ScribbleKITTI, boasting up to +98% relative performance to fully supervised training with only 8% labeled +points, while introducing no additional annotation burden or +computational/memory cost during inference. Furthermore, we show that our +contributions also prove effective for semi-supervised training, where IGNet +claims state-of-the-art results on both ScribbleKITTI and SemanticKITTI. + +
+
+ comment: Accepted at WACV 2024 +
+
+
+
+
+ + ☆ UniRepLKNet: A Universal Perception Large-Kernel ConvNet for Audio, + Video, Point Cloud, Time-Series and Image Recognition + + +
+ Large-kernel convolutional neural networks (ConvNets) have recently received +extensive research attention, but there are two unresolved and critical issues +that demand further investigation. 1) The architectures of existing +large-kernel ConvNets largely follow the design principles of conventional +ConvNets or transformers, while the architectural design for large-kernel +ConvNets remains under-addressed. 2) As transformers have dominated multiple +modalities, it remains to be investigated whether ConvNets also have a strong +universal perception ability in domains beyond vision. In this paper, we +contribute from two aspects. 1) We propose four architectural guidelines for +designing large-kernel ConvNets, the core of which is to exploit the essential +characteristics of large kernels that distinguish them from small kernels - +they can see wide without going deep. Following such guidelines, our proposed +large-kernel ConvNet shows leading performance in image recognition. For +example, our models achieve an ImageNet accuracy of 88.0%, ADE20K mIoU of +55.6%, and COCO box AP of 56.4%, demonstrating better performance and higher +speed than a number of recently proposed powerful competitors. 2) We discover +that large kernels are the key to unlocking the exceptional performance of +ConvNets in domains where they were originally not proficient. With certain +modality-related preprocessing approaches, the proposed model achieves +state-of-the-art performance on time-series forecasting and audio recognition +tasks even without modality-specific customization to the architecture. Code +and all the models at https://github.com/AILab-CVC/UniRepLKNet. + +
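+
+ A hedged sketch of a large-kernel depthwise block in the spirit of "seeing wide without going deep"; the exact UniRepLKNet block design, kernel size, and normalization should be taken from the released code linked above.
+
+ ```python
+ # Illustrative large-kernel depthwise block with a residual connection.
+ import torch
+ import torch.nn as nn
+
+ class LargeKernelBlock(nn.Module):
+     def __init__(self, dim, kernel_size=13):
+         super().__init__()
+         self.dw = nn.Conv2d(dim, dim, kernel_size, padding=kernel_size // 2,
+                             groups=dim)          # depthwise large kernel
+         self.norm = nn.BatchNorm2d(dim)
+         self.pw = nn.Sequential(nn.Conv2d(dim, 4 * dim, 1), nn.GELU(),
+                                 nn.Conv2d(4 * dim, dim, 1))
+
+     def forward(self, x):
+         return x + self.pw(self.norm(self.dw(x)))  # residual connection
+
+ x = torch.randn(1, 64, 56, 56)
+ print(LargeKernelBlock(64)(x).shape)   # torch.Size([1, 64, 56, 56])
+ ```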
+
+ comment: Code, all the models and reproducible training scripts at + https://github.com/AILab-CVC/UniRepLKNet +
+
+
+
+
+ + ☆ Can Vision-Language Models Think from a First-Person Perspective? + + +
+ Vision-language models (VLMs) have recently shown promising results in +traditional downstream tasks. Evaluation studies have emerged to assess their +abilities, with the majority focusing on the third-person perspective, and only +a few addressing specific tasks from the first-person perspective. However, the +capability of VLMs to "think" from a first-person perspective, a crucial +attribute for advancing autonomous agents and robotics, remains largely +unexplored. To bridge this research gap, we introduce EgoThink, a novel visual +question-answering benchmark that encompasses six core capabilities with twelve +detailed dimensions. The benchmark is constructed using selected clips from +egocentric videos, with manually annotated question-answer pairs containing +first-person information. To comprehensively assess VLMs, we evaluate eighteen +popular VLMs on EgoThink. Moreover, given the open-ended format of the answers, +we use GPT-4 as the automatic judge to compute single-answer grading. +Experimental results indicate that although GPT-4V leads in numerous +dimensions, all evaluated VLMs still possess considerable potential for +improvement in first-person perspective tasks. Meanwhile, enlarging the number +of trainable parameters has the most significant impact on model performance on +EgoThink. In conclusion, EgoThink serves as a valuable addition to existing +evaluation benchmarks for VLMs, providing an indispensable resource for future +research in the realm of embodied artificial intelligence and robotics. + +
+
+
+
+
+ + ☆ An Ensemble of 2.5D ResUnet Based Models for Segmentation for Kidney and + Masses + + +
+ The automatic segmentation of kidney, kidney tumor and kidney cyst on +Computed Tomography (CT) scans is a challenging task due to the indistinct +lesion boundaries and fuzzy texture. Considering the large range and unbalanced +distribution of CT scans' thickness, a 2.5D ResUnet is adopted to build an +efficient coarse-to-fine semantic segmentation framework in this work. A set of +489 CT scans is used for training and validation, and an independent, +never-before-used set of CT scans for testing. Finally, we demonstrate the +effectiveness of our proposed method. The dice values on the test set are 0.954, +0.792, 0.691, and the surface dice values are 0.897, 0.591, 0.541 for kidney, tumor +and cyst, respectively. The average inference time of each CT scan is 20.65s +and the max GPU memory is 3525MB. The results suggest that our method achieves a good trade-off +between model performance and efficiency. + +
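+
+ The volumetric Dice reported above can be computed as in the short sketch below (the surface Dice additionally requires boundary distance maps and is omitted; the toy masks are illustrative).
+
+ ```python
+ # Per-class volumetric Dice on label volumes.
+ import numpy as np
+
+ def dice(pred: np.ndarray, gt: np.ndarray, label: int) -> float:
+     p, g = pred == label, gt == label
+     denom = p.sum() + g.sum()
+     return 1.0 if denom == 0 else 2.0 * np.logical_and(p, g).sum() / denom
+
+ pred = np.random.default_rng(0).integers(0, 4, size=(8, 64, 64))  # toy masks
+ gt = pred.copy()
+ gt[:, :8] = 0                                                     # perturb
+ for label, name in [(1, "kidney"), (2, "tumor"), (3, "cyst")]:
+     print(name, round(dice(pred, gt, label), 3))
+ ```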
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ☆ A deep learning approach for marine snow synthesis and removal + + +
+ Marine snow, the floating particles in underwater images, severely degrades +the visibility and performance of human and machine vision systems. This paper +proposes a novel method to reduce the marine snow interference using deep +learning techniques. We first synthesize realistic marine snow samples by +training a Generative Adversarial Network (GAN) model and combine them with +natural underwater images to create a paired dataset. We then train a U-Net +model to perform marine snow removal as an image-to-image translation task. Our +experiments show that the U-Net model can effectively remove both synthetic and +natural marine snow with high accuracy, outperforming state-of-the-art methods +such as the Median filter and its adaptive variant. We also demonstrate the +robustness of our method by testing it on the MSRB dataset, which contains +synthetic artifacts that our model has not seen during training. Our method is +a practical and efficient solution for enhancing underwater images affected by +marine snow. + +
+
+
+
+
+ + ☆ Real Time GAZED: Online Shot Selection and Editing of Virtual Cameras + from Wide-Angle Monocular Video Recordings + + +
+ Eliminating time-consuming post-production processes and delivering +high-quality videos in today's fast-paced digital landscape are the key +advantages of real-time approaches. To address these needs, we present Real +Time GAZED: a real-time adaptation of the GAZED framework integrated with +CineFilter, a novel real-time camera trajectory stabilization approach. It +enables users to create professionally edited videos in real-time. Comparative +evaluations against baseline methods, including the non-real-time GAZED, +demonstrate that Real Time GAZED achieves similar editing results, ensuring +high-quality video output. Furthermore, a user study confirms the aesthetic +quality of the video edits produced by the Real Time GAZED approach. With these +advancements in real-time camera trajectory optimization and video editing +presented, the demand for immediate and dynamic content creation in industries +such as live broadcasting, sports coverage, news reporting, and social media +content creation can be met more efficiently. + +
+
+
+
+
+ + ☆ EucliDreamer: Fast and High-Quality Texturing for 3D Models with Stable + Diffusion Depth + + +
+ This paper presents a novel method to generate textures for 3D models given +text prompts and 3D meshes. Additional depth information is taken into account +to perform the Score Distillation Sampling (SDS) process [28] with depth +conditional Stable Diffusion [34]. We ran our model over the open-source +dataset Objaverse [7] and conducted a user study to compare the results with +those of various 3D texturing methods. We have shown that our model can +generate more satisfactory results and produce various art styles for the same +object. In addition, we achieved faster time when generating textures of +comparable quality. We also conduct thorough ablation studies of how different +factors may affect generation quality, including sampling steps, guidance +scale, negative prompts, data augmentation, elevation range, and alternatives +to SDS. + +
+
+
+
+
+ + ☆ Video-based Visible-Infrared Person Re-Identification with Auxiliary + Samples + + +
+ Visible-infrared person re-identification (VI-ReID) aims to match persons +captured by visible and infrared cameras, allowing person retrieval and +tracking in 24-hour surveillance systems. Previous methods focus on learning +from cross-modality person images in different cameras. However, temporal +information and single-camera samples tend to be neglected. To crack this nut, +in this paper, we first contribute a large-scale VI-ReID dataset named +BUPTCampus. Different from most existing VI-ReID datasets, it 1) collects +tracklets instead of images to introduce rich temporal information, 2) contains +pixel-aligned cross-modality sample pairs for better modality-invariant +learning, 3) provides one auxiliary set to help enhance the optimization, in +which each identity only appears in a single camera. Based on our constructed +dataset, we present a two-stream framework as baseline and apply Generative +Adversarial Network (GAN) to narrow the gap between the two modalities. To +exploit the advantages introduced by the auxiliary set, we propose a curriculum +learning based strategy to jointly learn from both primary and auxiliary sets. +Moreover, we design a novel temporal k-reciprocal re-ranking method to refine +the ranking list with fine-grained temporal correlation cues. Experimental +results demonstrate the effectiveness of the proposed methods. We also +reproduce 9 state-of-the-art image-based and video-based VI-ReID methods on +BUPTCampus and our methods show substantial superiority to them. The codes and +dataset are available at: https://github.com/dyhBUPT/BUPTCampus. + +
+
+ comment: Accepted by Transactions on Information Forensics & Security 2023 +
+
+
+
+
+ + ☆ UFDA: Universal Federated Domain Adaptation with Practical Assumptions AAAI2024 + + +
+ Conventional Federated Domain Adaptation (FDA) approaches usually demand an +abundance of assumptions, such as label set consistency, which makes them +significantly less feasible for real-world situations and introduces security +hazards. In this work, we propose a more practical scenario named Universal +Federated Domain Adaptation (UFDA). It only requires the black-box model and +the label set information of each source domain, while the label sets of +different source domains could be inconsistent and the target-domain label set +is totally blind. This relaxes the assumptions made by FDA, which are often +challenging to meet in real-world cases and diminish model security. To address +the UFDA scenario, we propose a corresponding framework called Hot-Learning +with Contrastive Label Disambiguation (HCLD), which tackles UFDA's domain +shift and category gap problems by using one-hot outputs from the black-box +models of various source domains. Moreover, to better distinguish the shared +and unknown classes, we further present a cluster-level strategy named +Mutual-Voting Decision (MVD) to extract robust consensus knowledge across peer +classes from both source and target domains. Extensive experiments on three +benchmarks demonstrate that our HCLD achieves comparable performance for our +UFDA scenario with much fewer assumptions, compared to the previous +methodologies with many additional assumptions. + +
+
+ comment: Submitted to AAAI2024 +
+
+
+
+
+ + ☆ Improving Adaptability and Generalizability of Efficient Transfer + Learning for Vision-Language Models + + +
+ Vision-Language Models (VLMs) like CLIP have demonstrated remarkable +applicability across a variety of downstream tasks, including zero-shot image +classification. Recently, the use of prompts or adapters for efficient transfer +learning has gained significant attention for effectively adapting to +downstream tasks. However, the roles of vision and text prompts, as well as +adapters in terms of generalization and transfer difficulty, have been +overlooked, limiting performance on unseen tasks. In this paper, we empirically +analyze how VLMs behave when using vision and text prompts, adapters, and a +combination of these components, marking a novel exploration by our study. Our +observations find that utilizing vision prompts for class separability and text +adapters for task adaptation is crucial for adaptability and generalizability. +Moreover, to improve generalization across every domain, we propose an adaptive +ensemble method that effectively combines the general knowledge of VLMs with +task-specific knowledge according to transfer difficulty. Upon experimenting +with extensive benchmarks, our method consistently outperforms all baselines, +particularly on unseen tasks, demonstrating the effectiveness of our proposed +approach. + +
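+<p>
+The abstract does not give the exact weighting rule of the adaptive ensemble; a
+minimal sketch, assuming a scalar transfer-difficulty estimate in [0, 1] and an
+illustrative sigmoid weighting, might look like this.
+</p>
+<pre><code>
+import torch
+
+def adaptive_ensemble(zero_shot_logits, adapted_logits, difficulty):
+    """Blend general (zero-shot VLM) and task-specific (adapted) predictions.
+
+    Harder transfers lean more on the general zero-shot knowledge, easier
+    transfers trust the task-adapted branch; the mapping below is illustrative,
+    not the paper's rule.
+    """
+    w = torch.sigmoid(torch.tensor(4.0 * (difficulty - 0.5)))
+    return w * zero_shot_logits + (1.0 - w) * adapted_logits
+</code></pre>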
+
+ comment: 11 pages (19 pages including supplementary), 10 figures (12 figures + including supplementary), 6 tables (17 tables including supplementary) +
+
+
+
+
+ + ☆ Fully Authentic Visual Question Answering Dataset from Online + Communities + + +
+ Visual Question Answering (VQA) entails answering questions about images. We +introduce the first VQA dataset in which all contents originate from an +authentic use case. Sourced from online question answering community forums, we +call it VQAonline. We then characterize our dataset and how it relates to eight +other VQA datasets. Observing that answers in our dataset tend to be much +longer (e.g., with a mean of 173 words) and thus incompatible with standard VQA +evaluation metrics, we next analyze which of the six popular metrics for longer +text evaluation align best with human judgments. We then use the best-suited +metrics to evaluate six state-of-the-art vision and language foundation models +on VQAonline and reveal where they struggle most. We will release the dataset +soon to facilitate future extensions. + +
+
+
+
+
+ + ☆ ET3D: Efficient Text-to-3D Generation via Multi-View Distillation + + +
+ Recent breakthroughs in text-to-image generation have shown encouraging
+results via large generative models. Due to the scarcity of 3D assets, it is
+hard to transfer the success of text-to-image generation to that of text-to-3D
+generation. Existing text-to-3D generation methods usually adopt the paradigm
+of DreamFusion, which conducts per-asset optimization by distilling a
+pretrained text-to-image diffusion model. The generation speed usually ranges
+from several minutes to tens of minutes per 3D asset, which degrades the user
+experience and also imposes a burden on service providers due to the high
+computational budget.
+ In this work, we present an efficient text-to-3D generation method, which
+requires only around 8 ms to generate a 3D asset given the text prompt on a
+consumer graphics card. The main insight is that we exploit the images generated
+by a large pre-trained text-to-image diffusion model to supervise the training
+of a text-conditioned 3D generative adversarial network. Once the network is
+trained, we are able to efficiently generate a 3D asset via a single forward
+pass. Our method requires no 3D training data and provides an alternative
+approach for efficient text-to-3D generation by distilling pre-trained image
+diffusion models.
+
+
+
+
+
+ + ☆ PKU-I2IQA: An Image-to-Image Quality Assessment Database for AI + Generated Images + + +
+ With the development of image generation technology, AI-based image
+generation has been applied in various fields. However, the development of AIGC
+image generative models also brings new problems and challenges. A significant
+challenge is that AI-generated images (AIGIs) may have unique distortions
+compared to natural images, and not all generated images meet the requirements
+of the real world, so it is of great significance to evaluate AI-generated
+images more comprehensively. Although previous work has established some human
+perception-based AIGC image quality assessment databases for text-generated
+images, AI image generation technology includes scenarios such as text-to-image
+and image-to-image, and assessing only the images generated by text-to-image
+models is insufficient. To address this issue, we have established a human
+perception-based image-to-image AIGC image quality assessment database, named
+PKU-I2IQA. We conducted a comprehensive analysis of the PKU-I2IQA database.
+Furthermore, we introduced two benchmark models: NR-AIGCIQA based on
+no-reference image quality assessment and FR-AIGCIQA based on full-reference
+image quality assessment. Finally, leveraging this database, we conducted
+benchmark experiments and compared the performance of the proposed benchmark
+models. The PKU-I2IQA database and benchmarks will be released to facilitate
+future research at https://github.com/jiquan123/I2IQA.
+ Keywords: AIGC, image-to-image generation, image quality assessment,
+NR-AIGCIQA, FR-AIGCIQA
+
+
+ comment: 18 pages +
+
+
+
+
+ + ☆ Instruct2Attack: Language-Guided Semantic Adversarial Attacks + + +
+ We propose Instruct2Attack (I2A), a language-guided semantic attack that +generates semantically meaningful perturbations according to free-form language +instructions. We make use of state-of-the-art latent diffusion models, where we +adversarially guide the reverse diffusion process to search for an adversarial +latent code conditioned on the input image and text instruction. Compared to +existing noise-based and semantic attacks, I2A generates more natural and +diverse adversarial examples while providing better controllability and +interpretability. We further automate the attack process with GPT-4 to generate +diverse image-specific text instructions. We show that I2A can successfully +break state-of-the-art deep neural networks even under strong adversarial +defenses, and demonstrate great transferability among a variety of network +architectures. + +
+
+ comment: under submission, code coming soon +
+
+
+
+
+ + ☆ Dataset Distillation in Latent Space + + +
+ Dataset distillation (DD) is a newly emerging research area aiming at
+alleviating the heavy computational load of training models on large datasets.
+It tries to distill a large dataset into a small, condensed one so that models
+trained on the distilled dataset perform comparably to those trained on the
+full dataset on downstream tasks. Among the previous works in this area, three
+key problems hinder the performance and applicability of existing DD methods:
+high time complexity, high space complexity, and low info-compactness. In this
+work, we attempt to settle these three problems simultaneously by moving the DD
+processes from the conventionally used pixel space to latent space. Encoded by
+a pretrained generic autoencoder, latent codes in the latent space are naturally
+info-compact representations of the original images at much smaller sizes.
+After transferring three mainstream DD algorithms to latent space, we
+significantly reduce time and space consumption while achieving similar
+performance, allowing us to distill high-resolution datasets or target greater
+data ratios at which previous methods have failed. Moreover, within the same
+storage budget, we can also store more latent codes than pixel-level images,
+which further boosts the performance of our methods.
+
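+<p>
+As a loose illustration of what "distilling in latent space" means in practice,
+the sketch below optimizes synthetic latent codes so that their features match
+those of real latents per class. The mean-feature matching objective and the
+encoder_feat interface are generic stand-ins, not the specific algorithms
+transferred in the paper.
+</p>
+<pre><code>
+import torch
+
+def distill_in_latent_space(real_latents_by_class, encoder_feat, ipc=10, steps=1000, lr=0.1):
+    """Learn ipc synthetic latent codes per class inside a frozen autoencoder's latent space."""
+    synthetic = {
+        c: lat[torch.randperm(lat.shape[0])[:ipc]].clone().requires_grad_(True)
+        for c, lat in real_latents_by_class.items()
+    }
+    opt = torch.optim.SGD(list(synthetic.values()), lr=lr, momentum=0.5)
+    for _ in range(steps):
+        loss = 0.0
+        for c, real in real_latents_by_class.items():
+            idx = torch.randperm(real.shape[0])[:256]    # mini-batch of real latents
+            diff = encoder_feat(real[idx]).mean(0) - encoder_feat(synthetic[c]).mean(0)
+            loss = loss + diff.pow(2).sum()              # match per-class mean features
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+    # Decoding the learned codes with the autoencoder's decoder yields the distilled images.
+    return {c: z.detach() for c, z in synthetic.items()}
+</code></pre>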
+
+ comment: Under review +
+
+
+
+
+ + ☆ Beyond Pixels: Exploring Human-Readable SVG Generation for Simple Images + with Vision Language Models + + +
+ In the field of computer graphics, the use of vector graphics, particularly
+Scalable Vector Graphics (SVG), represents a notable development from
+traditional pixel-based imagery. SVGs, with their XML-based format, are
+distinct in their ability to directly and explicitly represent visual elements
+such as shape, color, and path. This direct representation facilitates a more
+accurate and logical depiction of graphical elements, enhancing reasoning and
+interpretability. Recognizing the potential of SVGs, the machine learning
+community has introduced multiple methods for image vectorization. However,
+transforming images into SVG format while retaining the relational properties
+and context of the original scene remains a key challenge. Most vectorization
+methods yield SVGs that are overly complex and not easily interpretable. In
+response to this challenge, we introduce our method, Simple-SVG-Generation
+(S²VG²). Our method focuses on producing SVGs that are both accurate and
+simple, aligning with human readability and understanding. With simple images,
+we evaluate our method on reasoning tasks together with advanced language
+models; the results show a clear improvement over previous SVG generation
+methods. We also conducted surveys for human evaluation of the readability of
+our generated SVGs; the results also favor our method.
+
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ EAFP-Med: An Efficient Adaptive Feature Processing Module Based on + Prompts for Medical Image Detection + + +
+ In the face of rapid advances in medical imaging, cross-domain adaptive +medical image detection is challenging due to the differences in lesion +representations across various medical imaging technologies. To address this +issue, we draw inspiration from large language models to propose EAFP-Med, an +efficient adaptive feature processing module based on prompts for medical image +detection. EAFP-Med can efficiently extract lesion features of different scales +from a diverse range of medical images based on prompts while being flexible +and not limited by specific imaging techniques. Furthermore, it serves as a +feature preprocessing module that can be connected to any model front-end to +enhance the lesion features in input images. Moreover, we propose a novel +adaptive disease detection model named EAFP-Med ST, which utilizes the Swin +Transformer V2 - Tiny (SwinV2-T) as its backbone and connects it to EAFP-Med. +We have compared our method to nine state-of-the-art methods. Experimental +results demonstrate that EAFP-Med ST achieves the best performance on all three +datasets (chest X-ray images, cranial magnetic resonance imaging images, and +skin images). EAFP-Med can efficiently extract lesion features from various +medical images based on prompts, enhancing the model's performance. This holds +significant potential for improving medical image analysis and diagnosis. + +
+
+
+
+
+ + ☆ SED: A Simple Encoder-Decoder for Open-Vocabulary Semantic Segmentation + + +
+ Open-vocabulary semantic segmentation strives to distinguish pixels into
+different semantic groups from an open set of categories. Most existing methods
+explore utilizing pre-trained vision-language models, in which the key is to
+adopt the image-level model for the pixel-level segmentation task. In this
+paper, we propose a simple encoder-decoder, named SED, for open-vocabulary
+semantic segmentation, which comprises a hierarchical encoder-based cost map
+generation and a gradual fusion decoder with category early rejection. The
+hierarchical encoder-based cost map generation employs a hierarchical backbone,
+instead of a plain transformer, to predict the pixel-level image-text cost map.
+Compared to a plain transformer, a hierarchical backbone better captures local
+spatial information and has linear computational complexity with respect to
+input size. Our gradual fusion decoder employs a top-down structure to combine
+the cost map and the feature maps of different backbone levels for
+segmentation. To accelerate inference, we introduce a category early rejection
+scheme in the decoder that rejects many non-existent categories at the early
+layers of the decoder, resulting in up to 4.7 times acceleration without
+accuracy degradation. Experiments are performed on multiple open-vocabulary
+semantic segmentation datasets, demonstrating the efficacy of our SED method.
+When using ConvNeXt-B, our SED method achieves an mIoU score of 31.6% on ADE20K
+with 150 categories at 82 milliseconds per image on a single A6000. We will
+release the code at https://github.com/xb534/SED.git.
+
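+<p>
+As a rough sketch of the two ingredients named above, the snippet below builds
+a pixel-text cost map as cosine similarity between backbone pixel features and
+text embeddings, and then keeps only the top-scoring categories as a simplified
+stand-in for early rejection (the paper rejects categories inside the decoder,
+which is not reproduced here).
+</p>
+<pre><code>
+import torch
+import torch.nn.functional as F
+
+def cost_map_with_early_rejection(pixel_feats, text_embs, keep_top=32):
+    """pixel_feats: (B, C, H, W) backbone features; text_embs: (K, C) category embeddings."""
+    p = F.normalize(pixel_feats, dim=1)
+    t = F.normalize(text_embs, dim=1)
+    cost = torch.einsum("bchw,kc->bkhw", p, t)           # cosine similarity per pixel and category
+    scores = cost.amax(dim=(2, 3))                       # best response per category
+    keep = scores.topk(min(keep_top, t.shape[0]), dim=1).indices
+    idx = keep[:, :, None, None].expand(-1, -1, cost.shape[2], cost.shape[3])
+    return torch.gather(cost, 1, idx), keep              # reduced cost map + surviving category ids
+</code></pre>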
+
+
+
+
+ + ☆ SVRDA: A Web-based Dataset Annotation Tool for Slice-to-Volume + Registration + + +
+ Background and Objective: The lack of benchmark datasets has impeded the
+development of slice-to-volume registration algorithms. Such datasets are
+difficult to annotate, primarily due to the dimensional difference within data
+and the dearth of task-specific software. We aim to develop a user-friendly
+tool to streamline dataset annotation for slice-to-volume registration.
+ Methods: The proposed tool, named SVRDA, is an installation-free web
+application for platform-agnostic collaborative dataset annotation. It enables
+efficient transformation manipulation via keyboard shortcuts and smooth case
+transitions with auto-saving. SVRDA supports configuration-based data loading
+and adheres to the separation of concerns, offering great flexibility and
+extensibility for future research. Various supplementary features have been
+implemented to facilitate slice-to-volume registration.
+ Results: We validated the effectiveness of SVRDA by indirectly evaluating the
+post-registration segmentation quality on UK Biobank data, observing a dramatic
+overall improvement (24.02% in the Dice Similarity Coefficient and 48.93% in
+the 95th percentile Hausdorff distance, respectively) supported by highly
+statistically significant evidence (p < 0.001). We further showcased the
+clinical usage of SVRDA by integrating it into test-retest T1 quantification on
+in-house magnetic resonance images, leading to more consistent results after
+registration.
+ Conclusions: SVRDA can facilitate collaborative annotation of benchmark
+datasets while being potentially applicable to other pipelines incorporating
+slice-to-volume registration. Full source code and documentation are available
+at https://github.com/Roldbach/SVRDA.
+
+
+ comment: 18 pages, 11 figures, In submission to Computer Methods and Programs + in Biomedicine +
+
+
+
+
+ + ☆ Efficient Dataset Distillation via Minimax Diffusion + + +
+ Dataset distillation reduces the storage and computational consumption of +training a network by generating a small surrogate dataset that encapsulates +rich information of the original large-scale one. However, previous +distillation methods heavily rely on the sample-wise iterative optimization +scheme. As the images-per-class (IPC) setting or image resolution grows larger, +the necessary computation will demand overwhelming time and resources. In this +work, we intend to incorporate generative diffusion techniques for computing +the surrogate dataset. Observing that key factors for constructing an effective +surrogate dataset are representativeness and diversity, we design additional +minimax criteria in the generative training to enhance these facets for the +generated images of diffusion models. We present a theoretical model of the +process as hierarchical diffusion control demonstrating the flexibility of the +diffusion process to target these criteria without jeopardizing the +faithfulness of the sample to the desired distribution. The proposed method +achieves state-of-the-art validation performance while demanding much less +computational resources. Under the 100-IPC setting on ImageWoof, our method +requires less than one-twentieth the distillation time of previous methods, yet +yields even better performance. Source code available in +https://github.com/vimar-gu/MinimaxDiffusion. + +
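+<p>
+The exact minimax criteria are not given in the abstract; as a generic
+illustration, representativeness and diversity terms over features of generated
+images could be written as below (the feature extractor and the specific
+formulation are assumptions, not the paper's).
+</p>
+<pre><code>
+import torch
+import torch.nn.functional as F
+
+def minimax_criteria(gen_feats, real_feats):
+    """gen_feats: (M, D) generated-image features; real_feats: (N, D) real features of the class."""
+    g = F.normalize(gen_feats, dim=1)
+    r = F.normalize(real_feats, dim=1)
+    # Representativeness: every real sample should have a nearby generated sample.
+    representativeness = 1.0 - (g @ r.t()).max(dim=0).values.mean()
+    # Diversity: penalize the most similar pair among generated samples.
+    sim = g @ g.t() - 2.0 * torch.eye(g.shape[0], device=g.device)   # mask the diagonal
+    diversity_penalty = sim.max(dim=1).values.mean()
+    return representativeness + diversity_penalty
+</code></pre>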
+
+
+
+
+ + ☆ Sparse Pedestrian Character Learning for Trajectory Prediction + + +
+ Pedestrian trajectory prediction in a first-person view has recently
+attracted much attention due to its importance in autonomous driving. Recent
+work utilizes pedestrian character information, i.e., action and appearance, to
+improve the learned trajectory embedding and achieves state-of-the-art
+performance. However, it neglects the invalid and negative pedestrian character
+information, which is harmful to the trajectory representation and thus leads
+to performance degradation. To address this issue, we present a two-stream
+sparse-character-based network (TSNet) for pedestrian trajectory prediction.
+Specifically, TSNet learns the negative-removed characters in the sparse
+character representation stream to improve the trajectory embedding obtained in
+the trajectory representation stream. Moreover, to model the negative-removed
+characters, we propose a novel sparse character graph, including sparse
+category and sparse temporal character graphs, to learn the different effects
+of various characters in the category and temporal dimensions, respectively.
+Extensive experiments on two first-person view datasets, PIE and JAAD, show
+that our method outperforms existing state-of-the-art methods. In addition,
+ablation studies demonstrate the different effects of various characters and
+show that TSNet outperforms approaches that do not eliminate negative
+characters.
+
+
+
+
+
+ + ☆ CaesarNeRF: Calibrated Semantic Representation for Few-shot + Generalizable Neural Rendering + + +
+ Generalizability and few-shot learning are key challenges in Neural Radiance +Fields (NeRF), often due to the lack of a holistic understanding in pixel-level +rendering. We introduce CaesarNeRF, an end-to-end approach that leverages +scene-level CAlibratEd SemAntic Representation along with pixel-level +representations to advance few-shot, generalizable neural rendering, +facilitating a holistic understanding without compromising high-quality +details. CaesarNeRF explicitly models pose differences of reference views to +combine scene-level semantic representations, providing a calibrated holistic +understanding. This calibration process aligns various viewpoints with precise +location and is further enhanced by sequential refinement to capture varying +details. Extensive experiments on public datasets, including LLFF, Shiny, +mip-NeRF 360, and MVImgNet, show that CaesarNeRF delivers state-of-the-art +performance across varying numbers of reference views, proving effective even +with a single reference image. The project page of this work can be found at +https://haidongz-usc.github.io/project/caesarnerf. + +
+
+
+
+
+ + ☆ Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning + and Optimization Functions for Enhanced Precision + + +
+ Image registration has traditionally been done using two distinct approaches:
+learning-based methods, relying on robust deep neural networks, and
+optimization-based methods, applying complex mathematical transformations to
+warp images accordingly. Both paradigms offer advantages and disadvantages,
+and, in this work, we seek to combine their respective strengths into a single
+streamlined framework, using the outputs of the learning-based method as
+initial parameters for optimization while prioritizing computational power for
+the image pairs that incur the greatest loss. Our investigations showed an
+improvement of 0.3% in testing when utilizing the best-performing
+state-of-the-art model as the backbone of the framework, while maintaining the
+same inference time and incurring only a 0.8% loss in deformation field
+smoothness.
+
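+<p>
+A minimal sketch of the described hybrid loop follows: a pretrained network
+provides the initial deformation field, and only the pairs with the largest
+residual loss receive extra iterative refinement. The model, similarity_loss,
+and warp interfaces are placeholders, and the budget heuristic is illustrative.
+</p>
+<pre><code>
+import torch
+
+def hybrid_register(pairs, model, similarity_loss, warp, refine_steps=20, lr=0.05, budget=0.3):
+    """pairs: list of (moving, fixed) images; returns one deformation field per pair."""
+    with torch.no_grad():
+        fields = [model(m, f) for m, f in pairs]
+        losses = [similarity_loss(warp(m, phi), f).item()
+                  for (m, f), phi in zip(pairs, fields)]
+
+    # Spend the optimization budget on the pairs with the largest residual loss.
+    order = sorted(range(len(pairs)), key=lambda i: losses[i], reverse=True)
+    hard = set(order[: int(budget * len(pairs))])
+
+    refined = []
+    for i, ((m, f), phi) in enumerate(zip(pairs, fields)):
+        if i in hard:
+            phi = phi.clone().requires_grad_(True)       # network output as initialization
+            opt = torch.optim.Adam([phi], lr=lr)
+            for _ in range(refine_steps):
+                loss = similarity_loss(warp(m, phi), f)
+                opt.zero_grad()
+                loss.backward()
+                opt.step()
+        refined.append(phi.detach())
+    return refined
+</code></pre>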
+
+
+
+
+ + ☆ AerialBooth: Mutual Information Guidance for Text Controlled Aerial View + Synthesis from a Single Image + + +
+ We present a novel method, AerialBooth, for synthesizing the aerial view from
+a single input image using its text description. We leverage the pretrained
+text-to-2D-image Stable Diffusion model as prior knowledge of the 3D world. The
+model is finetuned in two steps to optimize for the text embedding and the UNet
+that reconstruct the input image and its inverse perspective mapping,
+respectively. The inverse perspective mapping creates variance within the
+text-image space of the diffusion model, while providing weak guidance for
+aerial view synthesis. At inference, we steer the contents of the generated
+image towards the input image using novel mutual information guidance that
+maximizes the information content between the probability distributions of the
+two images. We evaluate our approach on a wide spectrum of real and synthetic
+data, including natural scenes, indoor scenes, human action, etc. Through
+extensive experiments and ablation studies, we demonstrate the effectiveness of
+AerialBooth and also its generalizability to other text-controlled views. We
+also show that AerialBooth achieves the best viewpoint-fidelity trade-off
+through quantitative evaluation on 7 metrics analyzing viewpoint and fidelity
+w.r.t. the input image. Code and data are available at
+https://github.com/divyakraman/AerialBooth2023.
+
+
+
+
+
+ + ☆ DreamCreature: Crafting Photorealistic Virtual Creatures from + Imagination + + +
+ Recent text-to-image (T2I) generative models allow for high-quality synthesis +following either text instructions or visual examples. Despite their +capabilities, these models face limitations in creating new, detailed creatures +within specific categories (e.g., virtual dog or bird species), which are +valuable in digital asset creation and biodiversity analysis. To bridge this +gap, we introduce a novel task, Virtual Creatures Generation: Given a set of +unlabeled images of the target concepts (e.g., 200 bird species), we aim to +train a T2I model capable of creating new, hybrid concepts within diverse +backgrounds and contexts. We propose a new method called DreamCreature, which +identifies and extracts the underlying sub-concepts (e.g., body parts of a +specific species) in an unsupervised manner. The T2I thus adapts to generate +novel concepts (e.g., new bird species) with faithful structures and +photorealistic appearance by seamlessly and flexibly composing learned +sub-concepts. To enhance sub-concept fidelity and disentanglement, we extend +the textual inversion technique by incorporating an additional projector and +tailored attention loss regularization. Extensive experiments on two +fine-grained image benchmarks demonstrate the superiority of DreamCreature over +prior methods in both qualitative and quantitative evaluation. Ultimately, the +learned sub-concepts facilitate diverse creative applications, including +innovative consumer product designs and nuanced property modifications. + +
+
+ comment: Website: https://github.com/kamwoh/dreamcreature +
+
+
+
+
+ + ☆ MeshGPT: Generating Triangle Meshes with Decoder-Only Transformers + + +
+ We introduce MeshGPT, a new approach for generating triangle meshes that +reflects the compactness typical of artist-created meshes, in contrast to dense +triangle meshes extracted by iso-surfacing methods from neural fields. Inspired +by recent advances in powerful large language models, we adopt a sequence-based +approach to autoregressively generate triangle meshes as sequences of +triangles. We first learn a vocabulary of latent quantized embeddings, using +graph convolutions, which inform these embeddings of the local mesh geometry +and topology. These embeddings are sequenced and decoded into triangles by a +decoder, ensuring that they can effectively reconstruct the mesh. A transformer +is then trained on this learned vocabulary to predict the index of the next +embedding given previous embeddings. Once trained, our model can be +autoregressively sampled to generate new triangle meshes, directly generating +compact meshes with sharp edges, more closely imitating the efficient +triangulation patterns of human-crafted meshes. MeshGPT demonstrates a notable +improvement over state of the art mesh generation methods, with a 9% increase +in shape coverage and a 30-point enhancement in FID scores across various +categories. + +
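+<p>
+The abstract above describes the inference procedure at a high level; a
+decoder-only sampling loop over the learned mesh-token vocabulary could be
+sketched as follows. The transformer call signature and the special BOS/EOS
+token ids are assumptions for illustration, and decoding tokens back into
+triangles is left to the learned vocabulary decoder.
+</p>
+<pre><code>
+import torch
+
+@torch.no_grad()
+def sample_mesh_tokens(transformer, bos_id, eos_id, max_len=2400, temperature=1.0):
+    """Autoregressively sample a sequence of quantized mesh tokens."""
+    tokens = torch.tensor([[bos_id]])
+    for _ in range(max_len):
+        logits = transformer(tokens)[:, -1, :] / temperature   # next-token distribution
+        next_tok = torch.multinomial(torch.softmax(logits, dim=-1), 1)
+        tokens = torch.cat([tokens, next_tok], dim=1)
+        if next_tok.item() == eos_id:                          # mesh sequence finished
+            break
+    return tokens[:, 1:]                                       # drop the BOS token
+</code></pre>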
+
+ comment: Project Page: https://nihalsid.github.io/mesh-gpt/, Video: + https://youtu.be/UV90O1_69_o +
+
+
+
+
+ + ♻ ☆ FutureHuman3D: Forecasting Complex Long-Term 3D Human Behavior from + Video Observations + + +
+ We present a generative approach to forecast long-term future human behavior +in 3D, requiring only weak supervision from readily available 2D human action +data. This is a fundamental task enabling many downstream applications. The +required ground-truth data is hard to capture in 3D (mocap suits, expensive +setups) but easy to acquire in 2D (simple RGB cameras). Thus, we design our +method to only require 2D RGB data while being able to generate 3D human motion +sequences. We use a differentiable 2D projection scheme in an autoregressive +manner for weak supervision, and an adversarial loss for 3D regularization. Our +method predicts long and complex behavior sequences (e.g. cooking, assembly) +consisting of multiple sub-actions. We tackle this in a semantically +hierarchical manner, jointly predicting high-level coarse action labels +together with their low-level fine-grained realizations as characteristic 3D +human poses. We observe that these two action representations are coupled in +nature, and joint prediction benefits both action and pose forecasting. Our +experiments demonstrate the complementary nature of joint action and 3D pose +prediction: our joint approach outperforms each task treated individually, +enables robust longer-term sequence prediction, and outperforms alternative +approaches to forecast actions and characteristic 3D poses. + +
+
+ comment: Project Page: https://future-human-3d.christian-diller.de/ Video: + https://www.youtube.com/watch?v=18du85YFXL0 +
+
+
+
+
+ + ♻ ☆ Self-Guided Diffusion Models CVPR 2023 + + +
+ Diffusion models have demonstrated remarkable progress in image generation +quality, especially when guidance is used to control the generative process. +However, guidance requires a large amount of image-annotation pairs for +training and is thus dependent on their availability, correctness and +unbiasedness. In this paper, we eliminate the need for such annotation by +instead leveraging the flexibility of self-supervision signals to design a +framework for self-guided diffusion models. By leveraging a feature extraction +function and a self-annotation function, our method provides guidance signals +at various image granularities: from the level of holistic images to object +boxes and even segmentation masks. Our experiments on single-label and +multi-label image datasets demonstrate that self-labeled guidance always +outperforms diffusion models without guidance and may even surpass guidance +based on ground-truth labels, especially on unbalanced data. When equipped with +self-supervised box or mask proposals, our method further generates visually +diverse yet semantically consistent images, without the need for any class, +box, or segment label annotation. Self-guided diffusion is simple, flexible and +expected to profit from deployment at scale. Source code will be at: +https://taohu.me/sgdm/ + +
+
+ comment: CVPR 2023 +
+
+
+
+
+ + ♻ ☆ What If the TV Was Off? Examining Counterfactual Reasoning Abilities of + Multi-modal Language Models + + +
+ Counterfactual reasoning, a fundamental aspect of human cognition, involves +contemplating alternatives to established facts or past events, significantly +enhancing our abilities in planning and decision-making. In light of the +advancements in current multi-modal large language models, we explore their +effectiveness in counterfactual reasoning. To facilitate this investigation, we +introduce a novel dataset, C-VQA, specifically designed to test the +counterfactual reasoning capabilities of modern multi-modal large language +models. This dataset is constructed by infusing original questions with +counterfactual presuppositions, spanning various types such as numerical and +boolean queries. It encompasses a mix of real and synthetic data, representing +a wide range of difficulty levels. Our thorough evaluations of contemporary +vision-language models using this dataset have revealed substantial performance +drops, with some models showing up to a 40\% decrease, highlighting a +significant gap between current models and human-like vision reasoning +capabilities. We hope our dataset will serve as a vital benchmark for +evaluating the counterfactual reasoning capabilities of models. Code and +dataset are publicly available at https://bzhao.me/C-VQA/. + +
+
+
+
+
+ + ♻ ☆ UFOGen: You Forward Once Large Scale Text-to-Image Generation via + Diffusion GANs + + +
+ Text-to-image diffusion models have demonstrated remarkable capabilities in +transforming textual prompts into coherent images, yet the computational cost +of their inference remains a persistent challenge. To address this issue, we +present UFOGen, a novel generative model designed for ultra-fast, one-step +text-to-image synthesis. In contrast to conventional approaches that focus on +improving samplers or employing distillation techniques for diffusion models, +UFOGen adopts a hybrid methodology, integrating diffusion models with a GAN +objective. Leveraging a newly introduced diffusion-GAN objective and +initialization with pre-trained diffusion models, UFOGen excels in efficiently +generating high-quality images conditioned on textual descriptions in a single +step. Beyond traditional text-to-image generation, UFOGen showcases versatility +in applications. Notably, UFOGen stands among the pioneering models enabling +one-step text-to-image generation and diverse downstream tasks, presenting a +significant advancement in the landscape of efficient generative models. + +
+
+
+
+
+ + ♻ ☆ AST: Effective Dataset Distillation through Alignment with Smooth and + High-Quality Expert Trajectories + + +
+ Training large AI models typically requires large-scale datasets in the +machine learning process, making training and parameter-tuning process both +time-consuming and costly. Some researchers address this problem by carefully +synthesizing a very small number of highly representative and informative +samples from real-world datasets. This approach, known as Dataset Distillation +(DD), proposes a perspective for data-efficient learning. Despite recent +progress in this field, the performance of existing methods still cannot meet +expectations, and distilled datasets cannot effectively replace original +datasets. In this paper, unlike previous methods that focus solely on improving +the effectiveness of student distillation, we recognize and leverage the +important mutual influence between expert and student models. We observed that +the smoothness of expert trajectories has a significant impact on subsequent +student parameter alignment. Based on this, we propose an effective DD +framework named AST, standing for Alignment with Smooth and high-quality expert +Trajectories. We devise the integration of clipping loss and gradient penalty +to regulate the rate of parameter changes in expert trajectory generation. To +further refine the student parameter alignment with expert trajectory, we put +forward representative initialization for the synthetic dataset and balanced +inner-loop loss in response to the sensitivity exhibited towards randomly +initialized variables during distillation. We also propose two enhancement +strategies, namely intermediate matching loss and weight perturbation, to +mitigate the potential occurrence of cumulative errors. We conduct extensive +experiments on datasets of different scales, sizes, and resolutions. The +results demonstrate that the proposed method significantly outperforms prior +methods. + +
+
+
+
+
+ + ♻ ☆ Applications of Large Scale Foundation Models for Autonomous Driving + + +
+ Since the DARPA Grand Challenges (rural) in 2004/05 and the Urban Challenge in
+2007, autonomous driving has been the most active field of AI applications.
+Recently, powered by large language models (LLMs), chat systems such as ChatGPT
+and PaLM have emerged and rapidly become a promising direction for achieving
+artificial general intelligence (AGI) in natural language processing (NLP). It
+is therefore natural to ask whether these abilities could be employed to
+reformulate autonomous driving. By combining LLMs with foundation models, it is
+possible to utilize human knowledge, commonsense, and reasoning to rebuild
+autonomous driving systems and move beyond the current long-tailed AI dilemma.
+In this paper, we investigate the techniques of foundation models and LLMs
+applied to autonomous driving, categorized as simulation, world models, data
+annotation, and planning or end-to-end (E2E) solutions.
+
+
+ comment: 22 pages. arXiv admin note: text overlap with arXiv:2304.03589 by + other authors +
+
+
+
+
+ + ♻ ☆ From Isolated Islands to Pangea: Unifying Semantic Space for Human + Action Understanding + + +
+ Action understanding is a vital step toward intelligent agents and has
+attracted long-term attention. It can be formulated as a mapping from the
+physical action space to the semantic space. Typically, researchers have built
+action datasets according to idiosyncratic choices of class definitions to push
+the envelope of their respective benchmarks. Thus, datasets are incompatible
+with each other like "isolated islands" due to semantic gaps and various class
+granularities, e.g., "do housework" in dataset A and "wash plate" in dataset B.
+We argue that a more principled semantic space is urgently needed to
+concentrate community efforts and enable us to use all datasets together to
+pursue generalizable action learning. To this end, we design a structured
+action semantic space based on the verb taxonomy hierarchy and covering massive
+actions. By aligning the classes of previous datasets to our semantic space, we
+gather (image/video/skeleton/MoCap) datasets into a unified database with a
+unified label system, i.e., bridging the "isolated islands" into a "Pangea".
+Accordingly, we propose a novel model mapping from the physical space to the
+semantic space to fully exploit Pangea. In extensive experiments, our new
+system shows significant superiority, especially in transfer learning. Code and
+data will be made publicly available.
+
+
+ comment: Project Webpage: https://mvig-rhos.com/pangea +
+
+
+
+
+ + ♻ ☆ ENIGMA-51: Towards a Fine-Grained Understanding of Human-Object + Interactions in Industrial Scenarios + + +
+ ENIGMA-51 is a new egocentric dataset acquired in an industrial scenario by
+19 subjects who followed instructions to complete the repair of electrical
+boards using industrial tools (e.g., an electric screwdriver) and equipment
+(e.g., an oscilloscope). The 51 egocentric video sequences are densely
+annotated with a rich set of labels that enable the systematic study of human
+behavior in the industrial domain. We provide benchmarks on four tasks related
+to human behavior: 1) untrimmed temporal detection of human-object
+interactions, 2) egocentric human-object interaction detection, 3) short-term
+object interaction anticipation, and 4) natural language understanding of
+intents and entities. Baseline results show that the ENIGMA-51 dataset poses a
+challenging benchmark for studying human behavior in industrial scenarios. We
+publicly release the dataset at https://iplab.dmi.unict.it/ENIGMA-51.
+
+
+
+
+
+ + ♻ ☆ The Chosen One: Consistent Characters in Text-to-Image Diffusion Models + + +
+ Recent advances in text-to-image generation models have unlocked vast +potential for visual creativity. However, these models struggle with generation +of consistent characters, a crucial aspect for numerous real-world applications +such as story visualization, game development asset design, advertising, and +more. Current methods typically rely on multiple pre-existing images of the +target character or involve labor-intensive manual processes. In this work, we +propose a fully automated solution for consistent character generation, with +the sole input being a text prompt. We introduce an iterative procedure that, +at each stage, identifies a coherent set of images sharing a similar identity +and extracts a more consistent identity from this set. Our quantitative +analysis demonstrates that our method strikes a better balance between prompt +alignment and identity consistency compared to the baseline methods, and these +findings are reinforced by a user study. To conclude, we showcase several +practical applications of our approach. Project page is available at +https://omriavrahami.com/the-chosen-one + +
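+<p>
+One stage of the iterative procedure described above (identify the most
+coherent subset of generated images) could be approximated by clustering
+identity embeddings and keeping the tightest cluster, as in the sketch below;
+the embedding network, the use of KMeans, and the cohesion score are
+illustrative assumptions rather than the paper's exact recipe.
+</p>
+<pre><code>
+import numpy as np
+from sklearn.cluster import KMeans
+
+def most_cohesive_identity(embeddings, n_clusters=5):
+    """embeddings: (N, D) identity embeddings of images generated from one prompt."""
+    feats = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
+    labels = KMeans(n_clusters=n_clusters, n_init=10).fit_predict(feats)
+    best_idx, best_cohesion = None, -np.inf
+    for c in range(n_clusters):
+        members = feats[labels == c]
+        if members.shape[0] >= 2:                                # skip singleton clusters
+            cohesion = float((members @ members.mean(0)).mean()) # mean cosine to the centroid
+            if cohesion > best_cohesion:
+                best_idx, best_cohesion = np.where(labels == c)[0], cohesion
+    return best_idx   # indices of the images used to refine the identity in the next round
+</code></pre>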
+
+ comment: Project page is available at https://omriavrahami.com/the-chosen-one +
+
+
+
+
+ + ♻ ☆ LLM-driven Multimodal Target Volume Contouring in Radiation Oncology + + +
+ Target volume contouring for radiation therapy is considered significantly
+more challenging than normal organ segmentation tasks, as it necessitates the
+utilization of both image and text-based clinical information. Inspired by the
+recent advancement of large language models (LLMs) that can facilitate the
+integration of textual information and images, here we present a novel
+LLM-driven multi-modal AI that utilizes clinical text information and is
+applicable to the challenging task of target volume contouring for radiation
+therapy, and we validate it within the context of breast cancer radiation
+therapy target volume contouring. Using external validation and
+data-insufficient environments, attributes that are highly conducive to
+real-world applications, we demonstrate that the proposed model exhibits
+markedly improved performance compared to conventional vision-only AI models,
+particularly exhibiting robust generalization performance and data efficiency.
+To the best of our knowledge, this is the first LLM-driven multimodal AI model
+that integrates clinical text information into target volume delineation for
+radiation oncology.
+
+
+
+
+
+ + ♻ ☆ NEURAL MARIONETTE: A Transformer-based Multi-action Human Motion + Synthesis System + + +
+ We present a neural network-based system for long-term, multi-action human +motion synthesis. The system, dubbed as NEURAL MARIONETTE, can produce +high-quality and meaningful motions with smooth transitions from simple user +input, including a sequence of action tags with expected action duration, and +optionally a hand-drawn moving trajectory if the user specifies. The core of +our system is a novel Transformer-based motion generation model, namely +MARIONET, which can generate diverse motions given action tags. Different from +existing motion generation models, MARIONET utilizes contextual information +from the past motion clip and future action tag, dedicated to generating +actions that can smoothly blend historical and future actions. Specifically, +MARIONET first encodes target action tag and contextual information into an +action-level latent code. The code is unfolded into frame-level control signals +via a time unrolling module, which could be then combined with other +frame-level control signals like the target trajectory. Motion frames are then +generated in an auto-regressive way. By sequentially applying MARIONET, the +system NEURAL MARIONETTE can robustly generate long-term, multi-action motions +with the help of two simple schemes, namely "Shadow Start" and "Action +Revision". Along with the novel system, we also present a new dataset dedicated +to the multi-action motion synthesis task, which contains both action tags and +their contextual information. Extensive experiments are conducted to study the +action accuracy, naturalism, and transition smoothness of the motions generated +by our system. + +
+
+
+
+
+ + ♻ ☆ 3DGAUnet: 3D generative adversarial networks with a 3D U-Net based + generator to achieve the accurate and effective synthesis of clinical tumor + image data for pancreatic cancer + + +
+ Pancreatic ductal adenocarcinoma (PDAC) presents a critical global health +challenge, and early detection is crucial for improving the 5-year survival +rate. Recent medical imaging and computational algorithm advances offer +potential solutions for early diagnosis. Deep learning, particularly in the +form of convolutional neural networks (CNNs), has demonstrated success in +medical image analysis tasks, including classification and segmentation. +However, the limited availability of clinical data for training purposes +continues to provide a significant obstacle. Data augmentation, generative +adversarial networks (GANs), and cross-validation are potential techniques to +address this limitation and improve model performance, but effective solutions +are still rare for 3D PDAC, where contrast is especially poor owing to the high +heterogeneity in both tumor and background tissues. In this study, we developed +a new GAN-based model, named 3DGAUnet, for generating realistic 3D CT images of +PDAC tumors and pancreatic tissue, which can generate the interslice connection +data that the existing 2D CT image synthesis models lack. Our innovation is to +develop a 3D U-Net architecture for the generator to improve shape and texture +learning for PDAC tumors and pancreatic tissue. Our approach offers a promising +path to tackle the urgent requirement for creative and synergistic methods to +combat PDAC. The development of this GAN-based model has the potential to +alleviate data scarcity issues, elevate the quality of synthesized data, and +thereby facilitate the progression of deep learning models to enhance the +accuracy and early detection of PDAC tumors, which could profoundly impact +patient outcomes. Furthermore, this model has the potential to be adapted to +other types of solid tumors, hence making significant contributions to the +field of medical imaging in terms of image processing models. + +
+
+ comment: Published on Cancers: Shi, Yu, Hannah Tang, Michael J. Baine, Michael + A. Hollingsworth, Huijing Du, Dandan Zheng, Chi Zhang, and Hongfeng Yu. 2023. + "3DGAUnet: 3D Generative Adversarial Networks with a 3D U-Net Based Generator + to Achieve the Accurate and Effective Synthesis of Clinical Tumor Image Data + for Pancreatic Cancer" Cancers 15, no. 23: 5496 +
+
+
+
+
+ + ♻ ☆ CALICO: Self-Supervised Camera-LiDAR Contrastive Pre-training for BEV + Perception + + +
+ Perception is crucial in the realm of autonomous driving systems, where +bird's eye view (BEV)-based architectures have recently reached +state-of-the-art performance. The desirability of self-supervised +representation learning stems from the expensive and laborious process of +annotating 2D and 3D data. Although previous research has investigated +pretraining methods for both LiDAR and camera-based 3D object detection, a +unified pretraining framework for multimodal BEV perception is missing. In this +study, we introduce CALICO, a novel framework that applies contrastive +objectives to both LiDAR and camera backbones. Specifically, CALICO +incorporates two stages: point-region contrast (PRC) and region-aware +distillation (RAD). PRC better balances the region- and scene-level +representation learning on the LiDAR modality and offers significant +performance improvement compared to existing methods. RAD effectively achieves +contrastive distillation on our self-trained teacher model. CALICO's efficacy +is substantiated by extensive evaluations on 3D object detection and BEV map +segmentation tasks, where it delivers significant performance improvements. +Notably, CALICO outperforms the baseline method by 10.5% and 8.6% on NDS and +mAP. Moreover, CALICO boosts the robustness of multimodal 3D object detection +against adversarial attacks and corruption. Additionally, our framework can be +tailored to different backbones and heads, positioning it as a promising +approach for multimodal BEV perception. + +
+
+
+
+
+ + ♻ ☆ Perceptual Assessment and Optimization of High Dynamic Range Image + Rendering + + +
+ The increasing popularity of high dynamic range (HDR) imaging stems from its
+ability to faithfully capture luminance levels in natural scenes. However, HDR
+image quality assessment has been insufficiently addressed. Existing models are
+mostly designed for low dynamic range (LDR) images, which correlate poorly with
+human perception of HDR image quality. To fill this gap, we propose a family of
+HDR quality metrics by transferring recent advancements in the LDR domain. The
+key step in our approach is to employ a simple inverse display model to
+decompose an HDR image into a stack of LDR images with varying exposures.
+Subsequently, these LDR images are evaluated using state-of-the-art LDR quality
+metrics. Our family of HDR quality models offers three notable advantages.
+First, specific exposures (i.e., luminance ranges) can be weighted to emphasize
+their assessment when calculating the overall quality score. Second, our HDR
+quality metrics directly inherit the capabilities of their base LDR quality
+models in assessing LDR images. Third, our metrics do not rely on human
+perceptual data of HDR image quality for re-calibration. Experiments conducted
+on four human-rated HDR image quality datasets indicate that our HDR quality
+metrics consistently outperform existing methods, including the HDR-VDP family.
+Furthermore, we demonstrate the promise of our models in the perceptual
+optimization of HDR novel view synthesis.
+
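+<p>
+A minimal sketch of the recipe described above follows: decompose the HDR image
+into an LDR exposure stack via a simple inverse display model, score each
+exposure with an off-the-shelf LDR metric, and combine the scores with exposure
+weights. The exposure-plus-gamma mapping used here is an illustrative stand-in
+for the paper's inverse display model, and ldr_metric is any callable
+full-reference LDR quality metric.
+</p>
+<pre><code>
+import numpy as np
+
+def hdr_quality(hdr_ref, hdr_test, ldr_metric, exposures=(-2, 0, 2), weights=None, gamma=2.2):
+    """hdr_ref, hdr_test: linear-luminance HDR images on the same scale."""
+    weights = np.ones(len(exposures)) if weights is None else np.asarray(weights, float)
+    scores = []
+    for ev in exposures:
+        scale = 2.0 ** ev                                   # re-expose the linear HDR values
+        ldr_ref = np.clip(hdr_ref * scale, 0, 1) ** (1.0 / gamma)
+        ldr_test = np.clip(hdr_test * scale, 0, 1) ** (1.0 / gamma)
+        scores.append(ldr_metric(ldr_ref, ldr_test))        # reuse an existing LDR metric
+    return float(np.average(scores, weights=weights))       # exposure-weighted overall score
+</code></pre>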
+
+
+
+
+ + ♻ ☆ AI-Generated Images Introduce Invisible Relevance Bias to Text-Image + Retrieval + + +
+ With the advancement of generation models, AI-generated content (AIGC) is +becoming more realistic, flooding the Internet. A recent study suggests that +this phenomenon has elevated the issue of source bias in text retrieval for web +searches. Specifically, neural retrieval models tend to rank generated texts +higher than human-written texts. In this paper, we extend the study of this +bias to cross-modal retrieval. Firstly, we successfully construct a suitable +benchmark to explore the existence of the bias. Subsequent extensive +experiments on this benchmark reveal that AI-generated images introduce an +invisible relevance bias to text-image retrieval models. Specifically, our +experiments show that text-image retrieval models tend to rank the AI-generated +images higher than the real images, even though the AI-generated images do not +exhibit more visually relevant features to the query than real images. This +invisible relevance bias is prevalent across retrieval models with varying +training data and architectures. Furthermore, our subsequent exploration +reveals that the inclusion of AI-generated images in the training data of the +retrieval models exacerbates the invisible relevance bias. The above phenomenon +triggers a vicious cycle, which makes the invisible relevance bias become more +and more serious. To elucidate the potential causes of invisible relevance and +address the aforementioned issues, we introduce an effective training method +aimed at alleviating the invisible relevance bias. Subsequently, we apply our +proposed debiasing method to retroactively identify the causes of invisible +relevance, revealing that the AI-generated images induce the image encoder to +embed additional information into their representation. This information +exhibits a certain consistency across generated images with different semantics +and can make the retriever estimate a higher relevance score. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ DynamiCrafter: Animating Open-domain Images with Video Diffusion Priors + + +
+ Animating a still image offers an engaging visual experience. Traditional +image animation techniques mainly focus on animating natural scenes with +stochastic dynamics (e.g. clouds and fluid) or domain-specific motions (e.g. +human hair or body motions), and thus limits their applicability to more +general visual content. To overcome this limitation, we explore the synthesis +of dynamic content for open-domain images, converting them into animated +videos. The key idea is to utilize the motion prior of text-to-video diffusion +models by incorporating the image into the generative process as guidance. +Given an image, we first project it into a text-aligned rich context +representation space using a query transformer, which facilitates the video +model to digest the image content in a compatible fashion. However, some visual +details still struggle to be preserved in the resultant videos. To supplement +with more precise image information, we further feed the full image to the +diffusion model by concatenating it with the initial noises. Experimental +results show that our proposed method can produce visually convincing and more +logical & natural motions, as well as higher conformity to the input image. +Comparative evaluation demonstrates the notable superiority of our approach +over existing competitors. + +
+
+ comment: Project page: https://doubiiu.github.io/projects/DynamiCrafter +
+
+
+
+
+ + ♻ ☆ Efficient Perception, Planning, and Control Algorithms for Vision-Based + Automated Vehicles + + +
+ Autonomous vehicles have limited computational resources; hence, their +control systems must be efficient. The cost and size of sensors have limited +the development of self-driving cars. To overcome these restrictions, this +study proposes an efficient framework for the operation of vision-based +automatic vehicles; the framework requires only a monocular camera and a few +inexpensive radars. The proposed algorithm comprises a multi-task UNet (MTUNet) +network for extracting image features and constrained iterative linear +quadratic regulator (CILQR) and vision predictive control (VPC) modules for +rapid motion planning and control. MTUNet is designed to simultaneously solve +lane line segmentation, the ego vehicle's heading angle regression, road type +classification, and traffic object detection tasks at approximately 40 FPS +(frames per second) for 228 x 228 pixel RGB input images. The CILQR controllers +then use the MTUNet outputs and radar data as inputs to produce driving +commands for lateral and longitudinal vehicle guidance within only 1 ms. In +particular, the VPC algorithm is included to reduce steering command latency to +below actuator latency to prevent self-driving vehicle performance degradation +during tight turns. The VPC algorithm uses road curvature data from MTUNet to +estimate the correction of the current steering angle at a look-ahead point to +adjust the turning amount. Including the VPC algorithm in a VPC-CILQR +controller on curvy roads leads to higher performance than CILQR alone. Our +experiments demonstrate that the proposed autonomous driving system, which does +not require high-definition maps, could be applied in current autonomous +vehicles. + +
+
+ comment: 10 figures, 13 pages +
+
+
+
+
+ + ♻ ☆ Continual Test-time Domain Adaptation via Dynamic Sample Selection + + +
+ The objective of Continual Test-time Domain Adaptation (CTDA) is to gradually
+adapt a pre-trained model to a sequence of target domains without accessing the
+source data. This paper proposes a Dynamic Sample Selection (DSS) method for
+CTDA. DSS consists of dynamic thresholding, positive learning, and negative
+learning processes. Traditionally, models learn from unlabeled data in unknown
+environments and rely equally on all samples' pseudo-labels to update their
+parameters through self-training. However, these pseudo-labels contain noisy
+predictions, so not all samples are equally trustworthy. Therefore, in our
+method, a dynamic thresholding module is first designed to separate suspected
+low-quality samples from high-quality ones. The selected low-quality samples
+are more likely to be wrongly predicted. We then apply joint positive and
+negative learning on both high- and low-quality samples to reduce the risk of
+using wrong information. We conduct extensive experiments that demonstrate the
+effectiveness of our proposed method for CTDA in the image domain,
+outperforming state-of-the-art results. Furthermore, our approach is also
+evaluated in the 3D point cloud domain, showcasing its versatility and
+potential for broader applicability.
+
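+<p>
+A minimal sketch of the selection-plus-learning idea follows: high-confidence
+samples are trained on their pseudo-labels (positive learning), while
+low-confidence samples are pushed away from an unlikely class (a simple form of
+negative learning). A fixed threshold stands in for the dynamic thresholding
+module, and the complementary-label loss is an illustrative choice.
+</p>
+<pre><code>
+import torch
+import torch.nn.functional as F
+
+def dss_loss(logits, threshold=0.8):
+    """Joint positive/negative learning over a batch of unlabeled target samples."""
+    probs = F.softmax(logits, dim=1)
+    conf, pseudo = probs.max(dim=1)
+    high = conf >= threshold                               # trusted pseudo-labels
+
+    loss = logits.new_tensor(0.0)
+    if high.any():
+        loss = loss + F.cross_entropy(logits[high], pseudo[high])       # positive learning
+    if (~high).any():
+        complementary = probs[~high].argmin(dim=1)                      # least likely class
+        loss = loss + F.nll_loss(torch.log1p(-probs[~high] + 1e-6), complementary)  # negative learning
+    return loss
+</code></pre>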
+
+ comment: 2024 IEEE/CVF Winter Conference on Applications of Computer Vision +
+
+
+
+
+ + ♻ ☆ A Closer Look at Audio-Visual Segmentation + + +
+ Audio-visual segmentation (AVS) is a complex task that involves accurately +segmenting the corresponding sounding object based on audio-visual queries. +Successful audio-visual learning requires two essential components: 1) an +unbiased dataset with high-quality pixel-level multi-class labels, and 2) a +model capable of effectively linking audio information with its corresponding +visual object. However, these two requirements are only partially addressed by +current methods, with training sets containing biased audio-visual data, and +models that generalise poorly beyond this biased training set. In this work, we +propose a new strategy to build cost-effective and relatively unbiased +audio-visual semantic segmentation benchmarks. Our strategy, called Visual +Post-production (VPO), explores the observation that it is not necessary to +have explicit audio-visual pairs extracted from single video sources to build +such benchmarks. We also refine the previously proposed AVSBench to transform +it into the audio-visual semantic segmentation benchmark AVSBench-Single+. +Furthermore, this paper introduces a new pixel-wise audio-visual contrastive +learning method to enable a better generalisation of the model beyond the +training set. We verify the validity of the VPO strategy by showing that +state-of-the-art (SOTA) models trained with datasets built by matching audio +and visual data from different sources or with datasets containing audio and +visual data from the same video source produce almost the same accuracy. Then, +using the proposed VPO benchmarks and AVSBench-Single+, we show that our method +produces more accurate audio-visual semantic segmentation than SOTA models. +Code and dataset will be available. + +
+
+
+
+
+ + ♻ ☆ AdaptGuard: Defending Against Universal Attacks for Model Adaptation ICCV2023 + + +
+ Model adaptation aims at solving the domain transfer problem under the +constraint of only accessing the pretrained source models. With the increasing +considerations of data privacy and transmission efficiency, this paradigm has +been gaining recent popularity. This paper studies the vulnerability to +universal attacks transferred from the source domain during model adaptation +algorithms due to the existence of malicious providers. We explore both +universal adversarial perturbations and backdoor attacks as loopholes on the +source side and discover that they still survive in the target models after +adaptation. To address this issue, we propose a model preprocessing framework, +named AdaptGuard, to improve the security of model adaptation algorithms. +AdaptGuard avoids direct use of the risky source parameters through knowledge +distillation and utilizes the pseudo adversarial samples under adjusted radius +to enhance the robustness. AdaptGuard is a plug-and-play module that requires +neither robust pretrained models nor any changes for the following model +adaptation algorithms. Extensive results on three commonly used datasets and +two popular adaptation methods validate that AdaptGuard can effectively defend +against universal attacks and maintain clean accuracy in the target domain +simultaneously. We hope this research will shed light on the safety and +robustness of transfer learning. Code is available at +https://github.com/TomSheng21/AdaptGuard. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ♻ ☆ RealignDiff: Boosting Text-to-Image Diffusion Model with Coarse-to-fine + Semantic Re-alignment + + +
+ Recent advances in text-to-image diffusion models have achieved remarkable +success in generating high-quality, realistic images from textual descriptions. +However, these approaches have faced challenges in precisely aligning the +generated visual content with the textual concepts described in the prompts. In +this paper, we propose a two-stage coarse-to-fine semantic re-alignment method, +named RealignDiff, aimed at improving the alignment between text and images in +text-to-image diffusion models. In the coarse semantic re-alignment phase, a +novel caption reward, leveraging the BLIP-2 model, is proposed to evaluate the +semantic discrepancy between the generated image caption and the given text +prompt. Subsequently, the fine semantic re-alignment stage employs a local +dense caption generation module and a re-weighting attention modulation module +to refine the previously generated images from a local semantic view. +Experimental results on the MS-COCO benchmark demonstrate that the proposed +two-stage coarse-to-fine semantic re-alignment method outperforms other +baseline re-alignment techniques by a substantial margin in both visual quality +and semantic similarity with the input prompt. + +
+
+
+
+
+ + ♻ ☆ TetraSphere: A Neural Descriptor for O(3)-Invariant Point Cloud Analysis + + +
+ In many practical applications, 3D point cloud analysis requires rotation +invariance. In this paper, we present a learnable descriptor invariant under 3D +rotations and reflections, i.e., the O(3) actions, utilizing the recently +introduced steerable 3D spherical neurons and vector neurons. Specifically, we +propose an embedding of the 3D spherical neurons into 4D vector neurons, which +leverages end-to-end training of the model. In our approach, we perform +TetraTransform--an equivariant embedding of the 3D input into 4D, constructed +from the steerable neurons--and extract deeper O(3)-equivariant features using +vector neurons. This integration of the TetraTransform into the VN-DGCNN +framework, termed TetraSphere, negligibly increases the number of parameters by +less than 0.0002%. TetraSphere sets a new state-of-the-art performance +classifying randomly rotated real-world object scans of the challenging subsets +of ScanObjectNN. Additionally, TetraSphere outperforms all equivariant methods +on randomly rotated synthetic data: classifying objects from ModelNet40 and +segmenting parts of the ShapeNet shapes. Thus, our results reveal the practical +value of steerable 3D spherical neurons for learning in 3D Euclidean space. + +
+
+
+
+
+ + ♻ ☆ DoUnseen: Tuning-Free Class-Adaptive Object Detection of Unseen Objects + for Robotic Grasping + + +
+ How can we segment varying numbers of objects where each specific object
+represents its own separate class? To make the problem even more realistic, how
+can we add and delete classes on the fly without retraining or fine-tuning?
+This is the case in robotic applications where no datasets of the objects
+exist, or in applications that include thousands of objects (e.g., in
+logistics), where it is impossible to train a single model to learn all of the
+objects. Most current research on object segmentation for robotic grasping
+focuses on class-level object segmentation (e.g., box, cup, bottle), closed
+sets (specific objects of a dataset; for example, the YCB dataset), or deep
+learning-based template matching. In this work, we are interested in open sets
+where the number of classes is unknown and varying, and no prior knowledge
+about the objects' types is available. We consider each specific object as its
+own separate class. Our goal is to develop an object detector that requires no
+fine-tuning and can add any object as a class just by capturing a few images of
+the object. Our main idea is to break the segmentation pipeline into two steps
+by cascading unseen object segmentation networks with class-adaptive
+classifiers. We evaluate our class-adaptive object detector on unseen datasets
+and compare it to a trained Mask R-CNN on those datasets. The results show that
+the performance varies from practical to unsuitable depending on the
+environment setup and the objects being handled. The code is available in our
+DoUnseen library repository.
+
</p>
+
+ comment: presented at RSS 2023 Workshop on Perception and Manipulation + Challenges for Warehouse Automation +
+
+
+
+
+ + ♻ ☆ RealLiFe: Real-Time Light Field Reconstruction via Hierarchical Sparse + Gradient Descent + + +
+ With the rise of Extended Reality (XR) technology, there is a growing need +for real-time light field generation from sparse view inputs. Existing methods +can be classified into offline techniques, which can generate high-quality +novel views but at the cost of long inference/training time, and online +methods, which either lack generalizability or produce unsatisfactory results. +However, we have observed that the intrinsic sparse manifold of Multi-plane +Images (MPI) enables a significant acceleration of light field generation while +maintaining rendering quality. Based on this insight, we introduce EffLiFe, a +novel light field optimization method, which leverages the proposed +Hierarchical Sparse Gradient Descent (HSGD) to produce high-quality light +fields from sparse view images in real time. Technically, the coarse MPI of a +scene is first generated using a 3D CNN, and it is further sparsely optimized +by focusing only on important MPI gradients in a few iterations. Nevertheless, +relying solely on optimization can lead to artifacts at occlusion boundaries. +Therefore, we propose an occlusion-aware iterative refinement module that +removes visual artifacts in occluded regions by iteratively filtering the +input. Extensive experiments demonstrate that our method achieves comparable +visual quality while being 100x faster on average than state-of-the-art offline +methods and delivering better performance (about 2 dB higher in PSNR) compared +to other online approaches. + +
+
+ comment: Submitted to IEEE TPAMI +
+
+
+
+
+ + ♻ ☆ PanoVOS: Bridging Non-panoramic and Panoramic Views with Transformer for + Video Segmentation + + +
+ Panoramic videos contain richer spatial information and have attracted
+tremendous amounts of attention due to the exceptional viewing experience they
+offer in fields such as autonomous driving and virtual reality. However,
+existing datasets for video segmentation only focus on conventional planar
+images. To address the challenge, in this paper, we present a panoramic video
+dataset, PanoVOS. The dataset provides 150 videos with high video resolutions
+and diverse motions. To quantify the domain gap between 2D planar videos and
+panoramic videos, we evaluate 15 off-the-shelf video object segmentation (VOS)
+models on PanoVOS. Through error analysis, we found that all of them fail to
+handle the pixel-level content discontinuities of panoramic videos. Thus, we
+present a Panoramic Space Consistency Transformer (PSCFormer), which can
+effectively utilize the semantic boundary information of the previous frame for
+pixel-level matching with the current frame. Extensive experiments demonstrate
+that compared with the previous SOTA models, our PSCFormer network exhibits a
+great advantage in terms of segmentation results under the panoramic setting.
+Our dataset poses new challenges in panoramic VOS and we hope that our PanoVOS
+can advance the development of panoramic segmentation/tracking.
+
</p>
+
+
+
+
+ + ♻ ☆ Prompt-based test-time real image dehazing: a novel pipeline + + +
+ Existing methods attempt to improve models' generalization ability on
+real-world hazy images by exploring well-designed training schemes (e.g.,
+CycleGAN, prior loss). However, most of them need very complicated training
+procedures to achieve satisfactory results. In this work, we present a novel
+test-time pipeline called Prompt-based Test-Time Dehazing (PTTD) that generates
+visually pleasing results on real-captured hazy images during the inference
+phase. We experimentally find that, given a dehazing model trained on synthetic
+data, fine-tuning the statistics (i.e., mean and standard deviation) of
+encoding features allows PTTD to narrow the domain gap, boosting the
+performance of real image dehazing. Accordingly, we first apply a prompt
+generation module (PGM) to generate a visual prompt, which is the source of
+appropriate statistical perturbations for the mean and standard deviation. We
+then plug the feature adaptation module (FAM) into existing dehazing models to
+adjust the original statistics with the guidance of the generated prompt. Note
+that PTTD is model-agnostic and can be equipped with various state-of-the-art
+dehazing models trained on synthetic hazy-clean pairs. Extensive experimental
+results demonstrate that our PTTD is flexible while achieving superior
+performance compared with state-of-the-art dehazing methods in real-world
+scenarios. The source code of our PTTD will be made available at
+https://github.com/cecret3350/PTTD-Dehazing.
+
</p>
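+<p>To make the statistics-tuning idea above concrete, here is a rough sketch (not the
+authors' implementation) of shifting the channel-wise mean and standard deviation of an
+encoder feature map toward prompt-derived statistics; the tensor shapes and the blending
+weight alpha are illustrative assumptions.</p>
+<pre><code>
+import torch
+
+def adapt_feature_stats(feat, prompt_feat, alpha=0.5, eps=1e-5):
+    # feat, prompt_feat: (N, C, H, W) encoder activations; alpha blends the
+    # original channel statistics with prompt-derived ones (hypothetical choice).
+    mu_f = feat.mean(dim=(2, 3), keepdim=True)
+    std_f = feat.std(dim=(2, 3), keepdim=True) + eps
+    mu_p = prompt_feat.mean(dim=(2, 3), keepdim=True)
+    std_p = prompt_feat.std(dim=(2, 3), keepdim=True) + eps
+    mu_new = (1 - alpha) * mu_f + alpha * mu_p
+    std_new = (1 - alpha) * std_f + alpha * std_p
+    return (feat - mu_f) / std_f * std_new + mu_new
+
+feat = torch.randn(1, 64, 32, 32)
+prompt_feat = torch.randn(1, 64, 32, 32)
+adapted = adapt_feature_stats(feat, prompt_feat)
+print(adapted.shape)  # torch.Size([1, 64, 32, 32])
+</code></pre>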
+
+ comment: update github link (https://github.com/cecret3350/PTTD-Dehazing) +
+
+
+
+
+ + ♻ ☆ MRGazer: Decoding Eye Gaze Points from Functional Magnetic Resonance + Imaging in Individual Space + + +
+ Eye-tracking research has proven valuable in understanding numerous cognitive
+functions. Recently, Frey et al. provided an exciting deep learning method for
+learning eye movements from fMRI data. However, it needed to co-register the
+fMRI data into standard space to obtain eyeball masks, and thus required
+additional templates and was time-consuming. To resolve this issue, in this
+paper, we propose a framework named MRGazer for predicting eye gaze points from
+fMRI in individual space. MRGazer consists of an eyeball extraction module and
+a residual network-based eye gaze prediction module. Compared to the previous
+method, the proposed framework skips the fMRI co-registration step, simplifies
+the processing protocol, and achieves end-to-end eye gaze regression. The
+proposed method achieved superior performance on a variety of eye movement
+tasks compared to the co-registration-based method, and delivered results in a
+shorter time (~0.02 seconds per volume) than the prior method (~0.3 seconds per
+volume).
+
</p>
+
+
+
+
+ + ♻ ☆ Empirical Study of PEFT techniques for Winter Wheat Segmentation + + +
+ Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced
+significant growth and have been extensively employed to adapt large vision and
+language models to various domains, enabling satisfactory model performance
+with minimal computational needs. Despite these advances, little research has
+delved into potential PEFT applications in real-life scenarios, particularly in
+the critical domains of remote sensing and crop monitoring. The diversity of
+climates across different regions and the need for comprehensive large-scale
+datasets have posed significant obstacles to accurately identifying crop types
+across varying geographic locations and changing growing seasons. This study
+seeks to bridge this gap by comprehensively exploring the feasibility of
+cross-area and cross-year out-of-distribution generalization using the
+State-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to
+explore PEFT approaches for crop monitoring. Specifically, we focus on adapting
+the SOTA TSViT model to address winter wheat field segmentation, a critical
+task for crop monitoring and food security. This adaptation process involves
+integrating different PEFT techniques, including BigFit, LoRA, Adaptformer, and
+prompt tuning. Using PEFT techniques, we achieved notable results comparable to
+those achieved using full fine-tuning methods while training only 0.7% of the
+parameters of the whole TSViT architecture. The in-house labeled dataset,
+referred to as the Beqaa-Lebanon dataset, comprises high-quality annotated
+polygons for wheat and non-wheat classes with a total surface area of 170 km²,
+over five consecutive years. Using Sentinel-2 images, our model achieved an 84%
+F1-score. We intend to publicly release the Lebanese winter wheat dataset, code
+repository, and model weights.
+
</p>
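+<p>To make the 0.7%-of-parameters figure tangible, below is a minimal, generic LoRA-style
+wrapper around a frozen linear layer. It is a sketch of the general technique only, not the
+TSViT adaptation used in the paper, and the rank and scaling values are arbitrary.</p>
+<pre><code>
+import torch
+import torch.nn as nn
+
+class LoRALinear(nn.Module):
+    """Frozen base linear layer plus a trainable low-rank update B @ A."""
+    def __init__(self, base: nn.Linear, rank: int = 8, alpha: float = 16.0):
+        super().__init__()
+        self.base = base
+        for p in self.base.parameters():
+            p.requires_grad = False          # freeze the pretrained weights
+        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
+        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
+        self.scale = alpha / rank
+
+    def forward(self, x):
+        return self.base(x) + self.scale * (x @ self.A.t() @ self.B.t())
+
+layer = LoRALinear(nn.Linear(768, 768), rank=8)
+trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
+total = sum(p.numel() for p in layer.parameters())
+print(f"trainable fraction: {trainable / total:.4f}")  # roughly 2% for this layer
+</code></pre>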
+
+
+
+
+ + ♻ ☆ NeRFlame: FLAME-based conditioning of NeRF for 3D face rendering + + +
+ Traditional 3D face models are based on mesh representations with texture. +One of the most important models is FLAME (Faces Learned with an Articulated +Model and Expressions), which produces meshes of human faces that are fully +controllable. Unfortunately, such models have problems with capturing geometric +and appearance details. In contrast to mesh representation, the neural radiance +field (NeRF) produces extremely sharp renders. However, implicit methods are +hard to animate and do not generalize well to unseen expressions. It is not +trivial to effectively control NeRF models to obtain face manipulation. + The present paper proposes a novel approach, named NeRFlame, which combines +the strengths of both NeRF and FLAME methods. Our method enables high-quality +rendering capabilities of NeRF while also offering complete control over the +visual appearance, similar to FLAME. In contrast to traditional NeRF-based +structures that use neural networks for RGB color and volume density modeling, +our approach utilizes the FLAME mesh as a distinct density volume. +Consequently, color values exist only in the vicinity of the FLAME mesh. This +FLAME framework is seamlessly incorporated into the NeRF architecture for +predicting RGB colors, enabling our model to explicitly represent volume +density and implicitly capture RGB colors. + +
+
+
+
+
+ + ♻ ☆ RankFeat&RankWeight: Rank-1 Feature/Weight Removal for + Out-of-distribution Detection + + +
+ The task of out-of-distribution (OOD) detection is crucial for deploying
+machine learning models in real-world settings. In this paper, we observe that
+the singular value distributions of the in-distribution (ID) and OOD features
+are quite different: the OOD feature matrix tends to have a larger dominant
+singular value than the ID feature, and the class predictions of OOD samples
+are largely determined by it. This observation motivates us to propose
+\texttt{RankFeat}, a simple yet effective \emph{post hoc} approach for OOD
+detection by removing the rank-1 matrix composed of the largest singular value
+and the associated singular vectors from the high-level feature.
+\texttt{RankFeat} achieves \emph{state-of-the-art} performance and reduces the
+average false positive rate (FPR95) by 17.90\% compared with the previous best
+method. The success of \texttt{RankFeat} motivates us to investigate whether a
+similar phenomenon would exist in the parameter matrices of neural networks. We
+thus propose \texttt{RankWeight}, which removes the rank-1 weight from the
+parameter matrices of a single deep layer. Our \texttt{RankWeight} is also
+\emph{post hoc} and only requires computing the rank-1 matrix once. As a
+standalone approach, \texttt{RankWeight} has very competitive performance
+against other methods across various backbones. Moreover, \texttt{RankWeight}
+enjoys flexible compatibility with a wide range of OOD detection methods. The
+combination of \texttt{RankWeight} and \texttt{RankFeat} sets a new
+\emph{state-of-the-art}, achieving an FPR95 as low as 16.13\% on the
+ImageNet-1k benchmark. Extensive ablation studies and comprehensive theoretical
+analyses are presented to support the empirical results.
+
</p>
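+<p>The rank-1 removal that RankFeat describes can be sketched in a few lines of PyTorch;
+the snippet below operates on a random stand-in feature matrix and illustrates the general
+idea rather than reproducing the authors' code.</p>
+<pre><code>
+import torch
+
+def remove_rank1(feature):
+    # feature: (N, D) high-level feature matrix; subtract the rank-1 component
+    # spanned by the largest singular value, in the spirit of RankFeat.
+    U, S, Vh = torch.linalg.svd(feature, full_matrices=False)
+    rank1 = S[0] * torch.outer(U[:, 0], Vh[0, :])
+    return feature - rank1
+
+feat = torch.randn(196, 2048)          # e.g. flattened spatial positions x channels
+cleaned = remove_rank1(feat)
+S_before = torch.linalg.svd(feat, full_matrices=False).S
+S_after = torch.linalg.svd(cleaned, full_matrices=False).S
+# The dominant singular value is gone; the former second value now leads.
+print(S_before[:2].tolist(), S_after[0].item())
+</code></pre>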
+
+ comment: submitted to T-PAMI. arXiv admin note: substantial text overlap with + arXiv:2209.08590 +
+
+
+
+
+ + ♻ ☆ Uncovering the Hidden Cost of Model Compression + + +
+ In the era of resource-intensive foundation models, efficient adaptation in +downstream tasks has become paramount. Visual Prompting (VP), inspired by +prompting in Large Language Models (LLMs), has emerged as a key transfer +learning method in computer vision. Aligned with the growing significance of +efficiency, research in model compression has become pivotal to alleviate the +computational burden in both training and deploying over-parameterized neural +networks. A key goal in model compression is the development of sparse models +capable of matching or surpassing the performance of their over-parameterized, +dense counterparts. While prior research has explored the impact of model +sparsity on transfer learning, its effects on visual prompting-based transfer +remain unclear. This study addresses this gap, revealing that model sparsity +adversely affects the performance of visual prompting-based transfer, +particularly in low-data-volume scenarios. Furthermore, our findings highlight +the negative influence of sparsity on the calibration of downstream +visual-prompted models. This empirical exploration calls for a nuanced +understanding beyond accuracy in sparse settings, opening avenues for further +research in Visual Prompting for sparse models. Code and logs can be accessed +at https://github.com/landskape-ai/Reprogram_LT . + +
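+<p>For context, the sparse models studied in work like this are commonly produced by
+unstructured magnitude pruning. The following is a minimal sketch using PyTorch's pruning
+utilities with illustrative settings, not the paper's experimental setup.</p>
+<pre><code>
+import torch.nn as nn
+import torch.nn.utils.prune as prune
+
+model = nn.Sequential(nn.Linear(512, 256), nn.ReLU(), nn.Linear(256, 10))
+
+# Zero out the 90% smallest-magnitude weights in every linear layer.
+for module in model.modules():
+    if isinstance(module, nn.Linear):
+        prune.l1_unstructured(module, name="weight", amount=0.9)
+        prune.remove(module, "weight")   # make the sparsity permanent
+
+zeros = sum((m.weight == 0).sum().item() for m in model.modules()
+            if isinstance(m, nn.Linear))
+total = sum(m.weight.numel() for m in model.modules() if isinstance(m, nn.Linear))
+print(f"overall weight sparsity: {zeros / total:.2f}")
+</code></pre>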
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Towards Omni-supervised Referring Expression Segmentation + + +
+ Referring Expression Segmentation (RES) is an emerging task in computer +vision, which segments the target instances in images based on text +descriptions. However, its development is plagued by the expensive segmentation +labels. To address this issue, we propose a new learning task for RES called +Omni-supervised Referring Expression Segmentation (Omni-RES), which aims to +make full use of unlabeled, fully labeled and weakly labeled data, e.g., +referring points or grounding boxes, for efficient RES training. To accomplish +this task, we also propose a novel yet strong baseline method for Omni-RES +based on the recently popular teacher-student learning, where the weak labels +are not directly transformed into supervision signals but used as a yardstick +to select and refine high-quality pseudo-masks for teacher-student learning. To +validate the proposed Omni-RES method, we apply it to a set of state-of-the-art +RES models and conduct extensive experiments on a bunch of RES datasets. The +experimental results yield the obvious merits of Omni-RES than the +fully-supervised and semi-supervised training schemes. For instance, with only +10% fully labeled data, Omni-RES can help the base model achieve 100% fully +supervised performance, and it also outperform the semi-supervised alternative +by a large margin, e.g., +14.93% on RefCOCO and +14.95% on RefCOCO+, +respectively. More importantly, Omni-RES also enable the use of large-scale +vision-langauges like Visual Genome to facilitate low-cost RES training, and +achieve new SOTA performance of RES, e.g., 80.66 on RefCOCO. + +
+
+
+
+
+ + ♻ ☆ R&B: Region and Boundary Aware Zero-shot Grounded Text-to-image + Generation + + +
+ Recent text-to-image (T2I) diffusion models have achieved remarkable progress +in generating high-quality images given text-prompts as input. However, these +models fail to convey appropriate spatial composition specified by a layout +instruction. In this work, we probe into zero-shot grounded T2I generation with +diffusion models, that is, generating images corresponding to the input layout +information without training auxiliary modules or finetuning diffusion models. +We propose a Region and Boundary (R&B) aware cross-attention guidance approach +that gradually modulates the attention maps of diffusion model during +generative process, and assists the model to synthesize images (1) with high +fidelity, (2) highly compatible with textual input, and (3) interpreting layout +instructions accurately. Specifically, we leverage the discrete sampling to +bridge the gap between consecutive attention maps and discrete layout +constraints, and design a region-aware loss to refine the generative layout +during diffusion process. We further propose a boundary-aware loss to +strengthen object discriminability within the corresponding regions. +Experimental results show that our method outperforms existing state-of-the-art +zero-shot grounded T2I generation methods by a large margin both qualitatively +and quantitatively on several benchmarks. + +
+
+ comment: Preprint. Under review. Project page: + https://sagileo.github.io/Region-and-Boundary +
+
+
+
+
+ + ♻ ☆ Concept Sliders: LoRA Adaptors for Precise Control in Diffusion Models + + +
+ We present a method to create interpretable concept sliders that enable +precise control over attributes in image generations from diffusion models. Our +approach identifies a low-rank parameter direction corresponding to one concept +while minimizing interference with other attributes. A slider is created using +a small set of prompts or sample images; thus slider directions can be created +for either textual or visual concepts. Concept Sliders are plug-and-play: they +can be composed efficiently and continuously modulated, enabling precise +control over image generation. In quantitative experiments comparing to +previous editing techniques, our sliders exhibit stronger targeted edits with +lower interference. We showcase sliders for weather, age, styles, and +expressions, as well as slider compositions. We show how sliders can transfer +latents from StyleGAN for intuitive editing of visual concepts for which +textual description is difficult. We also find that our method can help address +persistent quality issues in Stable Diffusion XL including repair of object +deformations and fixing distorted hands. Our code, data, and trained sliders +are available at https://sliders.baulab.info/ + +
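+<p>A generic sketch of how composable low-rank parameter directions of this kind can be
+applied and continuously modulated is shown below; the shapes, ranks, and strengths are
+hypothetical, and this is not the released Concept Sliders implementation.</p>
+<pre><code>
+import torch
+
+def apply_sliders(weight, sliders, strengths):
+    # weight: (out, in) parameter matrix of some layer.
+    # sliders: list of (B, A) low-rank factors, B: (out, r), A: (r, in).
+    # strengths: per-slider scalars; sliders compose additively and can be
+    # modulated continuously by changing these scalars.
+    edited = weight.clone()
+    for (B, A), s in zip(sliders, strengths):
+        edited += s * (B @ A)
+    return edited
+
+W = torch.randn(320, 320)
+age_slider = (torch.randn(320, 4) * 0.01, torch.randn(4, 320) * 0.01)
+smile_slider = (torch.randn(320, 4) * 0.01, torch.randn(4, 320) * 0.01)
+W_edited = apply_sliders(W, [age_slider, smile_slider], strengths=[1.5, -0.5])
+print(torch.linalg.matrix_rank(W_edited - W))  # 8: two rank-4 edits compose additively
+</code></pre>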
+
+
+
+
+ + ♻ ☆ Breaking Modality Disparity: Harmonized Representation for Infrared and + Visible Image Registration + + +
+ Owing to differences in viewing range, resolution and relative position, the
+multi-modality sensing module composed of infrared and visible cameras needs to
+be registered to enable more accurate scene perception. In practice, manual
+calibration-based registration is the most widely used process, and it must be
+re-calibrated regularly to maintain accuracy, which is time-consuming and
+labor-intensive. To cope with these problems, we propose a scene-adaptive
+infrared and visible image registration method. Specifically, to handle the
+discrepancy between multi-modality images, an invertible translation process is
+developed to establish a modality-invariant domain, which comprehensively
+embraces the feature intensity and distribution of both infrared and visible
+modalities. We employ homography to simulate the deformation between different
+planes and develop a hierarchical framework to rectify the deformation inferred
+from the proposed latent representation in a coarse-to-fine manner. To this
+end, advanced perception ability coupled with residual estimation is conducive
+to the regression of sparse offsets, and an alternate correlation search
+facilitates more accurate correspondence matching. Moreover, we propose the
+first misaligned infrared and visible image dataset with available ground
+truth, involving three synthetic sets and one real-world set. Extensive
+experiments validate the effectiveness of the proposed method against
+state-of-the-art approaches, benefiting subsequent applications.
+
</p>
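+<p>For readers unfamiliar with homography-based registration, the basic estimate-and-warp
+step that such a hierarchical framework refines can be written with OpenCV as follows; the
+correspondences here are synthetic placeholders, unrelated to the proposed network.</p>
+<pre><code>
+import numpy as np
+import cv2
+
+# Four (or more) corresponding points between the infrared and visible views;
+# these are synthetic stand-ins for matches produced by a matching module.
+src_pts = np.float32([[10, 10], [200, 15], [205, 180], [12, 185]])
+dst_pts = np.float32([[14, 8], [208, 20], [210, 190], [9, 188]])
+
+H, mask = cv2.findHomography(src_pts, dst_pts, cv2.RANSAC, 3.0)
+
+infrared = np.zeros((256, 256, 3), dtype=np.uint8)   # placeholder image
+registered = cv2.warpPerspective(infrared, H, (256, 256))
+print(H.shape, registered.shape)   # (3, 3) (256, 256, 3)
+</code></pre>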
+
+ comment: 10 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Discrete approximations of Gaussian smoothing and Gaussian derivatives + + +
+ This paper develops an in-depth treatment concerning the problem of +approximating the Gaussian smoothing and Gaussian derivative computations in +scale-space theory for application on discrete data. With close connections to +previous axiomatic treatments of continuous and discrete scale-space theory, we +consider three main ways discretizing these scale-space operations in terms of +explicit discrete convolutions, based on either (i) sampling the Gaussian +kernels and the Gaussian derivative kernels, (ii) locally integrating the +Gaussian kernels and the Gaussian derivative kernels over each pixel support +region and (iii) basing the scale-space analysis on the discrete analogue of +the Gaussian kernel, and then computing derivative approximations by applying +small-support central difference operators to the spatially smoothed image +data. + We study the properties of these three main discretization methods both +theoretically and experimentally, and characterize their performance by +quantitative measures, including the results they give rise to with respect to +the task of scale selection, investigated for four different use cases, and +with emphasis on the behaviour at fine scales. The results show that the +sampled Gaussian kernels and derivatives as well as the integrated Gaussian +kernels and derivatives perform very poorly at very fine scales. At very fine +scales, the discrete analogue of the Gaussian kernel with its corresponding +discrete derivative approximations performs substantially better. The sampled +Gaussian kernel and the sampled Gaussian derivatives do, on the other hand, +lead to numerically very good approximations of the corresponding continuous +results, when the scale parameter is sufficiently large, in the experiments +presented in the paper, when the scale parameter is greater than a value of +about 1, in units of the grid spacing. + +
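+<p>Two of the discretizations discussed above can be compared numerically with SciPy: the
+sampled Gaussian kernel and the discrete analogue of the Gaussian kernel
+T(n, t) = e^(-t) I_n(t). The snippet below is an illustrative sketch (not the paper's
+experimental code) showing how strongly the sampled kernel misbehaves at a very fine scale.</p>
+<pre><code>
+import numpy as np
+from scipy.special import ive   # exponentially scaled modified Bessel function
+
+def sampled_gaussian(n, t):
+    # Continuous Gaussian with variance t sampled at integer positions n.
+    return np.exp(-n**2 / (2.0 * t)) / np.sqrt(2.0 * np.pi * t)
+
+def discrete_gaussian(n, t):
+    # Discrete analogue of the Gaussian kernel: T(n, t) = exp(-t) * I_n(t),
+    # which ive(n, t) returns directly for positive t.
+    return ive(n, t)
+
+t = 0.1                        # a very fine scale, in units of grid spacing squared
+n = np.arange(-6, 7)
+for name, kernel in [("sampled", sampled_gaussian(n, t)),
+                     ("discrete", discrete_gaussian(n, t))]:
+    mass = kernel.sum()
+    variance = np.sum(n**2 * kernel) / mass
+    print(f"{name:8s} sum = {mass:.4f}  variance = {variance:.4f}  (target {t})")
+</code></pre>
+<p>At this scale the sampled kernel neither sums to one nor reproduces the target variance,
+while the discrete analogue does both to high accuracy, consistent with the fine-scale
+behaviour reported above.</p>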
+
+ comment: 38 pages, 34 figures +
+
+
+
+
+ + ♻ ☆ Scale-Adaptive Feature Aggregation for Efficient Space-Time Video + Super-Resolution WACV2024 + + +
+ The Space-Time Video Super-Resolution (STVSR) task aims to enhance the visual +quality of videos, by simultaneously performing video frame interpolation (VFI) +and video super-resolution (VSR). However, facing the challenge of the +additional temporal dimension and scale inconsistency, most existing STVSR +methods are complex and inflexible in dynamically modeling different motion +amplitudes. In this work, we find that choosing an appropriate processing scale +achieves remarkable benefits in flow-based feature propagation. We propose a +novel Scale-Adaptive Feature Aggregation (SAFA) network that adaptively selects +sub-networks with different processing scales for individual samples. +Experiments on four public STVSR benchmarks demonstrate that SAFA achieves +state-of-the-art performance. Our SAFA network outperforms recent +state-of-the-art methods such as TMNet and VideoINR by an average improvement +of over 0.5dB on PSNR, while requiring less than half the number of parameters +and only 1/3 computational costs. + +
+
+ comment: WACV2024, 16 pages +
+
+
+
+
+ + ♻ ☆ Point, Segment and Count: A Generalized Framework for Object Counting + + +
+ Class-agnostic object counting aims to count all objects in an image with +respect to example boxes or class names, \emph{a.k.a} few-shot and zero-shot +counting. Current state-of-the-art methods highly rely on density maps to +predict object counts, which lacks model interpretability. In this paper, we +propose a generalized framework for both few-shot and zero-shot object counting +based on detection. Our framework combines the superior advantages of two +foundation models without compromising their zero-shot capability: (\textbf{i}) +SAM to segment all possible objects as mask proposals, and (\textbf{ii}) CLIP +to classify proposals to obtain accurate object counts. However, this strategy +meets the obstacles of efficiency overhead and the small crowded objects that +cannot be localized and distinguished. To address these issues, our framework, +termed PseCo, follows three steps: point, segment, and count. Specifically, we +first propose a class-agnostic object localization to provide accurate but +least point prompts for SAM, which consequently not only reduces computation +costs but also avoids missing small objects. Furthermore, we propose a +generalized object classification that leverages CLIP image/text embeddings as +the classifier, following a hierarchical knowledge distillation to obtain +discriminative classifications among hierarchical mask proposals. Extensive +experimental results on FSC-147 dataset demonstrate that PseCo achieves +state-of-the-art performance in both few-shot/zero-shot object +counting/detection, with additional results on large-scale COCO and LVIS +datasets. The source code is available at +\url{https://github.com/Hzzone/PseCo}. + +
+
+ comment: Fix typos +
+
+
+
+
+ + ♻ ☆ DriveDreamer: Towards Real-world-driven World Models for Autonomous + Driving + + +
+ World models, especially in autonomous driving, are trending and drawing +extensive attention due to their capacity for comprehending driving +environments. The established world model holds immense potential for the +generation of high-quality driving videos, and driving policies for safe +maneuvering. However, a critical limitation in relevant research lies in its +predominant focus on gaming environments or simulated settings, thereby lacking +the representation of real-world driving scenarios. Therefore, we introduce +DriveDreamer, a pioneering world model entirely derived from real-world driving +scenarios. Regarding that modeling the world in intricate driving scenes +entails an overwhelming search space, we propose harnessing the powerful +diffusion model to construct a comprehensive representation of the complex +environment. Furthermore, we introduce a two-stage training pipeline. In the +initial phase, DriveDreamer acquires a deep understanding of structured traffic +constraints, while the subsequent stage equips it with the ability to +anticipate future states. The proposed DriveDreamer is the first world model +established from real-world driving scenarios. We instantiate DriveDreamer on +the challenging nuScenes benchmark, and extensive experiments verify that +DriveDreamer empowers precise, controllable video generation that faithfully +captures the structural constraints of real-world traffic scenarios. +Additionally, DriveDreamer enables the generation of realistic and reasonable +driving policies, opening avenues for interaction and practical applications. + +
+
+ comment: Project Page: https://drivedreamer.github.io +
+
+
+
+
+ + ♻ ☆ LanguageBind: Extending Video-Language Pretraining to N-modality by + Language-based Semantic Alignment ICLR 2024 + + +
+ The video-language (VL) pretraining has achieved remarkable improvement in +multiple downstream tasks. However, the current VL pretraining framework is +hard to extend to multiple modalities (N modalities, N>=3) beyond vision and +language. We thus propose LanguageBind, taking the language as the bind across +different modalities because the language modality is well-explored and +contains rich semantics. Specifically, we freeze the language encoder acquired +by VL pretraining, then train encoders for other modalities with contrastive +learning. As a result, all modalities are mapped to a shared feature space, +implementing multi-modal semantic alignment. While LanguageBind ensures that we +can extend VL modalities to N modalities, we also need a high-quality dataset +with alignment data pairs centered on language. We thus propose VIDAL-10M with +Video, Infrared, Depth, Audio and their corresponding Language, naming as +VIDAL-10M. In our VIDAL-10M, all videos are from short video platforms with +complete semantics rather than truncated segments from long videos, and all the +video, depth, infrared, and audio modalities are aligned to their textual +descriptions. After pretraining on VIDAL-10M, we outperform ImageBind by 5.8% +R@1 on the MSR-VTT dataset with only 15% of the parameters in the zero-shot +video-text retrieval task. Beyond this, our LanguageBind has greatly improved +in the zero-shot video, audio, depth, and infrared understanding tasks. For +instance, LanguageBind surpassing InterVideo by 1.9% on MSR-VTT, 8.8% on MSVD, +6.3% on DiDeMo, and 4.4% on ActivityNet. On the LLVIP and NYU-D datasets, +LanguageBind outperforms ImageBind with 23.8% and 11.1% top-1 accuracy. Code +address: https://github.com/PKU-YuanGroup/LanguageBind. + +
+
+ comment: Under review as a conference paper at ICLR 2024 +
+
+
+
+
+ + ♻ ☆ WordArt Designer: User-Driven Artistic Typography Synthesis using Large + Language Models EMNLP 2023 + + +
+ This paper introduces WordArt Designer, a user-driven framework for artistic +typography synthesis, relying on the Large Language Model (LLM). The system +incorporates four key modules: the LLM Engine, SemTypo, StyTypo, and TexTypo +modules. 1) The LLM Engine, empowered by the LLM (e.g., GPT-3.5), interprets +user inputs and generates actionable prompts for the other modules, thereby +transforming abstract concepts into tangible designs. 2) The SemTypo module +optimizes font designs using semantic concepts, striking a balance between +artistic transformation and readability. 3) Building on the semantic layout +provided by the SemTypo module, the StyTypo module creates smooth, refined +images. 4) The TexTypo module further enhances the design's aesthetics through +texture rendering, enabling the generation of inventive textured fonts. +Notably, WordArt Designer highlights the fusion of generative AI with artistic +typography. Experience its capabilities on ModelScope: +https://www.modelscope.cn/studios/WordArt/WordArt. + +
+
+ comment: Accepted by EMNLP 2023, 10 pages, 11 figures, 1 table, the system is + at https://www.modelscope.cn/studios/WordArt/WordArt +
+
+
+
+
+ + ♻ ☆ FedSoL: Bridging Global Alignment and Local Generality in Federated + Learning + + +
+ Federated Learning (FL) aggregates locally trained models from individual +clients to construct a global model. While FL enables learning a model with +data privacy, it often suffers from significant performance degradation when +client data distributions are heterogeneous. Many previous FL algorithms have +addressed this issue by introducing various proximal restrictions. These +restrictions aim to encourage global alignment by constraining the deviation of +local learning from the global objective. However, they inherently limit local +learning by interfering with the original local objectives. Recently, an +alternative approach has emerged to improve local learning generality. By +obtaining local models within a smooth loss landscape, this approach mitigates +conflicts among different local objectives of the clients. Yet, it does not +ensure stable global alignment, as local learning does not take the global +objective into account. In this study, we propose Federated Stability on +Learning (FedSoL), which combines both the concepts of global alignment and +local generality. In FedSoL, the local learning seeks a parameter region robust +against proximal perturbations. This strategy introduces an implicit proximal +restriction effect in local learning while maintaining the original local +objective for parameter update. Our experiments show that FedSoL consistently +achieves state-of-the-art performance on various setups. + +
+
+ comment: 19 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ DeepSimHO: Stable Pose Estimation for Hand-Object Interaction via + Physics Simulation NeurIPS 2023 + + +
+ This paper addresses the task of 3D pose estimation for a hand interacting +with an object from a single image observation. When modeling hand-object +interaction, previous works mainly exploit proximity cues, while overlooking +the dynamical nature that the hand must stably grasp the object to counteract +gravity and thus preventing the object from slipping or falling. These works +fail to leverage dynamical constraints in the estimation and consequently often +produce unstable results. Meanwhile, refining unstable configurations with +physics-based reasoning remains challenging, both by the complexity of contact +dynamics and by the lack of effective and efficient physics inference in the +data-driven learning framework. To address both issues, we present DeepSimHO: a +novel deep-learning pipeline that combines forward physics simulation and +backward gradient approximation with a neural network. Specifically, for an +initial hand-object pose estimated by a base network, we forward it to a +physics simulator to evaluate its stability. However, due to non-smooth contact +geometry and penetration, existing differentiable simulators can not provide +reliable state gradient. To remedy this, we further introduce a deep network to +learn the stability evaluation process from the simulator, while smoothly +approximating its gradient and thus enabling effective back-propagation. +Extensive experiments show that our method noticeably improves the stability of +the estimation and achieves superior efficiency over test-time optimization. +The code is available at https://github.com/rongakowang/DeepSimHO. + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ DocPedia: Unleashing the Power of Large Multimodal Model in the + Frequency Domain for Versatile Document Understanding + + +
+ This work presents DocPedia, a novel large multimodal model (LMM) for
+versatile OCR-free document understanding, capable of parsing images up to
+2,560$\times$2,560 resolution. Unlike existing work that either struggles with
+high-resolution documents or gives up the large language model, thereby
+constraining its vision or language ability, our DocPedia directly processes
+visual input in the frequency domain rather than the pixel space. This unique
+characteristic enables DocPedia to capture a greater amount of visual and
+textual information using a limited number of visual tokens. To consistently
+enhance both the perception and comprehension abilities of our model, we
+develop a dual-stage training strategy and enrich instructions/annotations of
+all training tasks covering multiple document types. Extensive quantitative and
+qualitative experiments conducted on various publicly available benchmarks
+confirm the mutual benefits of jointly learning perception and comprehension
+tasks. The results provide further evidence of the effectiveness and superior
+performance of our DocPedia over other methods.
+
</p>
+
+
+
+
+ + ♻ ☆ Animatable 3D Gaussians for High-fidelity Synthesis of Human Motions + + +
+ We present a novel animatable 3D Gaussian model for rendering high-fidelity
+free-view human motions in real time. Compared to existing NeRF-based methods,
+the model is better at synthesizing high-frequency details without jittering
+across video frames. The core of our model is a novel augmented 3D Gaussian
+representation, which attaches a learnable code to each Gaussian. The learnable
+code serves as a pose-dependent appearance embedding for refining the erroneous
+appearance caused by geometric transformation of Gaussians, based on which an
+appearance refinement model is learned to produce residual Gaussian properties
+to match the appearance in the target pose. To force the Gaussians to learn the
+foreground human only without background interference, we further design a
+novel alpha loss to explicitly constrain the Gaussians within the human body.
+We also propose to jointly optimize the human joint parameters to improve the
+appearance accuracy. The animatable 3D Gaussian model can be learned with
+shallow MLPs, so new human motions can be synthesized in real time (66 fps on
+average). Experiments show that our model has superior performance over
+NeRF-based methods.
+
</p>
+
+ comment: Some experiment data is wrong. The expression of the paper in + introduction and abstract is incorrect. Some graphs have inappropriate + descriptions +
+
+
+
+
+
+
+
+ + Information Retrieval 13 + +
+
+
+ + ☆ BioLORD-2023: Semantic Textual Representations Fusing LLM and Clinical + Knowledge Graph Insights + + +
+ In this study, we investigate the potential of Large Language Models to +complement biomedical knowledge graphs in the training of semantic models for +the biomedical and clinical domains. Drawing on the wealth of the UMLS +knowledge graph and harnessing cutting-edge Large Language Models, we propose a +new state-of-the-art approach for obtaining high-fidelity representations of +biomedical concepts and sentences, consisting of three steps: an improved +contrastive learning phase, a novel self-distillation phase, and a weight +averaging phase. Through rigorous evaluations via the extensive BioLORD testing +suite and diverse downstream tasks, we demonstrate consistent and substantial +performance improvements over the previous state of the art (e.g. +2pts on +MedSTS, +2.5pts on MedNLI-S, +6.1pts on EHR-Rel-B). Besides our new +state-of-the-art biomedical model for English, we also distill and release a +multilingual model compatible with 50+ languages and finetuned on 7 European +languages. Many clinical pipelines can benefit from our latest models. Our new +multilingual model enables a range of languages to benefit from our +advancements in biomedical semantic representation learning, opening a new +avenue for bioinformatics researchers around the world. As a result, we hope to +see BioLORD-2023 becoming a precious tool for future biomedical applications. + +
+
+ comment: Preprint of upcoming journal article +
+
+
+
+
+ + ☆ SEINE: SEgment-based Indexing for NEural information retrieval + + +
+ Many early neural Information Retrieval (NeurIR) methods are re-rankers that
+rely on a traditional first-stage retriever due to expensive query time
+computations. Recently, representation-based retrievers have gained much
+attention; they learn query and document representations separately, making it
+possible to pre-compute document representations offline and reduce the
+workload at query time. Both dense and sparse representation-based retrievers
+have been explored. However, these methods focus on finding the representation
+that best represents a text (aka metric learning), and the actual retrieval
+function responsible for similarity matching between query and document is kept
+at a minimum by using a dot product. One drawback is that, unlike a traditional
+term-level inverted index, the index formed by these embeddings cannot be
+easily re-used by another retrieval method. Another drawback is that keeping
+the interaction at a minimum hurts retrieval effectiveness. On the contrary,
+interaction-based retrievers are known for their better retrieval
+effectiveness. In this paper, we propose a novel SEgment-based Neural Indexing
+method, SEINE, which provides a general indexing framework that can flexibly
+support a variety of interaction-based neural retrieval methods. We emphasize a
+careful decomposition of common components in existing neural retrieval methods
+and propose to use a segment-level inverted index to store the atomic
+query-document interaction values. Experiments on the LETOR MQ2007 and MQ2008
+datasets show that our indexing method can accelerate multiple neural retrieval
+methods by up to 28 times without sacrificing much effectiveness.
+
</p>
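+<p>As a toy illustration of what a segment-level inverted index could look like (a
+simplification for intuition only, not the data structures used by SEINE), each posting
+below stores a precomputed per-segment interaction value rather than a raw term count.</p>
+<pre><code>
+from collections import defaultdict
+
+# Documents split into segments; the scores stand in for precomputed
+# term/segment interaction values produced by some neural component.
+segments = {
+    ("doc1", 0): {"neural": 0.9, "retrieval": 0.7},
+    ("doc1", 1): {"index": 0.8},
+    ("doc2", 0): {"retrieval": 0.5, "index": 0.4},
+}
+
+# Build the segment-level inverted index: term to list of (doc, segment, value).
+inverted = defaultdict(list)
+for (doc, seg), scores in segments.items():
+    for term, value in scores.items():
+        inverted[term].append((doc, seg, value))
+
+# Query time: accumulate interaction values per document and rank.
+def search(query_terms):
+    doc_scores = defaultdict(float)
+    for term in query_terms:
+        for doc, seg, value in inverted.get(term, []):
+            doc_scores[doc] += value
+    return sorted(doc_scores.items(), key=lambda kv: kv[1], reverse=True)
+
+print(search(["retrieval", "index"]))   # [('doc1', 1.5), ('doc2', 0.9)]
+</code></pre>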
+
+
+
+
+ + ☆ A Social-aware Gaussian Pre-trained Model for Effective Cold-start + Recommendation + + +
+ The use of pre-training is an emerging technique to enhance a neural model's +performance, which has been shown to be effective for many neural language +models such as BERT. This technique has also been used to enhance the +performance of recommender systems. In such recommender systems, pre-training +models are used to learn a better initialisation for both users and items. +However, recent existing pre-trained recommender systems tend to only +incorporate the user interaction data at the pre-training stage, making it +difficult to deliver good recommendations, especially when the interaction data +is sparse. To alleviate this common data sparsity issue, we propose to +pre-train the recommendation model not only with the interaction data but also +with other available information such as the social relations among users, +thereby providing the recommender system with a better initialisation compared +with solely relying on the user interaction data. We propose a novel +recommendation model, the Social-aware Gaussian Pre-trained model (SGP), which +encodes the user social relations and interaction data at the pre-training +stage in a Graph Neural Network (GNN). Afterwards, in the subsequent +fine-tuning stage, our SGP model adopts a Gaussian Mixture Model (GMM) to +factorise these pre-trained embeddings for further training, thereby benefiting +the cold-start users from these pre-built social relations. Our extensive +experiments on three public datasets show that, in comparison to 16 competitive +baselines, our SGP model significantly outperforms the best baseline by upto +7.7% in terms of NDCG@10. In addition, we show that SGP permits to effectively +alleviate the cold-start problem, especially when users newly register to the +system through their friends' suggestions. + +
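+<p>The GMM factorisation step mentioned above can be pictured with scikit-learn applied to
+a matrix of pre-trained user embeddings; this is only a schematic of the idea, using random
+embeddings and an arbitrary number of mixture components.</p>
+<pre><code>
+import numpy as np
+from sklearn.mixture import GaussianMixture
+
+rng = np.random.default_rng(0)
+user_embeddings = rng.normal(size=(1000, 64))   # stand-in for GNN pre-trained embeddings
+
+gmm = GaussianMixture(n_components=8, covariance_type="diag", random_state=0)
+gmm.fit(user_embeddings)
+
+# Soft assignments over mixture components give a factorised representation
+# that could be trained further in a fine-tuning stage.
+responsibilities = gmm.predict_proba(user_embeddings)
+print(responsibilities.shape)      # (1000, 8)
+print(responsibilities[0].sum())   # each row sums to 1.0
+</code></pre>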
+
+ comment: 20 pages +
+
+
+
+
+ + ☆ Justifiable Artificial Intelligence: Engineering Large Language Models + for Legal Applications + + +
+ In this work, I discuss how Large Language Models can be applied in the legal +domain, circumventing their current drawbacks. Despite their large success and +acceptance, their lack of explainability hinders legal experts to trust in +their output, and this happens rightfully so. However, in this paper, I argue +in favor of a new view, Justifiable Artificial Intelligence, instead of +focusing on Explainable Artificial Intelligence. I discuss in this paper how +gaining evidence for and against a Large Language Model's output may make their +generated texts more trustworthy - or hold them accountable for misinformation. + +
+
+ comment: 12 pages, 2 figures +
+
+
+
+
+ + ☆ Two Approaches to the Identity of Processes in BFO + + +
+ This paper aims to explore processes and their identity with a focus on the +upper ontology Basic Formal Ontology (BFO). We begin with a classification +based on two basic classes of changes of independent continuants: changes with +respect to a single specifically dependent continuant thereof or with respect +to the spatial region that its parts occupy. We accordingly distinguish two +kinds of simple processes: specifically dependent continuant changes and +spatial changes. Next, we investigate a compositional approach to the identity +of processes: the identity of any process is determined by the identity of the +simple processes that compose them. Then, we consider a causal approach to the +identity of processes with recourse to a dispositional view of processes +according to which any process is a realization of some disposition. We also +examine assumptions on which these two approaches to the identity of processes +are based. + +
+
+
+
+
+ + ☆ Experimental Analysis of Large-scale Learnable Vector Storage + Compression + + +
+ Learnable embedding vector is one of the most important applications in +machine learning, and is widely used in various database-related domains. +However, the high dimensionality of sparse data in recommendation tasks and the +huge volume of corpus in retrieval-related tasks lead to a large memory +consumption of the embedding table, which poses a great challenge to the +training and deployment of models. Recent research has proposed various methods +to compress the embeddings at the cost of a slight decrease in model quality or +the introduction of other overheads. Nevertheless, the relative performance of +these methods remains unclear. Existing experimental comparisons only cover a +subset of these methods and focus on limited metrics. In this paper, we perform +a comprehensive comparative analysis and experimental evaluation of embedding +compression. We introduce a new taxonomy that categorizes these techniques +based on their characteristics and methodologies, and further develop a modular +benchmarking framework that integrates 14 representative methods. Under a +uniform test environment, our benchmark fairly evaluates each approach, +presents their strengths and weaknesses under different memory budgets, and +recommends the best method based on the use case. In addition to providing +useful guidelines, our study also uncovers the limitations of current methods +and suggests potential directions for future research. + +
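+<p>One representative family of methods evaluated in benchmarks of this kind is low-rank
+factorisation of the embedding table; a minimal sketch with illustrative sizes (not taken
+from the paper) is shown below.</p>
+<pre><code>
+import torch
+import torch.nn as nn
+
+class FactorizedEmbedding(nn.Module):
+    """Replace a (vocab, dim) table with (vocab, rank) x (rank, dim) factors."""
+    def __init__(self, num_embeddings, dim, rank):
+        super().__init__()
+        self.first = nn.Embedding(num_embeddings, rank)
+        self.second = nn.Linear(rank, dim, bias=False)
+
+    def forward(self, ids):
+        return self.second(self.first(ids))
+
+full = nn.Embedding(1_000_000, 128)
+compressed = FactorizedEmbedding(1_000_000, 128, rank=16)
+n_full = sum(p.numel() for p in full.parameters())
+n_comp = sum(p.numel() for p in compressed.parameters())
+print(f"compression ratio: {n_full / n_comp:.1f}x")   # roughly 8x here
+print(compressed(torch.tensor([3, 42, 7])).shape)     # torch.Size([3, 128])
+</code></pre>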
+
+
+
+
+ + ☆ Boot and Switch: Alternating Distillation for Zero-Shot Dense Retrieval EMNLP 2023 + + +
+ Neural 'dense' retrieval models are state of the art for many datasets, +however these models often exhibit limited domain transfer ability. Existing +approaches to adaptation are unwieldy, such as requiring explicit supervision, +complex model architectures, or massive external models. We present +$\texttt{ABEL}$, a simple but effective unsupervised method to enhance passage +retrieval in zero-shot settings. Our technique follows a straightforward loop: +a dense retriever learns from supervision signals provided by a reranker, and +subsequently, the reranker is updated based on feedback from the improved +retriever. By iterating this loop, the two components mutually enhance one +another's performance. Experimental results demonstrate that our unsupervised +$\texttt{ABEL}$ model outperforms both leading supervised and unsupervised +retrievers on the BEIR benchmark. Meanwhile, it exhibits strong adaptation +abilities to tasks and domains that were unseen during training. By either +fine-tuning $\texttt{ABEL}$ on labelled data or integrating it with existing +supervised dense retrievers, we achieve state-of-the-art +results.\footnote{Source code is available at +\url{https://github.com/Fantabulous-J/BootSwitch}.} + +
+
+ comment: Accepted by EMNLP 2023 Findings +
+
+
+
+
+ + ☆ Noisy Self-Training with Synthetic Queries for Dense Retrieval EMNLP 2023 + + +
+ Although existing neural retrieval models reveal promising results when +training data is abundant and the performance keeps improving as training data +increases, collecting high-quality annotated data is prohibitively costly. To +this end, we introduce a novel noisy self-training framework combined with +synthetic queries, showing that neural retrievers can be improved in a +self-evolution manner with no reliance on any external models. Experimental +results show that our method improves consistently over existing methods on +both general-domain (e.g., MS-MARCO) and out-of-domain (i.e., BEIR) retrieval +benchmarks. Extra analysis on low-resource settings reveals that our method is +data efficient and outperforms competitive baselines, with as little as 30% of +labelled training data. Further extending the framework for reranker training +demonstrates that the proposed method is general and yields additional gains on +tasks of diverse domains.\footnote{Source code is available at +\url{https://github.com/Fantabulous-J/Self-Training-DPR}} + +
+
+ comment: Accepted by EMNLP 2023 Findings +
+
+
+
+
+ + ☆ UFIN: Universal Feature Interaction Network for Multi-Domain + Click-Through Rate Prediction + + +
+ Click-Through Rate (CTR) prediction, which aims to estimate the probability
+of a user clicking on an item, is a key task in online advertising. Numerous
+existing CTR models concentrate on modeling the feature interactions within a
+solitary domain, thereby rendering them inadequate for multi-domain
+recommendation in real industrial scenarios. Some recent approaches propose
+intricate architectures to enhance knowledge sharing and augment model training
+across multiple domains. However, these approaches encounter difficulties when
+being transferred to new recommendation domains, owing to their reliance on the
+modeling of ID features (e.g., item id). To address the above issue, we propose
+the Universal Feature Interaction Network (UFIN) approach for CTR prediction.
+UFIN exploits textual data to learn universal feature interactions that can be
+effectively transferred across diverse domains. For learning universal feature
+representations, we regard text and features as two different modalities and
+propose an encoder-decoder network founded on a Large Language Model (LLM) to
+enforce the transfer of data from the text modality to the feature modality.
+Building upon the above foundation, we further develop a mixture-of-experts
+(MoE) enhanced adaptive feature interaction model to learn transferable
+collaborative patterns across multiple domains. Furthermore, we propose a
+multi-domain knowledge distillation framework to enhance feature interaction
+learning. Based on the above methods, UFIN can effectively bridge the semantic
+gap to learn common knowledge across various domains, surpassing the
+constraints of ID-based models. Extensive experiments conducted on eight
+datasets show the effectiveness of UFIN, in both multi-domain and
+cross-platform settings. Our code is available at
+https://github.com/RUCAIBox/UFIN.
+
</p>
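+<p>To ground the mixture-of-experts component mentioned above, here is a minimal
+soft-gated MoE layer; it is a generic sketch rather than UFIN's architecture, and all sizes
+are arbitrary.</p>
+<pre><code>
+import torch
+import torch.nn as nn
+
+class SoftMoE(nn.Module):
+    """Soft mixture of expert MLPs: output is a gate-weighted sum of experts."""
+    def __init__(self, dim, num_experts=4, hidden=64):
+        super().__init__()
+        self.gate = nn.Linear(dim, num_experts)
+        self.experts = nn.ModuleList(
+            nn.Sequential(nn.Linear(dim, hidden), nn.ReLU(), nn.Linear(hidden, dim))
+            for _ in range(num_experts)
+        )
+
+    def forward(self, x):
+        weights = torch.softmax(self.gate(x), dim=-1)                   # (batch, E)
+        expert_out = torch.stack([e(x) for e in self.experts], dim=1)   # (batch, E, dim)
+        return (weights.unsqueeze(-1) * expert_out).sum(dim=1)
+
+layer = SoftMoE(dim=32)
+print(layer(torch.randn(8, 32)).shape)   # torch.Size([8, 32])
+</code></pre>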
+
+
+
+
+ + ♻ ☆ Thoroughly Modeling Multi-domain Pre-trained Recommendation as Language + + +
+ With the thriving of pre-trained language model (PLM) widely verified in +various of NLP tasks, pioneer efforts attempt to explore the possible +cooperation of the general textual information in PLM with the personalized +behavioral information in user historical behavior sequences to enhance +sequential recommendation (SR). However, despite the commonalities of input +format and task goal, there are huge gaps between the behavioral and textual +information, which obstruct thoroughly modeling SR as language modeling via +PLM. To bridge the gap, we propose a novel Unified pre-trained language model +enhanced sequential recommendation (UPSR), aiming to build a unified +pre-trained recommendation model for multi-domain recommendation tasks. We +formally design five key indicators, namely naturalness, domain consistency, +informativeness, noise & ambiguity, and text length, to guide the text-item +adaptation and behavior sequence-text sequence adaptation differently for +pre-training and fine-tuning stages, which are essential but under-explored by +previous works. In experiments, we conduct extensive evaluations on seven +datasets with both tuning and zero-shot settings and achieve the overall best +performance. Comprehensive model analyses also provide valuable insights for +behavior modeling via PLM, shedding light on large pre-trained recommendation +models. The source codes will be released in the future. + +
+
+
+
+
+ + ♻ ☆ AI-Generated Images Introduce Invisible Relevance Bias to Text-Image + Retrieval + + +
+ With the advancement of generation models, AI-generated content (AIGC) is +becoming more realistic, flooding the Internet. A recent study suggests that +this phenomenon has elevated the issue of source bias in text retrieval for web +searches. Specifically, neural retrieval models tend to rank generated texts +higher than human-written texts. In this paper, we extend the study of this +bias to cross-modal retrieval. Firstly, we successfully construct a suitable +benchmark to explore the existence of the bias. Subsequent extensive +experiments on this benchmark reveal that AI-generated images introduce an +invisible relevance bias to text-image retrieval models. Specifically, our +experiments show that text-image retrieval models tend to rank the AI-generated +images higher than the real images, even though the AI-generated images do not +exhibit more visually relevant features to the query than real images. This +invisible relevance bias is prevalent across retrieval models with varying +training data and architectures. Furthermore, our subsequent exploration +reveals that the inclusion of AI-generated images in the training data of the +retrieval models exacerbates the invisible relevance bias. The above phenomenon +triggers a vicious cycle, which makes the invisible relevance bias become more +and more serious. To elucidate the potential causes of invisible relevance and +address the aforementioned issues, we introduce an effective training method +aimed at alleviating the invisible relevance bias. Subsequently, we apply our +proposed debiasing method to retroactively identify the causes of invisible +relevance, revealing that the AI-generated images induce the image encoder to +embed additional information into their representation. This information +exhibits a certain consistency across generated images with different semantics +and can make the retriever estimate a higher relevance score. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ Prompt Tuning on Graph-augmented Low-resource Text Classification + + +
+ Text classification is a fundamental problem in information retrieval with +many real-world applications, such as predicting the topics of online articles +and the categories of e-commerce product descriptions. However, low-resource +text classification, with no or few labeled samples, presents a serious concern +for supervised learning. Meanwhile, many text data are inherently grounded on a +network structure, such as a hyperlink/citation network for online articles, +and a user-item purchase network for e-commerce products. These graph +structures capture rich semantic relationships, which can potentially augment +low-resource text classification. In this paper, we propose a novel model +called Graph-Grounded Pre-training and Prompting (G2P2) to address low-resource +text classification in a two-pronged approach. During pre-training, we propose +three graph interaction-based contrastive strategies to jointly pre-train a +graph-text model; during downstream classification, we explore handcrafted +discrete prompts and continuous prompt tuning for the jointly pre-trained model +to achieve zero- and few-shot classification, respectively. Moreover, we +explore the possibility of employing continuous prompt tuning for zero-shot +inference. Specifically, we aim to generalize continuous prompts to unseen +classes while leveraging a set of base classes. To this end, we extend G2P2 +into G2P2$^*$, hinging on a new architecture of conditional prompt tuning. +Extensive experiments on four real-world datasets demonstrate the strength of +G2P2 in zero- and few-shot low-resource text classification tasks, and +illustrate the advantage of G2P2$^*$ in dealing with unseen classes. + +
+
+ comment: 14 pages, journal under review. arXiv admin note: substantial text + overlap with arXiv:2305.03324 +
+
+
+
+
+ + ♻ ☆ LM-Cocktail: Resilient Tuning of Language Models via Model Merging + + +
+ Pre-trained language models are continually fine-tuned to better support +downstream applications. However, this operation may result in significant +performance degradation on general tasks beyond the targeted domain. To +overcome this problem, we propose a novel method that enables the fine-tuned +model to stay resilient on general tasks. Our method is conducted in the +form of model merging (namely LM-Cocktail), where the fine-tuned language model +is merged with the pre-trained base model or the peer models from other domains +through a weighted average. Despite its simplicity, LM-Cocktail is surprisingly +effective: the resulting model is able to achieve a strong empirical performance +across the whole scope of general tasks while preserving a superior capacity in its +targeted domain. We conduct comprehensive experiments with the LLaMA and BGE models +on popular benchmarks, including FLAN, MMLU, and MTEB, whose results validate the +efficacy of our proposed method. The code and checkpoints are available at +https://github.com/FlagOpen/FlagEmbedding/tree/master/LM_Cocktail. + +
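+ The weighted-average merging described above can be sketched in a few lines of PyTorch. The code below is a hedged illustration, not the released LM-Cocktail implementation; the 0.6/0.4 mixing weights are placeholders.
+
+import torch
+
+def merge_models(state_dicts, weights):
+    # Weighted average of parameters from models sharing one architecture,
+    # e.g. a fine-tuned model, its base model, and peer models from other domains.
+    assert len(state_dicts) == len(weights)
+    merged = {}
+    for name in state_dicts[0]:
+        merged[name] = sum(w * sd[name].float() for w, sd in zip(weights, state_dicts))
+    return merged
+
+# usage sketch (weights are illustrative):
+# merged = merge_models([finetuned.state_dict(), base.state_dict()], [0.6, 0.4])
+# finetuned.load_state_dict(merged)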
+
+
+
+
+
+
+
+ + Machine Learning 150 + +
+
+
+ + ☆ Test-time Adaptation of Discriminative Models via Diffusion Generative + Feedback NeurIPS 2023 + + +
+ The advancements in generative modeling, particularly the advent of diffusion +models, have sparked a fundamental question: how can these models be +effectively used for discriminative tasks? In this work, we find that +generative models can be great test-time adapters for discriminative models. +Our method, Diffusion-TTA, adapts pre-trained discriminative models such as +image classifiers, segmenters, and depth predictors to each unlabelled example +in the test set using generative feedback from a diffusion model. We achieve +this by modulating the conditioning of the diffusion model using the output of +the discriminative model. We then maximize the image likelihood objective by +backpropagating the gradients to the discriminative model's parameters. We show +Diffusion-TTA significantly enhances the accuracy of various large-scale +pre-trained discriminative models, such as ImageNet classifiers, CLIP models, +image pixel labellers, and image depth predictors. Diffusion-TTA outperforms +existing test-time adaptation methods, including TTT-MAE and TENT, and +particularly shines in online adaptation setups, where the discriminative model +is continually adapted to each example in the test set. We provide access to +code, results, and visualizations on our website: +https://diffusion-tta.github.io/. + +
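+ A hedged sketch of the adaptation loop described above: the classifier's class probabilities condition a class-conditional diffusion model, and the denoising loss is backpropagated into the classifier. The diffusion interface (add_noise, predict_noise, num_steps) is an assumed, simplified one, not a specific library's API.
+
+import torch
+import torch.nn.functional as F
+
+def diffusion_tta_step(classifier, diffusion, image, optimizer, n_samples=4):
+    probs = classifier(image).softmax(dim=-1)               # soft conditioning signal
+    loss = 0.0
+    for _ in range(n_samples):
+        t = torch.randint(0, diffusion.num_steps, (image.size(0),), device=image.device)
+        noise = torch.randn_like(image)
+        noisy = diffusion.add_noise(image, noise, t)        # forward diffusion
+        pred = diffusion.predict_noise(noisy, t, probs)     # conditioned on classifier output
+        loss = loss + F.mse_loss(pred, noise)               # denoising objective
+    optimizer.zero_grad()
+    (loss / n_samples).backward()    # gradients reach the classifier through probs
+    optimizer.step()                 # optimizer holds the classifier's parameters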
+
+ comment: Accepted at NeurIPS 2023 Webpage with Code: + https://diffusion-tta.github.io/ +
+
+
+
+
+ + ☆ How Many Unicorns Are in This Image? A Safety Evaluation Benchmark for + Vision LLMs SC + + +
+ This work focuses on the potential of Vision LLMs (VLLMs) in visual +reasoning. Different from prior studies, we shift our focus from evaluating +standard performance to introducing a comprehensive safety evaluation suite, +covering both out-of-distribution (OOD) generalization and adversarial +robustness. For the OOD evaluation, we present two novel VQA datasets, each +with one variant, designed to test model performance under challenging +conditions. In exploring adversarial robustness, we propose a straightforward +attack strategy for misleading VLLMs into producing responses unrelated to the +visual input. Moreover, we assess the efficacy of two jailbreaking strategies, targeting +either the vision or language component of VLLMs. Our evaluation of 21 diverse +models, ranging from open-source VLLMs to GPT-4V, yields interesting +observations: 1) Current VLLMs struggle with OOD texts but not images, unless +the visual information is limited; and 2) These VLLMs can be easily misled by +deceiving vision encoders only, and their vision-language training often +compromises safety protocols. We release this safety evaluation suite at +https://github.com/UCSC-VLAA/vllm-safety-benchmark. + +
+
+ comment: H.T., C.C., and Z.W. contribute equally. Work done during H.T. and + Z.W.'s internship at UCSC, and C.C. and Y.Z.'s internship at UNC +
+
+
+
+
+ + ☆ On Bringing Robots Home + + +
+ Throughout history, we have successfully integrated various machines into our +homes. Dishwashers, laundry machines, stand mixers, and robot vacuums are a few +recent examples. However, these machines excel at performing only a single task +effectively. The concept of a "generalist machine" in homes - a domestic +assistant that can adapt and learn from our needs, all while remaining +cost-effective - has long been a goal in robotics that has been steadily +pursued for decades. In this work, we initiate a large-scale effort towards +this goal by introducing Dobb-E, an affordable yet versatile general-purpose +system for learning robotic manipulation within household settings. Dobb-E can +learn a new task with only five minutes of a user showing it how to do it, +thanks to a demonstration collection tool ("The Stick") we built out of cheap +parts and iPhones. We use the Stick to collect 13 hours of data in 22 homes of +New York City, and train Home Pretrained Representations (HPR). Then, in a +novel home environment, with five minutes of demonstrations and fifteen minutes +of adapting the HPR model, we show that Dobb-E can reliably solve the task on +the Stretch, a mobile robot readily available on the market. Across roughly 30 +days of experimentation in homes of New York City and surrounding areas, we +test our system in 10 homes, with a total of 109 tasks in different +environments, and finally achieve a success rate of 81%. Beyond success +percentages, our experiments reveal a plethora of unique challenges absent or +ignored in lab robotics. These range from effects of strong shadows, to +variable demonstration quality by non-expert users. With the hope of +accelerating research on home robots, and eventually seeing robot butlers in +every home, we open-source Dobb-E software stack and models, our data, and our +hardware designs at https://dobb-e.com + +
+
+ comment: Project website and videos are available at https://dobb-e.com, + technical documentation for getting started is available at + https://docs.dobb-e.com, and code is released at + https://github.com/notmahi/dobb-e +
+
+
+
+
+ + ☆ Have we built machines that think like people? + + +
+ A chief goal of artificial intelligence is to build machines that think like +people. Yet it has been argued that deep neural network architectures fail to +accomplish this. Researchers have asserted these models' limitations in the +domains of causal reasoning, intuitive physics, and intuitive psychology. Yet +recent advancements, namely the rise of large language models, particularly +those designed for visual processing, have rekindled interest in the potential +to emulate human-like cognitive abilities. This paper evaluates the current +state of vision-based large language models in the domains of intuitive +physics, causal reasoning, and intuitive psychology. Through a series of +controlled experiments, we investigate the extent to which these modern models +grasp complex physical interactions, causal relationships, and intuitive +understanding of others' preferences. Our findings reveal that, while these +models demonstrate a notable proficiency in processing and interpreting visual +data, they still fall short of human capabilities in these areas. The models +exhibit a rudimentary understanding of physical laws and causal relationships, +but their performance is hindered by a lack of deeper insight, a key aspect of +human cognition. Furthermore, in tasks requiring an intuitive theory of mind, +the models fail altogether. Our results emphasize the need for integrating more +robust mechanisms for understanding causality, physical dynamics, and social +cognition into modern-day, vision-based language models, and point out the +importance of cognitively-inspired benchmarks. + +
+
+
+
+
+ + ☆ Interactive Autonomous Navigation with Internal State Inference and + Interactivity Estimation + + +
+ Deep reinforcement learning (DRL) provides a promising way for intelligent +agents (e.g., autonomous vehicles) to learn to navigate complex scenarios. +However, DRL with neural networks as function approximators is typically +considered a black box with little explainability and often suffers from +suboptimal performance, especially for autonomous navigation in highly +interactive multi-agent environments. To address these issues, we propose three +auxiliary tasks with spatio-temporal relational reasoning and integrate them +into the standard DRL framework, which improves the decision making performance +and provides explainable intermediate indicators. We propose to explicitly +infer the internal states (i.e., traits and intentions) of surrounding agents +(e.g., human drivers) as well as to predict their future trajectories in the +situations with and without the ego agent through counterfactual reasoning. +These auxiliary tasks provide additional supervision signals to infer the +behavior patterns of other interactive agents. Multiple variants of framework +integration strategies are compared. We also employ a spatio-temporal graph +neural network to encode relations between dynamic entities, which enhances +both internal state inference and decision making of the ego agent. Moreover, +we propose an interactivity estimation mechanism based on the difference +between predicted trajectories in these two situations, which indicates the +degree of influence of the ego agent on other agents. To validate the proposed +method, we design an intersection driving simulator based on the Intelligent +Intersection Driver Model (IIDM) that simulates vehicles and pedestrians. Our +approach achieves robust and state-of-the-art performance in terms of standard +evaluation metrics and provides explainable intermediate indicators (i.e., +internal states, and interactivity scores) for decision making. + +
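+ One plausible reading of the interactivity estimation mechanism described above is an average displacement between the trajectory predicted with the ego agent present and the counterfactual prediction without it; the sketch below illustrates that reading and is not the paper's exact metric.
+
+import numpy as np
+
+def interactivity_score(traj_with_ego, traj_without_ego):
+    # Both arrays have shape [timesteps, 2] holding (x, y) positions.
+    return float(np.mean(np.linalg.norm(traj_with_ego - traj_without_ego, axis=-1)))
+
+t = np.linspace(0.0, 5.0, 11)
+with_ego = np.stack([t, np.zeros_like(t)], axis=-1)
+without_ego = np.stack([t, 0.5 * np.ones_like(t)], axis=-1)   # drifts 0.5 m laterally
+print(interactivity_score(with_ego, without_ego))             # 0.5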
+
+ comment: 18 pages, 14 figures +
+
+
+
+
+ + ☆ MAST: Model-Agnostic Sparsified Training + + +
+ We introduce a novel optimization problem formulation that departs from the +conventional way of minimizing machine learning model loss as a black-box +function. Unlike traditional formulations, the proposed approach explicitly +incorporates an initially pre-trained model and random sketch operators, +allowing for sparsification of both the model and gradient during training. We +establish insightful properties of the proposed objective function and +highlight its connections to the standard formulation. Furthermore, we present +several variants of the Stochastic Gradient Descent (SGD) method adapted to the +new problem formulation, including SGD with general sampling, a distributed +version, and SGD with variance reduction techniques. We achieve tighter +convergence rates and relax assumptions, bridging the gap between theoretical +principles and practical applications, covering several important techniques +such as Dropout and Sparse training. This work presents promising opportunities +to enhance the theoretical understanding of model training through a +sparsification-aware optimization approach. + +
+
+ comment: 58 pages, 5 figures +
+
+
+
+
+ + ☆ Transformer-QEC: Quantum Error Correction Code Decoding with + Transferable Transformers FAST + + +
+ Quantum computing has the potential to solve problems that are intractable +for classical systems, yet the high error rates in contemporary quantum devices +often exceed tolerable limits for useful algorithm execution. Quantum Error +Correction (QEC) mitigates this by employing redundancy, distributing quantum +information across multiple data qubits and utilizing syndrome qubits to +monitor their states for errors. The syndromes are subsequently interpreted by +a decoding algorithm to identify and correct errors in the data qubits. This +task is complex due to the multiplicity of error sources affecting both data +and syndrome qubits as well as syndrome extraction operations. Additionally, +identical syndromes can emanate from different error sources, necessitating a +decoding algorithm that evaluates syndromes collectively. Although machine +learning (ML) decoders such as multi-layer perceptrons (MLPs) and convolutional +neural networks (CNNs) have been proposed, they often focus on local syndrome +regions and require retraining when adjusting for different code distances. We +introduce a transformer-based QEC decoder which employs self-attention to +achieve a global receptive field across all input syndromes. It incorporates a +mixed loss training approach, combining both local physical error and global +parity label losses. Moreover, the transformer architecture's inherent +adaptability to variable-length inputs allows for efficient transfer learning, +enabling the decoder to adapt to varying code distances without retraining. + Evaluation on six code distances and ten different error configurations +demonstrates that our model consistently outperforms non-ML decoders, such as +Union Find (UF) and Minimum Weight Perfect Matching (MWPM), and other ML +decoders, thereby achieving the best logical error rates. Moreover, transfer +learning saves over 10x in training cost. + +
+
+ comment: Accepted to ICCAD 2023, FAST ML for Science Workshop; 7 pages, 8 + figures +
+
+
+
+
+ + ☆ XLB: Distributed Multi-GPU Lattice Boltzmann Simulation Framework for + Differentiable Scientific Machine Learning + + +
+ The lattice Boltzmann method (LBM) has emerged as a prominent technique for +solving fluid dynamics problems due to its algorithmic potential for +computational scalability. We introduce the XLB framework, a Python-based +differentiable LBM library which harnesses the capabilities of the JAX +framework. The architecture of XLB is predicated upon ensuring accessibility, +extensibility, and computational performance, enabling it to scale effectively +across CPU, multi-GPU, and distributed multi-GPU systems. The framework can be +readily augmented with novel boundary conditions, collision models, or +simulation capabilities. XLB offers the unique advantage of integration with +JAX's extensive machine learning ecosystem, and the ability to utilize +automatic differentiation for tackling physics-based machine learning, +optimization, and inverse problems. XLB has been successfully scaled to handle +simulations with billions of cells, achieving giga-scale lattice updates per +second. XLB is released under the permissive Apache-2.0 license and is +available on GitHub at https://github.com/Autodesk/XLB. + +
+
+
+
+
+ + ☆ MEDITRON-70B: Scaling Medical Pretraining for Large Language Models + + +
+ Large language models (LLMs) can potentially democratize access to medical +knowledge. While many efforts have been made to harness and improve LLMs' +medical knowledge and reasoning capacities, the resulting models are either +closed-source (e.g., PaLM, GPT-4) or limited in scale (<= 13B parameters), +which restricts their abilities. In this work, we improve access to large-scale +medical LLMs by releasing MEDITRON: a suite of open-source LLMs with 7B and 70B +parameters adapted to the medical domain. MEDITRON builds on Llama-2 (through +our adaptation of Nvidia's Megatron-LM distributed trainer), and extends +pretraining on a comprehensively curated medical corpus, including selected +PubMed articles, abstracts, and internationally-recognized medical guidelines. +Evaluations using four major medical benchmarks show significant performance +gains over several state-of-the-art baselines before and after task-specific +finetuning. Overall, MEDITRON achieves a 6% absolute performance gain over the +best public baseline in its parameter class and 3% over the strongest baseline +we finetuned from Llama-2. Compared to closed-source LLMs, MEDITRON-70B +outperforms GPT-3.5 and Med-PaLM and is within 5% of GPT-4 and 10% of +Med-PaLM-2. We release our code for curating the medical pretraining corpus and +the MEDITRON model weights to drive open-source development of more capable +medical LLMs. + +
+
+
+
+
+ + ☆ A Survey on Vulnerability of Federated Learning: A Learning Algorithm + Perspective + + +
+ This review paper takes a comprehensive look at malicious attacks against +federated learning (FL), categorizing them from new perspectives on attack origins and targets, and +providing insights into their methodology and impact. In this survey, we focus +on threat models targeting the learning process of FL systems. Based on the +source and target of the attack, we categorize existing threat models into four +types: Data to Model (D2M), Model to Data (M2D), Model to Model (M2M), and +composite attacks. For each attack type, we discuss the defense strategies +proposed, highlighting their effectiveness, assumptions, and potential areas for +improvement. Defense strategies have evolved from using a single metric to +exclude malicious clients to employing a multifaceted approach that examines +client models at various phases. In this survey paper, our research indicates +that the data to be learned, the learning gradients, and the learned model at +different stages can all be manipulated to initiate malicious attacks that +range from undermining model performance and reconstructing private local data +to inserting backdoors. We have also seen that these threats are becoming more +insidious. While earlier studies typically amplified malicious gradients, +recent endeavors subtly alter the least significant weights in local models to +bypass defense measures. This literature review provides a holistic +understanding of the current FL threat landscape and highlights the importance +of developing robust, efficient, and privacy-preserving defenses to ensure the +safe and trusted adoption of FL in real-world applications. + +
+
+ comment: https://github.com/Rand2AI/Awesome-Vulnerability-of-Federated-Learning +
+
+
+
+
+ + ☆ Metric Space Magnitude for Evaluating Unsupervised Representation + Learning + + +
+ The magnitude of a metric space was recently established as a novel +invariant, providing a measure of the `effective size' of a space across +multiple scales. By capturing both geometrical and topological properties of +data, magnitude is poised to address challenges in unsupervised representation +learning tasks. We formalise a novel notion of dissimilarity between magnitude +functions of finite metric spaces and use them to derive a quality measure for +dimensionality reduction tasks. Our measure is provably stable under +perturbations of the data, can be efficiently calculated, and enables a +rigorous multi-scale comparison of embeddings. We show the utility of our +measure in an experimental suite that comprises different domains and tasks, +including the comparison of data visualisations. + +
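+ For a finite metric space, the magnitude at scale $t$ can be computed directly from the distance matrix: with $Z_{ij} = e^{-t\,d(x_i, x_j)}$, the magnitude is the sum of the entries of $Z^{-1}$ (when $Z$ is invertible), and the magnitude function is this quantity viewed as a function of $t$. The sketch below follows that standard definition; it is not the authors' code.
+
+import numpy as np
+from scipy.spatial.distance import cdist
+
+def magnitude(points, t=1.0):
+    Z = np.exp(-t * cdist(points, points))   # similarity matrix exp(-t * d_ij)
+    return float(np.linalg.inv(Z).sum())     # sum of entries of the inverse
+
+def magnitude_function(points, scales):
+    return np.array([magnitude(points, t) for t in scales])
+
+X = np.random.default_rng(0).normal(size=(50, 3))
+print(magnitude_function(X, [0.1, 1.0, 10.0]))   # approaches |X| = 50 as t grows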
+
+
+
+
+ + ☆ OccWorld: Learning a 3D Occupancy World Model for Autonomous Driving + + +
+ Understanding how the 3D scene evolves is vital for making decisions in +autonomous driving. Most existing methods achieve this by predicting the +movements of object boxes, which cannot capture more fine-grained scene +information. In this paper, we explore a new framework of learning a world +model, OccWorld, in the 3D Occupancy space to simultaneously predict the +movement of the ego car and the evolution of the surrounding scenes. We propose +to learn a world model based on 3D occupancy rather than 3D bounding boxes and +segmentation maps for three reasons: 1) expressiveness. 3D occupancy can +describe the more fine-grained 3D structure of the scene; 2) efficiency. 3D +occupancy is more economical to obtain (e.g., from sparse LiDAR points). 3) +versatility. 3D occupancy can adapt to both vision and LiDAR. To facilitate the +modeling of the world evolution, we learn a reconstruction-based scene +tokenizer on the 3D occupancy to obtain discrete scene tokens to describe the +surrounding scenes. We then adopt a GPT-like spatial-temporal generative +transformer to generate subsequent scene and ego tokens to decode the future +occupancy and ego trajectory. Extensive experiments on the widely used nuScenes +benchmark demonstrate the ability of OccWorld to effectively model the +evolution of the driving scenes. OccWorld also produces competitive planning +results without using instance and map supervision. Code: +https://github.com/wzzheng/OccWorld. + +
+
+ comment: Code is available at: https://github.com/wzzheng/OccWorld +
+
+
+
+
+ + ☆ RobustState: Boosting Fidelity of Quantum State Preparation via + Noise-Aware Variational Training FAST + + +
+ Quantum state preparation, a crucial subroutine in quantum computing, +involves generating a target quantum state from initialized qubits. Arbitrary +state preparation algorithms can be broadly categorized into arithmetic +decomposition (AD) and variational quantum state preparation (VQSP). AD employs +a predefined procedure to decompose the target state into a series of gates, +whereas VQSP iteratively tunes ansatz parameters to approximate target state. +VQSP is particularly apt for Noisy-Intermediate Scale Quantum (NISQ) machines +due to its shorter circuits. However, achieving noise-robust parameter +optimization still remains challenging. + We present RobustState, a novel VQSP training methodology that combines high +robustness with high training efficiency. The core idea involves utilizing +measurement outcomes from real machines to perform back-propagation through +classical simulators, thus incorporating real quantum noise into gradient +calculations. RobustState serves as a versatile, plug-and-play technique +applicable for training parameters from scratch or fine-tuning existing +parameters to enhance fidelity on target machines. It is adaptable to various +ansatzes at both gate and pulse levels and can even benefit other variational +algorithms, such as variational unitary synthesis. + Comprehensive evaluation of RobustState on state preparation tasks for 4 +distinct quantum algorithms using 10 real quantum machines demonstrates a +coherent error reduction of up to 7.1 $\times$ and state fidelity improvement +of up to 96\% and 81\% for 4-Q and 5-Q states, respectively. On average, +RobustState improves fidelity by 50\% and 72\% for 4-Q and 5-Q states compared +to baseline approaches. + +
+
+ comment: Accepted to FASTML @ ICCAD 2023. 14 pages, 20 figures +
+
+
+
+
+ + ☆ Machine Learning-Enhanced Aircraft Landing Scheduling under + Uncertainties + + +
+ This paper addresses aircraft delays, emphasizing their impact on safety and +financial losses. To mitigate these issues, an innovative machine learning +(ML)-enhanced landing scheduling methodology is proposed, aiming to improve +automation and safety. Analyzing flight arrival delay scenarios reveals strong +multimodal distributions and clusters in arrival flight time durations. A +multi-stage conditional ML predictor enhances separation time prediction based +on flight events. ML predictions are then integrated as safety constraints in a +time-constrained traveling salesman problem formulation, solved using +mixed-integer linear programming (MILP). Historical flight recordings and model +predictions address uncertainties between successive flights, ensuring +reliability. The proposed method is validated using real-world data from the +Atlanta Air Route Traffic Control Center (ARTCC ZTL). Case studies demonstrate +an average 17.2% reduction in total landing time compared to the +First-Come-First-Served (FCFS) rule. Unlike FCFS, the proposed methodology +considers uncertainties, instilling confidence in scheduling. The study +concludes with remarks and outlines future research directions. + +
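+ To make the scheduling formulation concrete, the sketch below solves a heavily simplified version: landing times for a fixed sequence with ML-predicted minimum separations as constraints, posed as a small linear program (the paper's formulation is a MILP over sequences). All numbers are hypothetical.
+
+from pulp import LpMinimize, LpProblem, LpVariable, lpSum, value
+
+eta = [0.0, 2.0, 3.0, 9.0]   # earliest landing times (minutes), fixed sequence
+sep = [3.0, 2.5, 3.0]        # predicted separation between consecutive aircraft
+
+prob = LpProblem("landing_schedule", LpMinimize)
+t = [LpVariable(f"t_{i}", lowBound=eta[i]) for i in range(len(eta))]
+prob += lpSum(t)                             # minimize total landing time
+for i in range(len(sep)):
+    prob += t[i + 1] >= t[i] + sep[i]        # separation constraints
+
+prob.solve()
+print([value(ti) for ti in t])               # e.g. [0.0, 3.0, 5.5, 9.0]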
+
+
+
+
+ + ☆ A Neural Framework for Generalized Causal Sensitivity Analysis + + +
+ Unobserved confounding is common in many applications, making causal +inference from observational data challenging. As a remedy, causal sensitivity +analysis is an important tool to draw causal conclusions under unobserved +confounding with mathematical guarantees. In this paper, we propose NeuralCSA, +a neural framework for generalized causal sensitivity analysis. Unlike previous +work, our framework is compatible with (i) a large class of sensitivity models, +including the marginal sensitivity model, f-sensitivity models, and Rosenbaum's +sensitivity model; (ii) different treatment types (i.e., binary and +continuous); and (iii) different causal queries, including (conditional) +average treatment effects and simultaneous effects on multiple outcomes. The +generality of NeuralCSA is achieved by learning a latent distribution +shift that corresponds to a treatment intervention using two conditional +normalizing flows. We provide theoretical guarantees that NeuralCSA is able to +infer valid bounds on the causal query of interest and also demonstrate this +empirically using both simulated and real-world data. + +
+
+
+
+
+ + ☆ Scheduling and Communication Schemes for Decentralized Federated + Learning + + +
+ Federated learning (FL) is a distributed machine learning paradigm in which a +large number of clients coordinate with a central server to learn a model +without sharing their own training data. A single central server is often not +enough, due to connectivity problems with clients. In this paper, a decentralized +federated learning (DFL) model with the stochastic gradient descent (SGD) +algorithm has been introduced, as a more scalable approach to improve the +learning performance in a network of agents with arbitrary topology. Three +scheduling policies for DFL have been proposed for communications between the +clients and the parallel servers, and the convergence, accuracy, and loss have +been tested in a totally decentralized implementation of SGD. The experimental +results show that the proposed scheduling policies have an impact both on the +speed of convergence and on the final global model. + +
+
+ comment: 32nd International Conference on Computer Theory and Applications + (ICCTA), Alexandria, Egypt, 2022 +
+
+
+
+
+ + ☆ Using Decentralized Aggregation for Federated Learning with Differential + Privacy + + +
+ Nowadays, the ubiquitous usage of mobile devices and networks has raised +concerns about the loss of control over personal data, and research is advancing +towards the trade-off between privacy and utility in scenarios that combine +communication exchanges, big databases, and distributed and collaborative (P2P) +machine learning techniques. On the other hand, although Federated Learning +(FL) provides some level of privacy by retaining the data at the local node, +which executes a local training to enrich a global model, this scenario is +still susceptible to privacy breaches such as membership inference attacks. To +provide a stronger level of privacy, this research deploys an experimental +environment for FL with Differential Privacy (DP) using benchmark datasets. The +results, obtained by means of a classification example, show that the choice of DP +parameters and techniques is central to the aforementioned trade-off between +privacy and utility. + +
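+ The core DP mechanism evaluated in such experiments is typically clipping each client update to a maximum L2 norm and adding calibrated Gaussian noise before aggregation. The sketch below shows that standard mechanism only; privacy accounting is out of scope and the parameter values are illustrative.
+
+import numpy as np
+
+def privatize_update(update, clip_norm=1.0, noise_multiplier=1.0, rng=None):
+    rng = rng or np.random.default_rng()
+    norm = np.linalg.norm(update)
+    clipped = update * min(1.0, clip_norm / (norm + 1e-12))            # L2 clipping
+    noise = rng.normal(0.0, noise_multiplier * clip_norm, update.shape)
+    return clipped + noise                                             # Gaussian mechanism
+
+client_updates = [np.random.randn(10) for _ in range(5)]
+global_update = np.mean([privatize_update(u) for u in client_updates], axis=0)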
+
+
+
+
+ + ☆ Improved Data Generation for Enhanced Asset Allocation: A Synthetic + Dataset Approach for the Fixed Income Universe + + +
+ We present a novel process for generating synthetic datasets tailored to +assess asset allocation methods and construct portfolios within the fixed +income universe. Our approach begins by enhancing the CorrGAN model to generate +synthetic correlation matrices. Subsequently, we propose an Encoder-Decoder +model that samples additional data conditioned on a given correlation matrix. +The resulting synthetic dataset facilitates in-depth analyses of asset +allocation methods across diverse asset universes. Additionally, we provide a +case study that exemplifies the use of the synthetic dataset to improve +portfolios constructed within a simulation-based asset allocation process. + +
+
+
+
+
+ + ☆ Forecasting Auxiliary Energy Consumption for Electric Heavy-Duty + Vehicles + + +
+ Accurate energy consumption prediction is crucial for optimizing the +operation of electric commercial heavy-duty vehicles, e.g., route planning for +charging. Moreover, understanding why certain predictions are made is paramount +for such a predictive model to gain user trust and be deployed in practice. +Since commercial vehicles operate differently as transportation tasks, ambient +conditions, and drivers vary, a heterogeneous population is expected when building an AI +system for forecasting energy consumption. The dependencies between the input +features and the target values are expected to also differ across +sub-populations. One well-known example of such a statistical phenomenon is +Simpson's paradox. In this paper, we illustrate that such a setting poses a +challenge for existing XAI methods that produce global feature statistics, e.g., +LIME or SHAP, causing them to yield misleading results. We demonstrate a +potential solution by training multiple regression models on subsets of data. +It not only leads to superior regression performance but also to more relevant and +consistent LIME explanations. Given that the employed groupings correspond to +relevant sub-populations, the associations between the input features and the +target values are consistent within each cluster but different across clusters. +Experiments on both synthetic and real-world datasets show that such splitting +of a complex problem into simpler ones yields better regression performance and +interpretability. + +
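+ A toy reproduction of the effect described above, assuming nothing about the paper's data: a single global regression on Simpson-style data reports a blended slope even though one sub-population trends negative, while clustering first and fitting per-cluster models recovers both slopes.
+
+import numpy as np
+from sklearn.cluster import KMeans
+from sklearn.linear_model import LinearRegression
+
+rng = np.random.default_rng(0)
+x1 = rng.uniform(0, 1, 200); y1 = 2.0 * x1 + rng.normal(0, 0.1, 200)
+x2 = rng.uniform(2, 3, 200); y2 = -2.0 * x2 + 8 + rng.normal(0, 0.1, 200)
+X = np.concatenate([x1, x2]).reshape(-1, 1)
+y = np.concatenate([y1, y2])
+
+print(LinearRegression().fit(X, y).coef_)           # misleading aggregate slope
+labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(X)
+for k in range(2):
+    model = LinearRegression().fit(X[labels == k], y[labels == k])
+    print(k, model.coef_)                            # per-cluster slopes near +2 and -2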
+
+
+
+
+ + ☆ Automated Measurement of Vascular Calcification in Femoral + Endarterectomy Patients Using Deep Learning + + +
+ Atherosclerosis, a chronic inflammatory disease affecting the large arteries, +presents a global health risk. Accurate analysis of diagnostic images, like +computed tomographic angiograms (CTAs), is essential for staging and monitoring +the progression of atherosclerosis-related conditions, including peripheral +arterial disease (PAD). However, manual analysis of CTA images is +time-consuming and tedious. To address this limitation, we employed a deep +learning model to segment the vascular system in CTA images of PAD patients +undergoing femoral endarterectomy surgery and to measure vascular calcification +from the left renal artery to the patella. Utilizing proprietary CTA images of +27 patients undergoing femoral endarterectomy surgery provided by Prisma Health +Midlands, we developed a Deep Neural Network (DNN) model to first segment the +arterial system, starting from the descending aorta to the patella, and second, +to provide a metric of arterial calcification. Our designed DNN achieved 83.4% +average Dice accuracy in segmenting arteries from aorta to patella, advancing +the state-of-the-art by 0.8%. Furthermore, our work is the first to present a +robust statistical analysis of automated calcification measurement in the lower +extremities using deep learning, attaining a Mean Absolute Percentage Error +(MAPE) of 9.5% and a correlation coefficient of 0.978 between automated and +manual calcification scores. These findings underscore the potential of deep +learning techniques as a rapid and accurate tool for medical professionals to +assess calcification in the abdominal aorta and its branches above the patella. +The developed DNN model and related documentation in this project are available +at GitHub page at https://github.com/pip-alireza/DeepCalcScoring. + +
+
+ comment: Published in MDPI Diagnostic journal, the code can be accessed via + the GitHub link in the paper +
+
+
+
+
+ + ☆ Closing the ODE-SDE gap in score-based diffusion models through the + Fokker-Planck equation + + +
+ Score-based diffusion models have emerged as one of the most promising +frameworks for deep generative modelling, due to their state-of-the-art +performance in many generation tasks while relying on mathematical foundations +such as stochastic differential equations (SDEs) and ordinary differential +equations (ODEs). Empirically, it has been reported that ODE-based samples are +inferior to SDE-based samples. In this paper we rigorously describe the range +of dynamics and approximations that arise when training score-based diffusion +models, including the true SDE dynamics, the neural approximations, the various +approximate particle dynamics that result, as well as their associated +Fokker--Planck equations and the neural network approximations of these +Fokker--Planck equations. We systematically analyse the difference between the +ODE and SDE dynamics of score-based diffusion models, and link it to an +associated Fokker--Planck equation. We derive a theoretical upper bound on the +Wasserstein 2-distance between the ODE- and SDE-induced distributions in terms +of a Fokker--Planck residual. We also show numerically that conventional +score-based diffusion models can exhibit significant differences between ODE- +and SDE-induced distributions, which we demonstrate using explicit comparisons. +Moreover, we show numerically that reducing the Fokker--Planck residual by +adding it as an additional regularisation term leads to closing the gap between +ODE- and SDE-induced distributions. Our experiments suggest that this +regularisation can improve the distribution generated by the ODE, although +this can come at the cost of degraded SDE sample quality. + +
+
+
+
+
+ + ☆ Sensitivity-Based Layer Insertion for Residual and Feedforward Neural + Networks + + +
+ The training of neural networks requires tedious and often manual tuning of +the network architecture. We propose a systematic method to insert new layers +during the training process, which eliminates the need to choose a fixed +network size before training. Our approach borrows techniques from constrained +optimization and is based on first-order sensitivity information of the +objective with respect to the virtual parameters that additional layers, if +inserted, would offer. We consider fully connected feedforward networks with +selected activation functions as well as residual neural networks. In numerical +experiments, the proposed sensitivity-based layer insertion technique exhibits +improved decay of the training loss compared to not inserting the layer. Furthermore, the +computational effort is reduced in comparison to inserting the layer from the +beginning. The code is available at +https://github.com/LeonieKreis/layer_insertion_sensitivity_based. + +
+
+
+
+
+ + ☆ Should We Learn Most Likely Functions or Parameters? NeurIPS 2023 + + +
+ Standard regularized training procedures correspond to maximizing a posterior +distribution over parameters, known as maximum a posteriori (MAP) estimation. +However, model parameters are of interest only insomuch as they combine with +the functional form of a model to provide a function that can make good +predictions. Moreover, the most likely parameters under the parameter posterior +do not generally correspond to the most likely function induced by the +parameter posterior. In fact, we can re-parametrize a model such that any +setting of parameters can maximize the parameter posterior. As an alternative, +we investigate the benefits and drawbacks of directly estimating the most +likely function implied by the model and the data. We show that this procedure +leads to pathological solutions when using neural networks and prove conditions +under which the procedure is well-behaved, as well as a scalable approximation. +Under these conditions, we find that function-space MAP estimation can lead to +flatter minima, better generalization, and improved robustness to overfitting. + +
+
+ comment: NeurIPS 2023. Code available at + https://github.com/activatedgeek/function-space-map +
+
+
+
+
+ + ☆ Sparsify-then-Classify: From Internal Neurons of Large Language Models + To Efficient Text Classifiers + + +
+ Among the many tasks that Large Language Models (LLMs) have revolutionized is +text classification. However, existing approaches for applying pretrained LLMs +to text classification predominantly rely on using single token outputs from +only the last layer of hidden states. As a result, they suffer from limitations +in efficiency, task-specificity, and interpretability. In our work, we +contribute an approach that uses all internal representations by employing +multiple pooling strategies on all activation and hidden states. Our novel +lightweight strategy, Sparsify-then-Classify (STC) first sparsifies +task-specific features layer-by-layer, then aggregates across layers for text +classification. STC can be applied as a seamless plug-and-play module on top of +existing LLMs. Our experiments on a comprehensive set of models and datasets +demonstrate that STC not only consistently improves the classification +performance of pretrained and fine-tuned models, but is also more efficient for +both training and inference, and is more intrinsically interpretable. + +
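+ A simplified stand-in for the idea of using all layers rather than only the last: pool every layer's hidden states from a pretrained encoder and fit an L1-penalized (hence sparse) linear classifier on the concatenation. This is a hedged sketch of the general recipe, not the STC module itself; the model name and hyperparameters are placeholders.
+
+import torch
+from transformers import AutoModel, AutoTokenizer
+from sklearn.linear_model import LogisticRegression
+
+tok = AutoTokenizer.from_pretrained("bert-base-uncased")
+enc = AutoModel.from_pretrained("bert-base-uncased")
+
+def all_layer_features(texts):
+    batch = tok(texts, padding=True, truncation=True, return_tensors="pt")
+    with torch.no_grad():
+        out = enc(**batch, output_hidden_states=True)
+    pooled = [h.mean(dim=1) for h in out.hidden_states]   # mean-pool tokens per layer
+    return torch.cat(pooled, dim=-1).numpy()              # [batch, n_layers * hidden]
+
+texts = ["the movie was great", "terrible and boring", "loved it", "awful film"]
+labels = [1, 0, 1, 0]
+clf = LogisticRegression(penalty="l1", C=0.5, solver="liblinear")
+clf.fit(all_layer_features(texts), labels)
+print((clf.coef_ != 0).sum(), "features kept of", clf.coef_.size)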
+
+ comment: 23 pages, 5 figures, 8 tables Code available at + https://github.com/difanj0713/Sparsify-then-Classify +
+
+
+
+
+ + ☆ Soil Organic Carbon Estimation from Climate-related Features with Graph + Neural Network + + +
+ Soil organic carbon (SOC) plays a pivotal role in the global carbon cycle, +impacting climate dynamics and necessitating accurate estimation for +sustainable land and agricultural management. While traditional methods of SOC +estimation face resolution and accuracy challenges, recent technological +solutions harness remote sensing, machine learning, and high-resolution +satellite mapping. Graph Neural Networks (GNNs), especially when integrated +with positional encoders, can capture complex relationships between soil and +climate. Using the LUCAS database, this study compared four GNN operators in +the positional encoder framework. Results revealed that the PESAGE and +PETransformer models outperformed others in SOC estimation, indicating their +potential in capturing the complex relationship between SOC and climate +features. Our findings confirm the feasibility of applications of GNN +architectures in SOC prediction, establishing a framework for future +explorations of this topic with more advanced GNN models. + +
+
+
+
+
+ + ☆ Towards Transfer Learning for Large-Scale Image Classification Using + Annealing-based Quantum Boltzmann Machines + + +
+ Quantum Transfer Learning (QTL) recently gained popularity as a hybrid +quantum-classical approach for image classification tasks by efficiently +combining the feature extraction capabilities of large Convolutional Neural +Networks with the potential benefits of Quantum Machine Learning (QML). +Existing approaches, however, only utilize gate-based Variational Quantum +Circuits for the quantum part of these procedures. In this work we present an +approach to employ Quantum Annealing (QA) in QTL-based image classification. +Specifically, we propose using annealing-based Quantum Boltzmann Machines as +part of a hybrid quantum-classical pipeline to learn the classification of +real-world, large-scale data such as medical images through supervised +training. We demonstrate our approach by applying it to the three-class +COVID-CT-MD dataset, a collection of lung Computed Tomography (CT) scan slices. +Using Simulated Annealing as a stand-in for actual QA, we compare our method to +classical transfer learning, using a neural network of the same order of +magnitude, to display its improved classification performance. We find that our +approach consistently outperforms its classical baseline in terms of test +accuracy and AUC-ROC-Score and needs less training epochs to do this. + +
+
+ comment: 7 pages, 3 figures (5 if counting subfigures), 1 table. To be + published in the proceedings of the 2023 IEEE International Conference on + Quantum Computing and Engineering (QCE) +
+
+
+
+
+ + ☆ Efficient Pre-training for Localized Instruction Generation of Videos + + +
+ Procedural videos show step-by-step demonstrations of tasks like recipe +preparation. Understanding such videos is challenging, involving the precise +localization of steps and the generation of textual instructions. Manually +annotating steps and writing instructions is costly, which limits the size of +current datasets and hinders effective learning. Leveraging large but noisy +video-transcript datasets for pre-training can boost performance, but demands +significant computational resources. Furthermore, transcripts contain +irrelevant content and exhibit style variation compared to instructions written +by human annotators. To mitigate both issues, we propose a technique, +Sieve-&-Swap, to automatically curate a smaller dataset: (i) Sieve filters +irrelevant transcripts and (ii) Swap enhances the quality of the text +instruction by automatically replacing the transcripts with human-written +instructions from a text-only recipe dataset. The curated dataset, three orders +of magnitude smaller than current web-scale datasets, enables efficient +training of large-scale models with competitive performance. We complement our +Sieve-&-Swap approach with a Procedure Transformer (ProcX) for end-to-end step +localization and instruction generation for procedural videos. When this model +is pre-trained on our curated dataset, it achieves state-of-the-art performance +in zero-shot and finetuning settings on YouCook2 and Tasty, while using a +fraction of the computational resources. + +
+
+
+
+
+ + ☆ Maximum Likelihood Estimation is All You Need for Well-Specified + Covariate Shift + + +
+ A key challenge of modern machine learning systems is to achieve +Out-of-Distribution (OOD) generalization -- generalizing to target data whose +distribution differs from that of source data. Despite its significant +importance, the fundamental question of ``what are the most effective +algorithms for OOD generalization'' remains open even under the standard +setting of covariate shift. This paper addresses this fundamental question by +proving that, surprisingly, classical Maximum Likelihood Estimation (MLE) +purely using source data (without any modification) achieves the minimax +optimality for covariate shift under the well-specified setting. That is, no +algorithm performs better than MLE in this setting (up to a constant factor), +justifying that MLE is all you need. Our result holds for a very rich class of +parametric models, and does not require any boundedness condition on the +density ratio. We illustrate the wide applicability of our framework by +instantiating it to three concrete examples -- linear regression, logistic +regression, and phase retrieval. This paper further complements the study by +proving that, under the misspecified setting, MLE is no longer the optimal +choice, whereas the Maximum Weighted Likelihood Estimator (MWLE) emerges as minimax +optimal in certain scenarios. + +
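+ For reference, the two estimators contrasted above take the standard forms $\hat{\theta}_{\mathrm{MLE}} = \arg\max_{\theta} \frac{1}{n}\sum_{i=1}^{n} \log p_{\theta}(y_i \mid x_i)$ and $\hat{\theta}_{\mathrm{MWLE}} = \arg\max_{\theta} \frac{1}{n}\sum_{i=1}^{n} \frac{p_T(x_i)}{p_S(x_i)} \log p_{\theta}(y_i \mid x_i)$, where $(x_i, y_i)$ are drawn from the source distribution and $p_S$, $p_T$ denote the source and target covariate densities; these are textbook definitions consistent with the abstract, not formulas taken from the paper.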
+
+
+
+
+ + ☆ Addressing Long-Horizon Tasks by Integrating Program Synthesis and State + Machines + + +
+ Deep reinforcement learning excels in various domains but lacks +generalizability and interpretability. Programmatic RL methods (Trivedi et al., +2021; Liu et al., 2023) reformulate solving RL tasks as synthesizing +interpretable programs that can be executed in the environments. Despite +encouraging results, these methods are limited to short-horizon tasks. On the +other hand, representing RL policies using state machines (Inala et al., 2020) +can inductively generalize to long-horizon tasks; however, it struggles to +scale up to acquire diverse and complex behaviors. This work proposes Program +Machine Policies (POMPs), which bridge the advantages of programmatic RL and +state machine policies, allowing for the representation of complex behaviors +and the handling of long-horizon tasks. Specifically, we introduce a method that +can retrieve a set of effective, diverse, and compatible programs. Then, we use +these programs as modes of a state machine and learn a transition function to +transition among mode programs, allowing for capturing long-horizon repetitive +behaviors. Our proposed framework outperforms programmatic RL and deep RL +baselines on various tasks and demonstrates the ability to inductively generalize to even +longer horizons without any fine-tuning. Ablation studies justify +the effectiveness of our proposed search algorithm for retrieving a set of +programs as modes. + +
+
+
+
+
+ + ☆ Replay across Experiments: A Natural Extension of Off-Policy RL + + +
+ Replaying data is a principal mechanism underlying the stability and data +efficiency of off-policy reinforcement learning (RL). We present an effective +yet simple framework to extend the use of replays across multiple experiments, +minimally adapting the RL workflow for sizeable improvements in controller +performance and research iteration times. At its core, Replay Across +Experiments (RaE) involves reusing experience from previous experiments to +improve exploration and bootstrap learning while reducing required changes to a +minimum in comparison to prior work. We empirically show benefits across a +number of RL algorithms and challenging control domains spanning both +locomotion and manipulation, including hard exploration tasks from egocentric +vision. Through comprehensive ablations, we demonstrate robustness to the +quality and amount of data available and various hyperparameter choices. +Finally, we discuss how our approach can be applied more broadly across +research life cycles and can increase resilience by reloading data across +random seeds or hyperparameter variations. + +
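+ A minimal sketch of the reuse mechanism described above: each sampled batch mixes transitions reloaded from earlier experiments with transitions from the current run. The mixing ratio is a hypothetical knob, not a value taken from the paper.
+
+import random
+
+class MixedReplayBuffer:
+    def __init__(self, old_transitions, reload_fraction=0.5):
+        self.old = list(old_transitions)    # experience reloaded from prior experiments
+        self.new = []                       # experience gathered in the current run
+        self.reload_fraction = reload_fraction
+
+    def add(self, transition):
+        self.new.append(transition)
+
+    def sample(self, batch_size):
+        n_old = min(int(batch_size * self.reload_fraction), len(self.old))
+        n_new = min(batch_size - n_old, len(self.new))   # batch may be smaller early on
+        return random.sample(self.old, n_old) + random.sample(self.new, n_new)
+
+buffer = MixedReplayBuffer([("s", "a", 0.0, "s2")] * 1000)
+buffer.add(("s", "a", 1.0, "s2"))
+batch = buffer.sample(8)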
+
+
+
+
+ + ☆ GloNets: Globally Connected Neural Networks + + +
+ Deep learning architectures suffer from depth-related performance +degradation, limiting the effective depth of neural networks. Approaches like +ResNet are able to mitigate this, but they do not completely eliminate the +problem. We introduce Globally Connected Neural Networks (GloNet), a novel +architecture overcoming depth-related issues, designed to be superimposed on +any model, enhancing its depth without increasing complexity or reducing +performance. With GloNet, the network's head uniformly receives information +from all parts of the network, regardless of their level of abstraction. This +enables GloNet to self-regulate information flow during training, reducing the +influence of less effective deeper layers, and allowing for stable training +irrespective of network depth. This paper details GloNet's design, its +theoretical basis, and a comparison with existing similar architectures. +Experiments show GloNet's self-regulation ability and resilience to +depth-related learning challenges, like performance degradation. Our findings +suggest GloNet as a strong alternative to traditional architectures like +ResNets. + +
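+ A hedged toy reading of the "head receives information from all parts of the network" idea: sum every block's output before the head, so shallow and deep features reach it directly. Layer sizes are illustrative, and this is not the paper's architecture definition.
+
+import torch
+import torch.nn as nn
+
+class GloNetStyleMLP(nn.Module):
+    def __init__(self, in_dim=32, hidden=64, depth=8, n_classes=10):
+        super().__init__()
+        self.stem = nn.Linear(in_dim, hidden)
+        self.blocks = nn.ModuleList(
+            [nn.Sequential(nn.Linear(hidden, hidden), nn.ReLU()) for _ in range(depth)]
+        )
+        self.head = nn.Linear(hidden, n_classes)
+
+    def forward(self, x):
+        h = torch.relu(self.stem(x))
+        collected = h
+        for block in self.blocks:
+            h = block(h)
+            collected = collected + h    # aggregate features from every depth
+        return self.head(collected)
+
+print(GloNetStyleMLP()(torch.randn(4, 32)).shape)   # torch.Size([4, 10])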
+
+
+
+
+ + ☆ Over-Squashing in Riemannian Graph Neural Networks + + +
+ Most graph neural networks (GNNs) are prone to the phenomenon of +over-squashing in which node features become insensitive to information from +distant nodes in the graph. Recent works have shown that the topology of the +graph has the greatest impact on over-squashing, suggesting graph rewiring +approaches as a suitable solution. In this work, we explore whether +over-squashing can be mitigated through the embedding space of the GNN. In +particular, we consider the generalization of Hyperbolic GNNs (HGNNs) to +Riemannian manifolds of variable curvature in which the geometry of the +embedding space is faithful to the graph's topology. We derive bounds on the +sensitivity of the node features in these Riemannian GNNs as the number of +layers increases, which yield promising theoretical and empirical results for +alleviating over-squashing in graphs with negative curvature. + +
+
+
+
+
+ + ☆ Physics-informed neural networks for transformed geometries and + manifolds + + +
+ Physics-informed neural networks (PINNs) effectively embed physical +principles into machine learning, but often struggle with complex or +alternating geometries. We propose a novel method for integrating geometric +transformations within PINNs to robustly accommodate geometric variations. Our +method incorporates a diffeomorphism as a mapping of a reference domain and +adapts the derivative computation of the physics-informed loss function. This +generalizes the applicability of PINNs not only to smoothly deformed domains, +but also to lower-dimensional manifolds and allows for direct shape +optimization while training the network. We demonstrate the effectiveness of our +approach on several problems: (i) the Eikonal equation on an Archimedean spiral, (ii) +a Poisson problem on a surface manifold, (iii) incompressible Stokes flow in a +deformed tube, and (iv) shape optimization with the Laplace operator. Through these +examples, we demonstrate the enhanced flexibility over traditional PINNs, +especially under geometric variations. The proposed framework presents an +outlook for training deep neural operators over parametrized geometries, paving +the way for advanced modeling with PDEs on complex geometries in science and +engineering. + +
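+ The sketch below illustrates the general idea of training a PINN through a geometric map: collocation points live on a reference unit square, a simple diffeomorphism T(x, y) = (a x, b y) defines the physical domain, and the PDE residual uses derivatives rescaled by the (here analytic) Jacobian of T. The network, map, and PDE are illustrative assumptions, not the paper's setup, and boundary terms are omitted.
+
+import torch
+
+a, b = 2.0, 0.5                                   # physical domain = [0, a] x [0, b]
+net = torch.nn.Sequential(
+    torch.nn.Linear(2, 64), torch.nn.Tanh(),
+    torch.nn.Linear(64, 64), torch.nn.Tanh(),
+    torch.nn.Linear(64, 1),
+)
+
+def poisson_residual(x_ref):
+    # Residual of -Laplacian(u) = 1 in physical coordinates, with u defined on
+    # the reference domain; chain rule: d/dx_phys = (1/a) d/dx_ref, etc.
+    x_ref = x_ref.requires_grad_(True)
+    u = net(x_ref)
+    grad = torch.autograd.grad(u.sum(), x_ref, create_graph=True)[0]
+    u_x, u_y = grad[:, 0:1], grad[:, 1:2]
+    u_xx = torch.autograd.grad(u_x.sum(), x_ref, create_graph=True)[0][:, 0:1]
+    u_yy = torch.autograd.grad(u_y.sum(), x_ref, create_graph=True)[0][:, 1:2]
+    laplacian_phys = u_xx / a**2 + u_yy / b**2
+    return (-laplacian_phys - 1.0) ** 2
+
+opt = torch.optim.Adam(net.parameters(), lr=1e-3)
+for _ in range(100):                              # boundary loss omitted for brevity
+    opt.zero_grad()
+    loss = poisson_residual(torch.rand(256, 2)).mean()
+    loss.backward()
+    opt.step()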
+
+
+
+
+ + ☆ Towards Responsible Governance of Biological Design Tools NeurIPS 2023 + + +
+ Recent advancements in generative machine learning have enabled rapid +progress in biological design tools (BDTs) such as protein structure and +sequence prediction models. The unprecedented predictive accuracy and novel +design capabilities of BDTs present new and significant dual-use risks. For +example, their predictive accuracy allows biological agents, whether vaccines +or pathogens, to be developed more quickly, while the design capabilities could +be used to discover drugs or evade DNA screening techniques. Similar to other +dual-use AI systems, BDTs present a wicked problem: how can regulators uphold +public safety without stifling innovation? We highlight how current regulatory +proposals that are primarily tailored toward large language models may be less +effective for BDTs, which require fewer computational resources to train and +are often developed in an open-source manner. We propose a range of measures to +mitigate the risk that BDTs are misused, across the areas of responsible +development, risk assessment, transparency, access management, cybersecurity, +and investing in resilience. Implementing such measures will require close +coordination between developers and governments. + +
+
+ comment: 10 pages + references, 1 figure, accepted at NeurIPS 2023 Regulatable + ML as oral presentation +
+
+
+
+
+ + ☆ Reinforcement Learning for Wildfire Mitigation in Simulated Disaster + Environments NeurIPS 2023 + + +
+ Climate change has resulted in a year-over-year increase in adverse weather +conditions that contribute to increasingly severe fire seasons. +Without effective mitigation, these fires pose a threat to life, property, +ecology, cultural heritage, and critical infrastructure. To better prepare for +and react to the increasing threat of wildfires, more accurate fire models +and mitigation responses are necessary. In this paper, we introduce SimFire, a +versatile wildland fire projection simulator designed to generate realistic +wildfire scenarios, and SimHarness, a modular agent-based machine learning +wrapper capable of automatically generating land management strategies within +SimFire to reduce the overall damage to the area. Together, this publicly +available system gives researchers and practitioners the ability to emulate +and assess the effectiveness of firefighter interventions and formulate +strategic plans that prioritize value preservation and resource allocation +optimization. The repositories are available for download at +https://github.com/mitrefireline. + +
+
+ comment: 12 pages, 4 figures including Appendices (A, B). Accepted as a paper + in the Proposals track at the "Tackling Climate Change with Machine Learning" + workshop at NeurIPS 2023. MITRE Public Release Case Number 23-3920 +
+
+
+
+
+ + ☆ Diagnosis driven Anomaly Detection for CPS + + +
+ In Cyber-Physical Systems (CPS) research, anomaly detection (detecting +abnormal behavior) and diagnosis (identifying the underlying root cause) are +often treated as distinct, isolated tasks. However, diagnosis algorithms +require symptoms, i.e. temporally and spatially isolated anomalies, as input. +Thus, anomaly detection and diagnosis must be developed together to provide a +holistic solution for diagnosis in CPS. We therefore propose a method for +utilizing deep learning-based anomaly detection to generate inputs for +Consistency-Based Diagnosis (CBD). We evaluate our approach on a simulated and +a real-world CPS dataset, where our model demonstrates strong performance +relative to other state-of-the-art models. + +
+
+
+
+
+ + ☆ MetaDefa: Meta-learning based on Domain Enhancement and Feature + Alignment for Single Domain Generalization + + +
+ Single domain generalization (SDG) based on meta-learning has emerged as +an effective technique for solving the domain-shift problem. However, the +inadequate match between the data distributions of the source and augmented domains and +the difficulty of separating domain-invariant features from domain-related features +make it hard for SDG models to achieve strong generalization. Therefore, a novel +meta-learning method based on domain enhancement and feature alignment +(MetaDefa) is proposed to improve the model generalization performance. First, +background substitution and visual corruption techniques are used to +generate diverse and effective augmented domains. Then, the multi-channel +feature alignment module based on class activation maps and class agnostic +activation maps is designed to effectively extract adequate transferability +knowledge. In this module, domain-invariant features can be fully explored by +focusing on similar target regions between the source and augmented domain feature +spaces and suppressing the feature representation of non-similar target regions. +Extensive experiments on two publicly available datasets show that MetaDefa has +significant generalization performance advantages on multiple unknown target +domains. + +
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ Stability-Informed Initialization of Neural Ordinary Differential + Equations + + +
+ This paper addresses the training of Neural Ordinary Differential Equations +(neural ODEs), and in particular explores the interplay between numerical +integration techniques, stability regions, step size, and initialization +techniques. It is shown how the choice of integration technique implicitly +regularizes the learned model, and how the solver's corresponding stability +region affects training and prediction performance. From this analysis, a +stability-informed parameter initialization technique is introduced. The +effectiveness of the initialization method is displayed across several learning +benchmarks and industrial applications. + +
+
+
+
+
+ + ☆ FLASC: A Flare-Sensitive Clustering Algorithm: Extending HDBSCAN* for + Detecting Branches in Clusters KDD + + +
+ We present FLASC, an algorithm for flare-sensitive clustering. Our algorithm +builds upon HDBSCAN* -- which provides high-quality density-based clustering +performance -- through a post-processing step that differentiates branches +within the detected clusters' manifold, adding a type of pattern that can be +discovered. Two variants of the algorithm are presented, which trade +computational cost for noise robustness. We show that both variants scale +similarly to HDBSCAN* in terms of computational cost and provide stable outputs +using synthetic data sets, resulting in an efficient flare-sensitive clustering +algorithm. In addition, we demonstrate the algorithm's benefit in data +exploration over HDBSCAN* clustering on two real-world data sets. + +
+
+ comment: 20 pages, 11 figures, submitted to ACM TKDD +
+
+
+
+
+ + ☆ RO-LLaMA: Generalist LLM for Radiation Oncology via Noise Augmentation + and Consistency Regularization + + +
+ Recent advancements in Artificial Intelligence (AI) have profoundly +influenced medical fields by providing tools to reduce clinical workloads. +However, most AI models are constrained to execute uni-modal tasks, in stark +contrast to the comprehensive approaches utilized by medical professionals. To +address this, here we present RO-LLaMA, a versatile generalist large language +model (LLM) tailored for the field of radiation oncology. This model seamlessly +covers a wide range of the workflow of radiation oncologists, adept at various +tasks such as clinical report summarization, radiation therapy plan suggestion, +and plan-guided therapy target volume segmentation. In particular, to maximize +the end-to-end performance, we further present a novel Consistency Embedding +Fine-Tuning (CEFTune) technique, which boosts the LLM's robustness to additional +errors at intermediate steps while preserving the capability of handling clean +inputs, and we creatively transform this concept into an LLM-driven segmentation +framework, Consistency Embedding Segmentation (CESEG). Experimental results +on multi-centre cohort sets demonstrate our proposed RO-LLaMA's promising +performance for diverse tasks with generalization capabilities. + +
+
+
+
+
+ + ☆ Nodal Hydraulic Head Estimation through Unscented Kalman Filter for + Data-driven Leak Localization in Water Networks + + +
+ In this paper, we present a nodal hydraulic head estimation methodology for +water distribution networks (WDN) based on an Unscented Kalman Filter (UKF) +scheme with application to leak localization. The UKF refines an initial +estimation of the hydraulic state by considering the prediction model, as well +as available pressure and demand measurements. To this end, it provides +customized prediction and data assimilation steps. Additionally, the method is +enhanced by dynamically updating the prediction function weight matrices. +Performance testing on the Modena benchmark under realistic conditions +demonstrates the method's effectiveness in enhancing state estimation and +data-driven leak localization. + +
+
+ comment: This work has been submitted to IFAC for possible publication. It has + 6 pages and 3 figures +
+
+
+
+
+ + ☆ A precise symbolic emulator of the linear matter power spectrum + + +
+ Computing the matter power spectrum, $P(k)$, as a function of cosmological +parameters can be prohibitively slow in cosmological analyses, hence emulating +this calculation is desirable. Previous analytic approximations are +insufficiently accurate for modern applications, so black-box, uninterpretable +emulators are often used. We utilise an efficient genetic programming based +symbolic regression framework to explore the space of potential mathematical +expressions which can approximate the power spectrum and $\sigma_8$. We learn +the ratio between an existing low-accuracy fitting function for $P(k)$ and that +obtained by solving the Boltzmann equations and thus still incorporate the +physics which motivated this earlier approximation. We obtain an analytic +approximation to the linear power spectrum with a root mean squared fractional +error of 0.2% between $k = 9\times10^{-3} - 9 \, h{\rm \, Mpc^{-1}}$ and across +a wide range of cosmological parameters, and we provide physical +interpretations for various terms in the expression. We also provide a simple +analytic approximation for $\sigma_8$ with a similar accuracy, with a root mean +squared fractional error of just 0.4% when evaluated across the same range of +cosmologies. This function is easily invertible to obtain $A_{\rm s}$ as a +function of $\sigma_8$ and the other cosmological parameters, if preferred. It +is possible to obtain symbolic approximations to a seemingly complex function +at a precision required for current and future cosmological analyses without +resorting to deep-learning techniques, thus avoiding their black-box nature and +large number of parameters. Our emulator will be usable long after the codes on +which numerical approximations are built become outdated. + +
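The "learn the ratio to an existing low-accuracy approximation" strategy can be sketched on a toy target. In the snippet below (my illustration, not the paper's pipeline) a plain polynomial fit stands in for the genetic-programming symbolic regression, and both functions are made-up stand-ins rather than real power-spectrum formulas.

import numpy as np

k = np.logspace(-3, 1, 400)                    # wavenumber-like grid
target = np.exp(-0.1 * k) / (1.0 + k**1.5)     # stand-in for the "exact" spectrum
cheap = 1.0 / (1.0 + k**1.5)                   # stand-in low-accuracy fitting formula

# Fit log(ratio) with a low-order polynomial in log(k); a symbolic-regression
# engine would instead search the space of analytic expressions for this ratio.
coeffs = np.polyfit(np.log(k), np.log(target / cheap), deg=3)
emulated = cheap * np.exp(np.polyval(coeffs, np.log(k)))

rms_frac_err = np.sqrt(np.mean((emulated / target - 1.0) ** 2))
print(f"RMS fractional error of the corrected emulator: {rms_frac_err:.2e}")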
+
+ comment: 9 pages, 5 figures. Submitted to A&A +
+
+
+
+
+ + ☆ Multi-Agent Reinforcement Learning for Power Control in Wireless + Networks via Adaptive Graphs + + +
The ever-increasing demand for high-quality and heterogeneous wireless
+communication services has driven extensive research on dynamic optimization
+strategies in wireless networks. Among several possible approaches,
+multi-agent deep reinforcement learning (MADRL) has emerged as a promising
+method to address a wide range of complex optimization problems like power
+control. However, the seamless application of MADRL to a variety of network
+optimization problems faces several challenges related to convergence. In
+this paper, we present the use of graphs as communication-inducing structures
+among distributed agents as an effective means to mitigate these challenges.
+Specifically, we harness graph neural networks (GNNs) as neural architectures
+for policy parameterization to introduce a relational inductive bias in the
+collective decision-making process. Most importantly, we focus on modeling
+the dynamic interactions among sets of neighboring agents through the
+introduction of innovative methods for defining a graph-induced framework for
+integrated communication and learning. Finally, the superior generalization
+capabilities of the proposed methodology to larger networks and to networks
+with different user categories are verified through simulations.
+
+
+
+ comment: 6 pages, 4 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ A systematic study comparing hyperparameter optimization engines on + tabular data + + +
+ We run an independent comparison of all hyperparameter optimization +(hyperopt) engines available in the Ray Tune library. We introduce two ways to +normalize and aggregate statistics across data sets and models, one rank-based, +and another one sandwiching the score between the random search score and the +full grid search score. This affords us i) to rank the hyperopt engines, ii) to +make generalized and statistically significant statements on how much they +improve over random search, and iii) to make recommendations on which engine +should be used to hyperopt a given learning algorithm. We find that most +engines beat random search, but that only three of them (HEBO, AX, and +BlendSearch) clearly stand out. We also found that some engines seem to +specialize in hyperopting certain learning algorithms, which makes it tricky to +use hyperopt in comparison studies, since the choice of the hyperopt technique +may favor some of the models in the comparison. + +
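A hedged sketch of the two aggregation ideas mentioned above (my reading of the abstract, with made-up scores; engine names are reused only as column labels): sandwich-normalize each engine's score between the random-search and full-grid-search scores, and compute a rank-based aggregate across tasks.

import numpy as np
import pandas as pd

# Hypothetical best validation scores per (task, engine).
scores = pd.DataFrame(
    {"HEBO": [0.91, 0.84, 0.77], "AX": [0.90, 0.85, 0.75], "Rand": [0.88, 0.80, 0.74]},
    index=["task_a", "task_b", "task_c"],
)
random_baseline = pd.Series([0.88, 0.80, 0.74], index=scores.index)
grid_baseline = pd.Series([0.92, 0.86, 0.78], index=scores.index)

# (1) Sandwich normalization: 0 = random search, 1 = full grid search.
sandwiched = scores.sub(random_baseline, axis=0).div(
    grid_baseline - random_baseline, axis=0
)

# (2) Rank-based aggregation: average rank across tasks (1 = best).
mean_rank = scores.rank(axis=1, ascending=False).mean(axis=0)

print(sandwiched.mean(axis=0).sort_values(ascending=False))
print(mean_rank.sort_values())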
+
+
+
+
+ + ☆ Cell Maps Representation For Lung Adenocarcinoma Growth Patterns + Classification In Whole Slide Images + + +
+ Lung adenocarcinoma is a morphologically heterogeneous disease, characterized +by five primary histologic growth patterns. The quantity of these patterns can +be related to tumor behavior and has a significant impact on patient prognosis. +In this work, we propose a novel machine learning pipeline capable of +classifying tissue tiles into one of the five patterns or as non-tumor, with an +Area Under the Receiver Operating Characteristic Curve (AUCROC) score of 0.97. +Our model's strength lies in its comprehensive consideration of cellular +spatial patterns, where it first generates cell maps from Hematoxylin and Eosin +(H&E) whole slide images (WSIs), which are then fed into a convolutional neural +network classification model. Exploiting these cell maps provides the model +with robust generalizability to new data, achieving approximately 30% higher +accuracy on unseen test-sets compared to current state of the art approaches. +The insights derived from our model can be used to predict prognosis, enhancing +patient outcomes. + +
+
+
+
+
+ + ☆ Utilizing Explainability Techniques for Reinforcement Learning Model + Assurance NeurIPS 2023 + + +
+ Explainable Reinforcement Learning (XRL) can provide transparency into the +decision-making process of a Deep Reinforcement Learning (DRL) model and +increase user trust and adoption in real-world use cases. By utilizing XRL +techniques, researchers can identify potential vulnerabilities within a trained +DRL model prior to deployment, therefore limiting the potential for mission +failure or mistakes by the system. This paper introduces the ARLIN (Assured RL +Model Interrogation) Toolkit, an open-source Python library that identifies +potential vulnerabilities and critical points within trained DRL models through +detailed, human-interpretable explainability outputs. To illustrate ARLIN's +effectiveness, we provide explainability visualizations and vulnerability +analysis for a publicly available DRL model. The open-source code repository is +available for download at https://github.com/mitre/arlin. + +
+
+ comment: 9 pages, 8 figures including appendices (A, B, C). Accepted as a + poster presentation in the demo track at the "XAI in Action: Past, Present, + and Future Applications" workshop at NeurIPS 2023. MITRE Public Release Case + Number 23-3095 +
+
+
+
+
+ + ☆ Temporal Action Localization for Inertial-based Human Activity + Recognition + + +
A persistent trend in Deep Learning has been the applicability of machine
+learning concepts to areas other than those they were originally introduced
+for. As of today, state-of-the-art activity recognition from wearable sensors
+relies on classifiers being trained on fixed windows of data. In contrast,
+video-based Human Activity Recognition has followed a segment-based
+prediction approach, localizing activity occurrences from start to end. This
+paper is the first to systematically demonstrate the applicability of
+state-of-the-art Temporal Action Localization (TAL) models for wearable Human
+Activity Recognition (HAR) using raw inertial data as input. Our results show
+that state-of-the-art TAL models are able to outperform popular inertial
+models on 4 out of 6 wearable activity recognition benchmark datasets, with
+improvements of as much as 25% in F1-score. Introducing the TAL community's
+most popular metric to inertial-based HAR, namely mean Average Precision, our
+analysis shows that TAL models are able to produce more coherent segments
+along with an overall higher NULL-class accuracy across all datasets. As the
+first work to provide such an analysis, we show that the TAL community offers
+an interesting new perspective on inertial-based HAR, with yet-to-be-explored
+design choices and training concepts that could be of significant value for
+the inertial-based HAR community.
+
+
+
+ comment: 20 pages, 7 figures, 2 tables +
+
+
+
+
+ + ☆ Scale-Dropout: Estimating Uncertainty in Deep Neural Networks Using + Stochastic Scale + + +
+ Uncertainty estimation in Neural Networks (NNs) is vital in improving +reliability and confidence in predictions, particularly in safety-critical +applications. Bayesian Neural Networks (BayNNs) with Dropout as an +approximation offer a systematic approach to quantifying uncertainty, but they +inherently suffer from high hardware overhead in terms of power, memory, and +computation. Thus, the applicability of BayNNs to edge devices with limited +resources or to high-performance applications is challenging. Some of the +inherent costs of BayNNs can be reduced by accelerating them in hardware on a +Computation-In-Memory (CIM) architecture with spintronic memories and +binarizing their parameters. However, numerous stochastic units are required to +implement conventional dropout-based BayNN. In this paper, we propose the Scale +Dropout, a novel regularization technique for Binary Neural Networks (BNNs), +and Monte Carlo-Scale Dropout (MC-Scale Dropout)-based BayNNs for efficient +uncertainty estimation. Our approach requires only one stochastic unit for the +entire model, irrespective of the model size, leading to a highly scalable +Bayesian NN. Furthermore, we introduce a novel Spintronic memory-based CIM +architecture for the proposed BayNN that achieves more than $100\times$ energy +savings compared to the state-of-the-art. We validated our method to show up to +a $1\%$ improvement in predictive performance and superior uncertainty +estimates compared to related works. + +
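A minimal PyTorch sketch of the "single stochastic unit" idea suggested by the abstract (my reading, not the authors' hardware-oriented formulation; the 0.5 down-scaling factor is an arbitrary illustrative choice): one random scale is sampled per forward pass and applied to the whole layer output, and keeping it stochastic at inference enables Monte Carlo uncertainty estimates.

import torch
import torch.nn as nn

class ScaleDropout(nn.Module):
    def __init__(self, p: float = 0.2):
        super().__init__()
        self.p = p  # probability of scaling down the whole output

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # One Bernoulli draw for the entire tensor (not per element); kept
        # active at inference on purpose, for Monte Carlo sampling.
        drop = torch.rand((), device=x.device) < self.p
        scale = torch.where(drop, torch.tensor(0.5, device=x.device),
                            torch.tensor(1.0, device=x.device))
        return x * scale

model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), ScaleDropout(0.2), nn.Linear(32, 2))

# Monte Carlo estimate: average several stochastic forward passes.
x = torch.randn(4, 16)
samples = torch.stack([model(x) for _ in range(20)])
mean, std = samples.mean(0), samples.std(0)   # predictive mean and spread
print(mean.shape, std.shape)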
+
+
+
+
+ + ☆ Exploring Artificial Intelligence Methods for Energy Prediction in + Healthcare Facilities: An In-Depth Extended Systematic Review + + +
Hospitals, due to their complexity and unique requirements, play a pivotal
+role in global energy consumption patterns. This study conducted a
+comprehensive literature review, utilizing the PRISMA framework, of articles
+that employed machine learning and artificial intelligence techniques for
+predicting energy consumption in hospital buildings. Of the 1884 publications
+identified, 17 were found to address this specific domain and have been
+thoroughly reviewed to establish the state-of-the-art and identify gaps where
+future research is needed. This review revealed a diverse range of data
+inputs influencing energy prediction, with occupancy and meteorological data
+emerging as significant predictors. However, many studies failed to delve
+deeply into the implications of their data choices, and gaps were evident
+regarding the understanding of time dynamics, operational status, and
+preprocessing methods. Machine learning, especially deep learning models like
+ANNs, has shown potential in this domain, yet such models come with
+challenges, including interpretability and computational demands. The
+findings underscore the immense potential of AI in optimizing hospital energy
+consumption but also highlight the need for more comprehensive and granular
+research. Key areas for future research include the optimization of ANN
+approaches, new optimization and data integration techniques, the integration
+of real-time data into Intelligent Energy Management Systems, and an
+increased focus on long-term energy forecasting.
+
+
+
+ comment: 38 pages, 1 figure, 3 tables, systematic literature review +
+
+
+
+
+ + ☆ Rethinking Privacy in Machine Learning Pipelines from an Information + Flow Control Perspective + + +
+ Modern machine learning systems use models trained on ever-growing corpora. +Typically, metadata such as ownership, access control, or licensing information +is ignored during training. Instead, to mitigate privacy risks, we rely on +generic techniques such as dataset sanitization and differentially private +model training, with inherent privacy/utility trade-offs that hurt model +performance. Moreover, these techniques have limitations in scenarios where +sensitive information is shared across multiple participants and fine-grained +access control is required. By ignoring metadata, we therefore miss an +opportunity to better address security, privacy, and confidentiality +challenges. In this paper, we take an information flow control perspective to +describe machine learning systems, which allows us to leverage metadata such as +access control policies and define clear-cut privacy and confidentiality +guarantees with interpretable information flows. Under this perspective, we +contrast two different approaches to achieve user-level non-interference: 1) +fine-tuning per-user models, and 2) retrieval augmented models that access +user-specific datasets at inference time. We compare these two approaches to a +trivially non-interfering zero-shot baseline using a public model and to a +baseline that fine-tunes this model on the whole corpus. We evaluate trained +models on two datasets of scientific articles and demonstrate that retrieval +augmented architectures deliver the best utility, scalability, and flexibility +while satisfying strict non-interference guarantees. + +
+
+
+
+
+ + ☆ Relationship between Model Compression and Adversarial Robustness: A + Review of Current Evidence SC + + +
+ Increasing the model capacity is a known approach to enhance the adversarial +robustness of deep learning networks. On the other hand, various model +compression techniques, including pruning and quantization, can reduce the size +of the network while preserving its accuracy. Several recent studies have +addressed the relationship between model compression and adversarial +robustness, while some experiments have reported contradictory results. This +work summarizes available evidence and discusses possible explanations for the +observed effects. + +
+
+ comment: Accepted for publication at SSCI 2023 +
+
+
+
+
+ + ☆ Increasing Coverage and Precision of Textual Information in Multilingual + Knowledge Graphs EMNLP 2023 + + +
+ Recent work in Natural Language Processing and Computer Vision has been using +textual information -- e.g., entity names and descriptions -- available in +knowledge graphs to ground neural models to high-quality structured data. +However, when it comes to non-English languages, the quantity and quality of +textual information are comparatively scarce. To address this issue, we +introduce the novel task of automatic Knowledge Graph Enhancement (KGE) and +perform a thorough investigation on bridging the gap in both the quantity and +quality of textual information between English and non-English languages. More +specifically, we: i) bring to light the problem of increasing multilingual +coverage and precision of entity names and descriptions in Wikidata; ii) +demonstrate that state-of-the-art methods, namely, Machine Translation (MT), +Web Search (WS), and Large Language Models (LLMs), struggle with this task; +iii) present M-NTA, a novel unsupervised approach that combines MT, WS, and +LLMs to generate high-quality textual information; and, iv) study the impact of +increasing multilingual coverage and precision of non-English textual +information in Entity Linking, Knowledge Graph Completion, and Question +Answering. As part of our effort towards better multilingual knowledge graphs, +we also introduce WikiKGE-10, the first human-curated benchmark to evaluate KGE +approaches in 10 languages across 7 language families. + +
+
+ comment: Camera ready for EMNLP 2023 +
+
+
+
+
+ + ☆ Attend Who is Weak: Enhancing Graph Condensation via Cross-Free + Adversarial Training + + +
In this paper, we study the \textit{graph condensation} problem by
+compressing a large, complex graph into a concise, synthetic representation
+that preserves the most essential and discriminative information of structure
+and features. We propose the novel concept of a Shock Absorber (a type of
+perturbation) that enhances the robustness and stability of the original
+graph against changes in an adversarial training fashion. Concretely, (I) we
+forcibly match the gradients between pre-selected graph neural networks
+(GNNs) trained on a synthetic, simplified graph and the original training
+graph at regularly spaced intervals. (II) Before each update of the synthetic
+graph, a Shock Absorber serves as a gradient attacker to maximize the
+distance between the synthetic dataset and the original graph by selectively
+perturbing the parts that are underrepresented or insufficiently informative.
+We iteratively repeat the above two processes (I and II) in an adversarial
+training fashion to maintain the highly informative context without losing
+correlation with the original dataset. More importantly, our Shock Absorber
+and the synthesized graph share the backward pass in parallel, in a free
+training manner, so that compared to standard adversarial training it
+introduces almost no additional time overhead.
+ We validate our framework across 8 datasets (3 graph and 5 node
+classification datasets) and achieve prominent results: for example, on Cora,
+Citeseer and Ogbn-Arxiv, we gain improvements of nearly 1.13% to 5.03%
+compared with SOTA models. Moreover, our algorithm adds only about 0.2% to
+2.2% additional time overhead on Flickr, Citeseer and Ogbn-Arxiv. Compared to
+general adversarial training, our approach improves time efficiency by nearly
+4-fold.
+
+
+
+
+
+
+ + ☆ Learning Multi-Frequency Partial Correlation Graphs + + +
Despite the large research effort devoted to learning dependencies between
+time series, the state of the art still faces a major limitation: existing
+methods learn partial correlations but fail to discriminate across distinct
+frequency bands. Motivated by many applications in which this differentiation
+is pivotal, we overcome this limitation by learning a block-sparse,
+frequency-dependent, partial correlation graph, in which layers correspond to
+different frequency bands, and partial correlations can occur over just a few
+layers. To this aim, we formulate and solve two nonconvex learning problems:
+the first has a closed-form solution and is suitable when there is prior
+knowledge about the number of partial correlations; the second hinges on an
+iterative solution based on successive convex approximation, and is effective
+for the general case where no prior knowledge is available. Numerical results
+on synthetic data show that the proposed methods outperform the current state
+of the art. Finally, the analysis of financial time series confirms that
+partial correlations exist only within a few frequency bands, underscoring
+how our methods yield valuable insights that would go undetected without
+discriminating along the frequency domain.
+
+
+
+
+
+
+ + ☆ Adinkra Symbol Recognition using Classical Machine Learning and Deep + Learning + + +
+ Artificial intelligence (AI) has emerged as a transformative influence, +engendering paradigm shifts in global societies, spanning academia and +industry. However, in light of these rapid advances, addressing the +underrepresentation of black communities and African countries in AI is +crucial. Boosting enthusiasm for AI can be effectively accomplished by +showcasing straightforward applications around tasks like identifying and +categorizing traditional symbols, such as Adinkra symbols, or familiar objects +within the community. In this research endeavor, we dived into classical +machine learning and harnessed the power of deep learning models to tackle the +intricate task of classifying and recognizing Adinkra symbols. The idea led to +a newly constructed ADINKRA dataset comprising 174,338 images meticulously +organized into 62 distinct classes, each representing a singular and emblematic +symbol. We constructed a CNN model for classification and recognition using six +convolutional layers, three fully connected (FC) layers, and optional dropout +regularization. The model is a simpler and smaller version of VGG, with fewer +layers, smaller channel sizes, and a fixed kernel size. Additionally, we tap +into the transfer learning capabilities provided by pre-trained models like VGG +and ResNet. These models assist us in both classifying images and extracting +features that can be used with classical machine learning models. We assess the +model's performance by measuring its accuracy and convergence rate and +visualizing the areas that significantly influence its predictions. These +evaluations serve as a foundational benchmark for future assessments of the +ADINKRA dataset. We hope this application exemplar inspires ideas on the +various uses of AI in organizing our traditional and modern lives. + +
+
+ comment: 15 pages, 6 figures, 5 tables +
+
+
+
+
+ + ☆ GLIME: General, Stable and Local LIME Explanation NeurIPS 2023 + + +
As black-box machine learning models grow in complexity and find applications
+in high-stakes scenarios, it is imperative to provide explanations for their
+predictions. Although Local Interpretable Model-agnostic Explanations (LIME)
+[22] is a widely adopted method for understanding model behaviors, it is
+unstable with respect to random seeds [35,24,3] and exhibits low local
+fidelity (i.e., how well the explanation approximates the model's local
+behaviors) [21,16]. Our study shows that this instability problem stems from
+small sample weights, leading to the dominance of regularization and slow
+convergence. Additionally, LIME's sampling neighborhood is non-local and
+biased towards the reference, resulting in poor local fidelity and
+sensitivity to reference choice. To tackle these challenges, we introduce
+GLIME, an enhanced framework extending LIME and unifying several prior
+methods. Within the GLIME framework, we derive an equivalent formulation of
+LIME that achieves significantly faster convergence and improved stability.
+By employing a local and unbiased sampling distribution, GLIME generates
+explanations with higher local fidelity compared to LIME. GLIME explanations
+are independent of reference choice. Moreover, GLIME offers users the
+flexibility to choose a sampling distribution based on their specific
+scenarios.
+
+
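The following simplified tabular sketch illustrates the general recipe described above (a local, reference-independent sampling distribution centred on the instance plus a weighted linear surrogate); it is my own illustration of the idea, not GLIME's actual implementation, and the toy black-box model is made up.

import numpy as np
from sklearn.linear_model import Ridge

def local_linear_explanation(f, x, sigma=0.5, n_samples=2000, seed=0):
    """Explain black-box f around x with a ridge surrogate fit on samples drawn
    from a Gaussian centred at x itself (local and independent of a reference)."""
    rng = np.random.default_rng(seed)
    Z = x + sigma * rng.standard_normal((n_samples, x.shape[0]))
    y = f(Z)                                          # black-box predictions
    d2 = ((Z - x) ** 2).sum(axis=1)
    w = np.exp(-d2 / (2 * sigma**2))                  # closer samples weigh more
    surrogate = Ridge(alpha=1e-3).fit(Z - x, y, sample_weight=w)
    return surrogate.coef_                            # local feature attributions

# Example: explain a toy nonlinear model at one point (coefficients should be
# close to the local gradient [cos(0.3), 1.0, -1.0]).
f = lambda X: np.sin(X[:, 0]) + 0.5 * X[:, 1] ** 2 - X[:, 2]
x0 = np.array([0.3, 1.0, -0.5])
print(local_linear_explanation(f, x0))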
+
+ comment: Accepted by NeurIPS 2023 as a Spotlight paper +
+
+
+
+
+ + ☆ Variational Autoencoders for Feature Exploration and Malignancy + Prediction of Lung Lesions BMVC 2023 + + +
Lung cancer is responsible for 21% of cancer deaths in the UK and five-year
+survival rates are heavily influenced by the stage at which the cancer is
+identified. Recent studies have demonstrated the capability of AI methods for
+accurate and early diagnosis of lung cancer from routine scans. However, this
+evidence has not translated into clinical practice, with one barrier being a
+lack of interpretable models. This study investigates the application of
+Variational Autoencoders (VAEs), a type of generative AI model, to lung
+cancer lesions. Proposed models were trained on lesions extracted from 3D CT
+scans in the LIDC-IDRI public dataset. Latent vector representations of 2D
+slices produced by the VAEs were explored through clustering to justify their
+quality and used in an MLP classifier model for lung cancer diagnosis; the
+best model achieved state-of-the-art metrics of 0.98 AUC and 93.1% accuracy.
+Cluster analysis shows the VAE latent space separates the dataset of
+malignant and benign lesions based on meaningful feature components including
+tumour size, shape, patient and malignancy class. We also include a
+comparative analysis of the standard Gaussian VAE (GVAE) and the more recent
+Dirichlet VAE (DirVAE), which replaces the prior with a Dirichlet
+distribution to encourage a more explainable latent space with disentangled
+feature representation. Finally, we demonstrate the potential for latent
+space traversals corresponding to clinically meaningful feature changes.
+
+
+
+ comment: 10 pages (main paper), 5 pages (references), 5 figures, 2 tables, + work accepted for BMVC 2023 +
+
+
+
+
+ + ☆ Tabular Two-Dimensional Correlation Analysis for Multifaceted + Characterization Data + + +
+ We propose tabular two-dimensional correlation analysis for extracting +features from multifaceted characterization data, essential for understanding +material properties. This method visualizes similarities and phase lags in +structural parameter changes through heatmaps, combining hierarchical +clustering and asynchronous correlations. We applied the proposed method to +datasets of carbon nanotube (CNTs) films annealed at various temperatures and +revealed the complexity of their hierarchical structures, which include +elements like voids, bundles, and amorphous carbon. Our analysis addresses the +challenge of attempting to understand the sequence of structural changes, +especially in multifaceted characterization data where 11 structural parameters +derived from 8 characterization methods interact with complex behavior. The +results show how phase lags (asynchronous changes from stimuli) and parameter +similarities can illuminate the sequence of structural changes in materials, +providing insights into phenomena like the removal of amorphous carbon and +graphitization in annealed CNTs. This approach is beneficial even with limited +data and holds promise for a wide range of material analyses, demonstrating its +potential in elucidating complex material behaviors and properties. + +
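A minimal numpy sketch of the generalized (Noda-style) two-dimensional correlation analysis that the tabular method above builds on: synchronous and asynchronous maps between structural parameters tracked along a perturbation axis such as annealing temperature. The data here are random placeholders, and the paper's clustering step is omitted.

import numpy as np

rng = np.random.default_rng(0)
m, p = 8, 11                       # 8 perturbation points, 11 structural parameters
X = rng.standard_normal((m, p))
X = X - X.mean(axis=0)             # center each parameter along the perturbation

# Hilbert-Noda transformation matrix for the asynchronous correlation.
idx = np.arange(m)
with np.errstate(divide="ignore"):
    N = 1.0 / (np.pi * (idx[None, :] - idx[:, None]))
np.fill_diagonal(N, 0.0)

sync = X.T @ X / (m - 1)           # similarity of parameter changes
async_ = X.T @ N @ X / (m - 1)     # phase lag: which parameter changes first
print(sync.shape, async_.shape)    # both (11, 11), ready to plot as heatmaps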
+
+ comment: 15 pages, 4 figures +
+
+
+
+
+ + ☆ Peptide Binding Classification on Quantum Computers + + +
+ We conduct an extensive study on using near-term quantum computers for a task +in the domain of computational biology. By constructing quantum models based on +parameterised quantum circuits we perform sequence classification on a task +relevant to the design of therapeutic proteins, and find competitive +performance with classical baselines of similar scale. To study the effect of +noise, we run some of the best-performing quantum models with favourable +resource requirements on emulators of state-of-the-art noisy quantum +processors. We then apply error mitigation methods to improve the signal. We +further execute these quantum models on the Quantinuum H1-1 trapped-ion quantum +processor and observe very close agreement with noiseless exact simulation. +Finally, we perform feature attribution methods and find that the quantum +models indeed identify sensible relationships, at least as well as the +classical baselines. This work constitutes the first proof-of-concept +application of near-term quantum computing to a task critical to the design of +therapeutic proteins, opening the route toward larger-scale applications in +this and related fields, in line with the hardware development roadmaps of +near-term quantum technologies. + +
+
+
+
+
+ + ☆ Automated discovery of trade-off between utility, privacy and fairness + in machine learning models ECML 2023 + + +
Machine learning models are deployed as a central component in decision
+making and policy operations with direct impact on individuals' lives. In
+order to act ethically and comply with government regulations, these models
+need to make fair decisions and protect the users' privacy. However, such
+requirements can come with a decrease in model performance compared to
+potentially biased, privacy-leaking counterparts. Thus, a trade-off between
+fairness, privacy and performance of ML models emerges, and practitioners
+need a way of quantifying this trade-off to enable deployment decisions. In
+this work we interpret this trade-off as a multi-objective optimization
+problem, and propose PFairDP, a pipeline that uses Bayesian optimization for
+discovery of Pareto-optimal points between fairness, privacy and utility of
+ML models. We show how PFairDP can be used to replicate known results that
+were originally achieved through a manual constraint-setting process. We
+further demonstrate the effectiveness of PFairDP with experiments on multiple
+models and datasets.
+
+
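The Pareto-optimality notion used above can be made concrete with a generic non-dominated filter (this is not PFairDP itself, and the candidate scores below are made up): each candidate model is scored on three objectives to minimize, e.g. fairness violation, privacy budget, and one minus accuracy.

import numpy as np

def pareto_mask(costs: np.ndarray) -> np.ndarray:
    """Boolean mask of non-dominated rows (all objectives are to be minimized)."""
    n = costs.shape[0]
    mask = np.ones(n, dtype=bool)
    for i in range(n):
        if not mask[i]:
            continue
        dominated_by = np.all(costs <= costs[i], axis=1) & np.any(costs < costs[i], axis=1)
        if dominated_by.any():
            mask[i] = False
    return mask

candidates = np.array([
    [0.02, 1.0, 0.12],   # low bias, tight privacy, lower accuracy
    [0.05, 8.0, 0.08],   # looser privacy, better accuracy
    [0.06, 8.0, 0.09],   # dominated by the row above
    [0.01, 4.0, 0.10],
])
print(candidates[pareto_mask(candidates)])   # the Pareto-optimal trade-offs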
+
+ comment: 3rd Workshop on Bias and Fairness in AI (BIAS), ECML 2023 +
+
+
+
+
+ + ☆ The Battleship Approach to the Low Resource Entity Matching Problem + + +
+ Entity matching, a core data integration problem, is the task of deciding +whether two data tuples refer to the same real-world entity. Recent advances in +deep learning methods, using pre-trained language models, were proposed for +resolving entity matching. Although demonstrating unprecedented results, these +solutions suffer from a major drawback as they require large amounts of labeled +data for training, and, as such, are inadequate to be applied to low resource +entity matching problems. To overcome the challenge of obtaining sufficient +labeled data we offer a new active learning approach, focusing on a selection +mechanism that exploits unique properties of entity matching. We argue that a +distributed representation of a tuple pair indicates its informativeness when +considered among other pairs. This is used consequently in our approach that +iteratively utilizes space-aware considerations. Bringing it all together, we +treat the low resource entity matching problem as a Battleship game, hunting +indicative samples, focusing on positive ones, through awareness of the latent +space along with careful planning of next sampling iterations. An extensive +experimental analysis shows that the proposed algorithm outperforms +state-of-the-art active learning solutions to low resource entity matching, and +although using less samples, can be as successful as state-of-the-art fully +trained known algorithms. + +
+
+
+
+
+ + ☆ Information theoretic study of the neural geometry induced by category + learning NeurIPS 2023 + + +
+ Categorization is an important topic both for biological and artificial +neural networks. Here, we take an information theoretic approach to assess the +efficiency of the representations induced by category learning. We show that +one can decompose the relevant Bayesian cost into two components, one for the +coding part and one for the decoding part. Minimizing the coding cost implies +maximizing the mutual information between the set of categories and the neural +activities. We analytically show that this mutual information can be written as +the sum of two terms that can be interpreted as (i) finding an appropriate +representation space, and, (ii) building a representation with the appropriate +metrics, based on the neural Fisher information on this space. One main +consequence is that category learning induces an expansion of neural space near +decision boundaries. Finally, we provide numerical illustrations that show how +Fisher information of the coding neural population aligns with the boundaries +between categories. + +
+
+ comment: 7 pages, 2 figures, Accepted (Oral) to InfoCog@NeurIPS 2023 +
+
+
+
+
+ + ☆ Accelerating Hierarchical Associative Memory: A Deep Equilibrium + Approach NeurIPS + + +
+ Hierarchical Associative Memory models have recently been proposed as a +versatile extension of continuous Hopfield networks. In order to facilitate +future research on such models, especially at scale, we focus on increasing +their simulation efficiency on digital hardware. In particular, we propose two +strategies to speed up memory retrieval in these models, which corresponds to +their use at inference, but is equally important during training. First, we +show how they can be cast as Deep Equilibrium Models, which allows using faster +and more stable solvers. Second, inspired by earlier work, we show that +alternating optimization of the even and odd layers accelerates memory +retrieval by a factor close to two. Combined, these two techniques allow for a +much faster energy minimization, as shown in our proof-of-concept experimental +results. The code is available at https://github.com/cgoemaere/hamdeq + +
+
+ comment: Accepted at the "Associative Memory & Hopfield Networks'' workshop at + NeurIPS, 2023 +
+
+
+
+
+ + ☆ Regularization by Texts for Latent Diffusion Inverse Solvers + + +
The recent advent of diffusion models has led to significant progress in
+solving inverse problems, leveraging these models as effective generative
+priors. Nonetheless, challenges related to the ill-posed nature of such
+problems remain, often due to inherent ambiguities in measurements. Drawing
+inspiration from the human ability to resolve visual ambiguities through
+perceptual biases, here we introduce a novel latent diffusion inverse solver
+that incorporates regularization by texts (TReg). Specifically, TReg applies
+a textual description of the preconceived solution during the reverse
+sampling phase, and this description is dynamically reinforced through
+null-text optimization for adaptive negation. Our comprehensive experimental
+results demonstrate that TReg successfully mitigates ambiguity in latent
+diffusion inverse solvers, enhancing their effectiveness and accuracy.
+
+
+
+
+
+
+ + ☆ Universal Event Detection in Time Series + + +
+ In our previously published work, we introduced a supervised deep learning +method for event detection in multivariate time series data, employing +regression instead of binary classification. This simplification avoids the +need for point-wise labels throughout the entire dataset, relying solely on +ground truth events defined as time points or intervals. In this paper, we +establish mathematically that our method is universal, and capable of detecting +any type of event with arbitrary precision under mild continuity assumptions on +the time series. These events may encompass change points, frauds, anomalies, +physical occurrences, and more. We substantiate our theoretical results using +the universal approximation theorem for feed-forward neural networks (FFN). +Additionally, we provide empirical validations that confirm our claims, +demonstrating that our method, with a limited number of parameters, outperforms +other deep learning approaches, particularly for rare events and imbalanced +datasets from different domains. + +
+
+ comment: To be submitted to IEEE Transactions on Neural Networks and Learning + Systems +
+
+
+
+
+ + ☆ RoboGPT: an intelligent agent of making embodied long-term decisions for + daily instruction tasks + + +
Robotic agents must master common sense and long-term sequential decision
+making to solve daily tasks through natural language instruction. The
+developments in Large Language Models (LLMs) in natural language processing
+have inspired efforts to use LLMs in complex robot planning. Despite LLMs'
+great generalization and comprehension of instruction tasks, LLM-generated
+task plans sometimes lack feasibility and correctness. To address the
+problem, we propose a RoboGPT agent\footnote{our code and dataset will be
+released soon} for making embodied long-term decisions for daily tasks, with
+two modules: 1) LLM-based planning with re-planning to break the task into
+multiple sub-goals; and 2) RoboSkill, individually designed for sub-goals to
+learn better navigation and manipulation skills. The LLM-based planner,
+enhanced with a new robotic dataset and re-planning, is called RoboGPT. The
+new robotic dataset of 67k daily instruction tasks is gathered for
+fine-tuning the Llama model and obtaining RoboGPT. The RoboGPT planner, with
+strong generalization, can plan hundreds of daily instruction tasks.
+Additionally, a low-computational Re-Plan module is designed to allow plans
+to flexibly adapt to the environment, thereby addressing the nomenclature
+diversity challenge. The proposed RoboGPT agent outperforms SOTA methods on
+the ALFRED daily tasks. Moreover, the RoboGPT planner exceeds SOTA LLM-based
+planners like ChatGPT in task-planning rationality for hundreds of unseen
+daily tasks, and even tasks from other domains, while keeping the large
+model's original broad applicability and generality.
+
+
+
+
+
+
+ + ☆ Reinforcement Learning from Diffusion Feedback: Q* for Image Search + + +
+ Large vision-language models are steadily gaining personalization +capabilities at the cost of fine-tuning or data augmentation. We present two +models for image generation using model-agnostic learning that align semantic +priors with generative capabilities. RLDF, or Reinforcement Learning from +Diffusion Feedback, is a singular approach for visual imitation through +prior-preserving reward function guidance. This employs Q-learning (with +standard Q*) for generation and follows a semantic-rewarded trajectory for +image search through finite encoding-tailored actions. The second proposed +method, noisy diffusion gradient, is optimization driven. At the root of both +methods is a special CFG encoding that we propose for continual semantic +guidance. Using only a single input image and no text input, RLDF generates +high-quality images over varied domains including retail, sports and +agriculture showcasing class-consistency and strong visual diversity. Project +website is available at https://infernolia.github.io/RLDF. + +
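Since the approach above builds on standard Q-learning (with Q*), here is a refresher sketch of the tabular Q-learning update on a toy chain MDP. RLDF's actual states, actions and semantic rewards are far richer; this is only the underlying update rule.

import numpy as np

n_states, n_actions = 5, 2          # chain of 5 states; actions: left / right
Q = np.zeros((n_states, n_actions))
alpha, gamma, eps = 0.1, 0.95, 0.1
rng = np.random.default_rng(0)

def step(s, a):
    s_next = min(s + 1, n_states - 1) if a == 1 else max(s - 1, 0)
    reward = 1.0 if s_next == n_states - 1 else 0.0   # reward at the right end
    return s_next, reward

for episode in range(500):
    s = 0
    for _ in range(20):
        a = rng.integers(n_actions) if rng.random() < eps else int(np.argmax(Q[s]))
        s_next, r = step(s, a)
        # Standard Q-learning target uses max_a' Q(s', a'), i.e. the Q* estimate.
        Q[s, a] += alpha * (r + gamma * Q[s_next].max() - Q[s, a])
        s = s_next

print(np.argmax(Q, axis=1))          # learned greedy policy: mostly "right" (1)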
+
+
+
+
+ + ☆ Bandits Meet Mechanism Design to Combat Clickbait in Online + Recommendation + + +
+ We study a strategic variant of the multi-armed bandit problem, which we coin +the strategic click-bandit. This model is motivated by applications in online +recommendation where the choice of recommended items depends on both the +click-through rates and the post-click rewards. Like in classical bandits, +rewards follow a fixed unknown distribution. However, we assume that the +click-rate of each arm is chosen strategically by the arm (e.g., a host on +Airbnb) in order to maximize the number of times it gets clicked. The algorithm +designer does not know the post-click rewards nor the arms' actions (i.e., +strategically chosen click-rates) in advance, and must learn both values over +time. To solve this problem, we design an incentive-aware learning algorithm, +UCB-S, which achieves two goals simultaneously: (a) incentivizing desirable arm +behavior under uncertainty; (b) minimizing regret by learning unknown +parameters. We characterize all approximate Nash equilibria among arms under +UCB-S and show a $\tilde{\mathcal{O}} (\sqrt{KT})$ regret bound uniformly in +every equilibrium. We also show that incentive-unaware algorithms generally +fail to achieve low regret in the strategic click-bandit. Finally, we support +our theoretical results by simulations of strategic arm behavior which confirm +the effectiveness and robustness of our proposed incentive design. + +
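To make the feedback structure concrete, the sketch below simulates arms with separate click-rates and post-click rewards and runs a plain UCB index on the combined click*reward signal. The paper's UCB-S adds incentive-aware machinery for strategically chosen click-rates that is not reproduced here; all numbers are synthetic.

import numpy as np

rng = np.random.default_rng(0)
K, T = 5, 20000
click_rates = rng.uniform(0.2, 0.9, K)          # chosen by arms (strategic in the paper)
post_click_rewards = rng.uniform(0.1, 1.0, K)   # unknown mean payoff after a click

counts = np.zeros(K)
means = np.zeros(K)                             # running mean of click * reward
for t in range(1, T + 1):
    if t <= K:
        arm = t - 1                             # pull each arm once to initialize
    else:
        ucb = means + np.sqrt(2 * np.log(t) / counts)
        arm = int(np.argmax(ucb))
    clicked = rng.random() < click_rates[arm]
    reward = clicked * post_click_rewards[arm] * (rng.random() < 0.9)  # noisy payoff
    counts[arm] += 1
    means[arm] += (reward - means[arm]) / counts[arm]

best = int(np.argmax(click_rates * post_click_rewards))
print("most-pulled arm:", int(np.argmax(counts)), "expected-best arm:", best)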
+
+
+
+
+ + ☆ Injecting linguistic knowledge into BERT for Dialogue State Tracking + + +
+ Dialogue State Tracking (DST) models often employ intricate neural network +architectures, necessitating substantial training data, and their inference +processes lack transparency. This paper proposes a method that extracts +linguistic knowledge via an unsupervised framework and subsequently utilizes +this knowledge to augment BERT's performance and interpretability in DST tasks. +The knowledge extraction procedure is computationally economical and does not +necessitate annotations or additional training data. The injection of the +extracted knowledge necessitates the addition of only simple neural modules. We +employ the Convex Polytopic Model (CPM) as a feature extraction tool for DST +tasks and illustrate that the acquired features correlate with the syntactic +and semantic patterns in the dialogues. This correlation facilitates a +comprehensive understanding of the linguistic features influencing the DST +model's decision-making process. We benchmark this framework on various DST +tasks and observe a notable improvement in accuracy. + +
+
+
+
+
+ + ☆ VeryFL: A Verify Federated Learning Framework Embedded with Blockchain + + +
Blockchain-empowered federated learning (FL) has provoked extensive research
+recently. Various blockchain-based federated learning algorithms,
+architectures and mechanisms have been designed to solve issues like single
+points of failure and data falsification brought by the centralized FL
+paradigm. Moreover, it is easier to allocate incentives to nodes with the
+help of the blockchain. Various centralized federated learning frameworks,
+like FedML, have emerged in the community to help boost research on FL.
+However, a decentralized blockchain-based federated learning framework is
+still missing, which makes it inconvenient for researchers to reproduce or
+verify algorithm performance on a blockchain. Inspired by the above issues,
+we have designed and developed a blockchain-based federated learning
+framework by embedding an Ethereum network. This report presents the overall
+structure of this framework, which proposes a code practice paradigm for
+combining FL with blockchain while remaining compatible with normal FL
+training tasks. In addition to implementing several blockchain federated
+learning algorithms on smart contracts to help execute FL training, we also
+propose a model ownership authentication architecture based on blockchain and
+model watermarking to protect the intellectual property rights of models.
+These mechanisms on the blockchain provide underlying support for federated
+learning with verifiable training, aggregation and incentive distribution
+procedures, and thus we named this framework VeryFL (A Verify Federated
+Learning Framework Embedded with Blockchain). The source code is available at
+https://github.com/GTMLLab/VeryFL.
+
+
+
+
+
+
+ + ☆ Bayesian Approach to Linear Bayesian Networks + + +
+ This study proposes the first Bayesian approach for learning high-dimensional +linear Bayesian networks. The proposed approach iteratively estimates each +element of the topological ordering from backward and its parent using the +inverse of a partial covariance matrix. The proposed method successfully +recovers the underlying structure when Bayesian regularization for the inverse +covariance matrix with unequal shrinkage is applied. Specifically, it shows +that the number of samples $n = \Omega( d_M^2 \log p)$ and $n = \Omega(d_M^2 +p^{2/m})$ are sufficient for the proposed algorithm to learn linear Bayesian +networks with sub-Gaussian and 4m-th bounded-moment error distributions, +respectively, where $p$ is the number of nodes and $d_M$ is the maximum degree +of the moralized graph. The theoretical findings are supported by extensive +simulation studies including real data analysis. Furthermore the proposed +method is demonstrated to outperform state-of-the-art frequentist approaches, +such as the BHLSM, LISTEN, and TD algorithms in synthetic data. + +
+
+
+
+
+ + ☆ A manometric feature descriptor with linear-SVM to distinguish + esophageal contraction vigor + + +
In clinical practice, if a patient presents with nonmechanical obstructive
+dysphagia, esophageal chest pain, and gastroesophageal reflux symptoms, the
+physician will usually assess esophageal dynamic function. High-resolution
+manometry (HRM) is a clinically common technique for detecting esophageal
+dynamic function comprehensively and objectively. However, after the results
+of HRM are obtained, doctors still need to evaluate a variety of parameters.
+This work is burdensome, and the process is complex. We conducted image
+processing of HRM to predict esophageal contraction vigor and thereby assist
+the evaluation of esophageal dynamic function. First, we used
+Feature-Extraction and Histogram of Gradients (FE-HOG) to analyse features of
+the proposal of swallow (PoS) and further extract higher-order features. Then
+we classify esophageal contraction vigor as normal, weak or failed using a
+linear SVM on these features. Our dataset includes 3000 training samples, 500
+validation samples and 411 test samples. After verification, our accuracy
+reaches 86.83%, which is higher than that of other common machine learning
+methods.
+
+
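The generic HOG-plus-linear-SVM recipe described above can be sketched with scikit-image and scikit-learn. The arrays here are random stand-ins for swallow-proposal images rendered from HRM, and the authors' specific FE-HOG feature design is not reproduced.

import numpy as np
from skimage.feature import hog
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
images = rng.random((120, 64, 64))            # placeholder HRM swallow proposals
labels = rng.integers(0, 3, 120)              # 0: normal, 1: weak, 2: failed

# Histogram-of-gradients descriptor per image, then a linear SVM classifier.
features = np.array([
    hog(img, orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2))
    for img in images
])

X_tr, X_te, y_tr, y_te = train_test_split(features, labels, test_size=0.25, random_state=0)
clf = LinearSVC(C=1.0).fit(X_tr, y_tr)        # linear SVM, one-vs-rest over 3 classes
print("held-out accuracy:", clf.score(X_te, y_te))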
+
+
+
+
+ + ☆ QuickDrop: Efficient Federated Unlearning by Integrated Dataset + Distillation + + +
+ Federated Unlearning (FU) aims to delete specific training data from an ML +model trained using Federated Learning (FL). We introduce QuickDrop, an +efficient and original FU method that utilizes dataset distillation (DD) to +accelerate unlearning and drastically reduces computational overhead compared +to existing approaches. In QuickDrop, each client uses DD to generate a compact +dataset representative of the original training dataset, called a distilled +dataset, and uses this compact dataset during unlearning. To unlearn specific +knowledge from the global model, QuickDrop has clients execute Stochastic +Gradient Ascent with samples from the distilled datasets, thus significantly +reducing computational overhead compared to conventional FU methods. We further +increase the efficiency of QuickDrop by ingeniously integrating DD into the FL +training process. By reusing the gradient updates produced during FL training +for DD, the overhead of creating distilled datasets becomes close to +negligible. Evaluations on three standard datasets show that, with comparable +accuracy guarantees, QuickDrop reduces the duration of unlearning by 463.8x +compared to model retraining from scratch and 65.1x compared to existing FU +approaches. We also demonstrate the scalability of QuickDrop with 100 clients +and show its effectiveness while handling multiple unlearning operations. + +
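The core local primitive described above, taking gradient-ascent steps on a compact distilled dataset that represents the data to forget, can be sketched in PyTorch as follows. QuickDrop's dataset distillation and federated aggregation around this step are not reproduced; the model and data are placeholders.

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(10, 32), nn.ReLU(), nn.Linear(32, 2))
loss_fn = nn.CrossEntropyLoss()

# A small distilled stand-in for the client data that must be unlearned.
x_forget = torch.randn(16, 10)
y_forget = torch.randint(0, 2, (16,))

opt = torch.optim.SGD(model.parameters(), lr=0.05)
for _ in range(10):
    opt.zero_grad()
    loss = loss_fn(model(x_forget), y_forget)
    (-loss).backward()          # negated loss => stochastic gradient ASCENT
    opt.step()                  # the model now fits the forget set worse

print("loss on forget set after ascent:", loss_fn(model(x_forget), y_forget).item())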
+
+
+
+
+ + ☆ Optimal Clustering of Discrete Mixtures: Binomial, Poisson, Block + Models, and Multi-layer Networks + + +
+ In this paper, we first study the fundamental limit of clustering networks +when a multi-layer network is present. Under the mixture multi-layer stochastic +block model (MMSBM), we show that the minimax optimal network clustering error +rate, which takes an exponential form and is characterized by the Renyi +divergence between the edge probability distributions of the component +networks. We propose a novel two-stage network clustering method including a +tensor-based initialization algorithm involving both node and sample splitting +and a refinement procedure by likelihood-based Lloyd algorithm. Network +clustering must be accompanied by node community detection. Our proposed +algorithm achieves the minimax optimal network clustering error rate and allows +extreme network sparsity under MMSBM. Numerical simulations and real data +experiments both validate that our method outperforms existing methods. +Oftentimes, the edges of networks carry count-type weights. We then extend our +methodology and analysis framework to study the minimax optimal clustering +error rate for mixture of discrete distributions including Binomial, Poisson, +and multi-layer Poisson networks. The minimax optimal clustering error rates in +these discrete mixtures all take the same exponential form characterized by the +Renyi divergences. These optimal clustering error rates in discrete mixtures +can also be achieved by our proposed two-stage clustering algorithm. + +
+
+
+
+
+ + ☆ UniRepLKNet: A Universal Perception Large-Kernel ConvNet for Audio, + Video, Point Cloud, Time-Series and Image Recognition + + +
+ Large-kernel convolutional neural networks (ConvNets) have recently received +extensive research attention, but there are two unresolved and critical issues +that demand further investigation. 1) The architectures of existing +large-kernel ConvNets largely follow the design principles of conventional +ConvNets or transformers, while the architectural design for large-kernel +ConvNets remains under-addressed. 2) As transformers have dominated multiple +modalities, it remains to be investigated whether ConvNets also have a strong +universal perception ability in domains beyond vision. In this paper, we +contribute from two aspects. 1) We propose four architectural guidelines for +designing large-kernel ConvNets, the core of which is to exploit the essential +characteristics of large kernels that distinguish them from small kernels - +they can see wide without going deep. Following such guidelines, our proposed +large-kernel ConvNet shows leading performance in image recognition. For +example, our models achieve an ImageNet accuracy of 88.0%, ADE20K mIoU of +55.6%, and COCO box AP of 56.4%, demonstrating better performance and higher +speed than a number of recently proposed powerful competitors. 2) We discover +that large kernels are the key to unlocking the exceptional performance of +ConvNets in domains where they were originally not proficient. With certain +modality-related preprocessing approaches, the proposed model achieves +state-of-the-art performance on time-series forecasting and audio recognition +tasks even without modality-specific customization to the architecture. Code +and all the models at https://github.com/AILab-CVC/UniRepLKNet. + +
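A generic large-kernel building block in PyTorch: a depthwise convolution with a big kernel ("sees wide without going deep") followed by pointwise convolutions and a residual connection. This is the common pattern such architectures use, not UniRepLKNet's exact block, which follows additional guidelines and re-parameterization tricks.

import torch
import torch.nn as nn

class LargeKernelBlock(nn.Module):
    def __init__(self, dim: int, kernel_size: int = 13, expansion: int = 4):
        super().__init__()
        self.dw = nn.Conv2d(dim, dim, kernel_size, padding=kernel_size // 2,
                            groups=dim, bias=False)      # depthwise large kernel
        self.norm = nn.BatchNorm2d(dim)
        self.pw1 = nn.Conv2d(dim, dim * expansion, 1)    # pointwise expand
        self.act = nn.GELU()
        self.pw2 = nn.Conv2d(dim * expansion, dim, 1)    # pointwise project

    def forward(self, x):
        return x + self.pw2(self.act(self.pw1(self.norm(self.dw(x)))))

x = torch.randn(2, 64, 32, 32)
print(LargeKernelBlock(64)(x).shape)   # torch.Size([2, 64, 32, 32])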
+
+ comment: Code, all the models and reproducible training scripts at + https://github.com/AILab-CVC/UniRepLKNet +
+
+
+
+
+ + ☆ Quantum Langevin Dynamics for Optimization + + +
We initiate the study of utilizing Quantum Langevin Dynamics (QLD) to solve
+optimization problems, particularly those non-convex objective functions that
+present substantial obstacles for traditional gradient descent algorithms.
+Specifically, we examine the dynamics of a system coupled with an infinite
+heat bath. This interaction induces both random quantum noise and a
+deterministic damping effect on the system, which nudge the system towards a
+steady state that hovers near the global minimum of the objective function.
+We theoretically prove the convergence of QLD in convex landscapes,
+demonstrating that the average energy of the system can approach zero in the
+low temperature limit with an exponential decay rate correlated with the
+evolution time. Numerically, we first show the energy dissipation capability
+of QLD by retracing its origins to spontaneous emission. Furthermore, we
+conduct a detailed discussion of the impact of each parameter. Finally, based
+on the observations from comparing QLD with the classical
+Fokker-Planck-Smoluchowski equation, we propose a time-dependent QLD by
+making the temperature and $\hbar$ time-dependent parameters, which can be
+theoretically proven to converge better than the time-independent case and
+which also outperforms a series of state-of-the-art quantum and classical
+optimization algorithms in many non-convex landscapes.
+
+
+
+ comment: 33 pages, 1 table, 26 figures +
+
+
+
+
+ + ☆ A deep learning approach for marine snow synthesis and removal + + +
+ Marine snow, the floating particles in underwater images, severely degrades +the visibility and performance of human and machine vision systems. This paper +proposes a novel method to reduce the marine snow interference using deep +learning techniques. We first synthesize realistic marine snow samples by +training a Generative Adversarial Network (GAN) model and combine them with +natural underwater images to create a paired dataset. We then train a U-Net +model to perform marine snow removal as an image to image translation task. Our +experiments show that the U-Net model can effectively remove both synthetic and +natural marine snow with high accuracy, outperforming state-of-the-art methods +such as the Median filter and its adaptive variant. We also demonstrate the +robustness of our method by testing it on the MSRB dataset, which contains +synthetic artifacts that our model has not seen during training. Our method is +a practical and efficient solution for enhancing underwater images affected by +marine snow. + +
+
+
+
+
+ + ☆ A Simple Geometric-Aware Indoor Positioning Interpolation Algorithm + Based on Manifold Learning + + +
+ Interpolation methodologies have been widely used within the domain of indoor +positioning systems. However, existing indoor positioning interpolation +algorithms exhibit several inherent limitations, including reliance on complex +mathematical models, limited flexibility, and relatively low precision. To +enhance the accuracy and efficiency of indoor positioning interpolation +techniques, this paper proposes a simple yet powerful geometric-aware +interpolation algorithm for indoor positioning tasks. The key to our algorithm +is to exploit the geometric attributes of the local topological manifold using +manifold learning principles. Therefore, instead of constructing complicated +mathematical models, the proposed algorithm facilitates the more precise and +efficient estimation of points grounded in the local topological manifold. +Moreover, our proposed method can be effortlessly integrated into any indoor +positioning system, thereby bolstering its adaptability. Through a systematic +array of experiments and comprehensive performance analyses conducted on both +simulated and real-world datasets, we demonstrate that the proposed algorithm +consistently outperforms the most commonly used and representative +interpolation approaches regarding interpolation accuracy and efficiency. +Furthermore, the experimental results also underscore the substantial practical +utility of our method and its potential applicability in real-time indoor +positioning scenarios. + +
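For context, the baseline pattern that such positioning interpolators refine can be sketched as estimating a query's coordinates from its k nearest reference fingerprints with inverse-distance weights. The method above derives its weights from the local manifold geometry instead; everything below (fingerprints, dimensions, weights) is a made-up illustration.

import numpy as np

def interpolate_position(query_fp, ref_fps, ref_xy, k=4, eps=1e-9):
    d = np.linalg.norm(ref_fps - query_fp, axis=1)      # distance in signal space
    nn = np.argsort(d)[:k]                              # k nearest reference fingerprints
    w = 1.0 / (d[nn] + eps)                             # inverse-distance weights
    w /= w.sum()
    return w @ ref_xy[nn]                               # weighted average position

rng = np.random.default_rng(0)
ref_xy = rng.uniform(0, 20, (50, 2))                          # known reference positions
ref_fps = np.hstack([ref_xy, rng.normal(0, 0.2, (50, 3))])    # toy 5-D fingerprints
query = np.array([10.0, 5.0, 0.0, 0.0, 0.0])
print(interpolate_position(query, ref_fps, ref_xy, k=4))      # approx. (10, 5)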
+
+
+
+
+ + ☆ Lightly Weighted Automatic Audio Parameter Extraction for the Quality + Assessment of Consensus Auditory-Perceptual Evaluation of Voice + + +
The Consensus Auditory-Perceptual Evaluation of Voice is a widely employed
+tool in clinical voice quality assessment that is significant for
+streamlining communication among clinical professionals and for benchmarking
+the determination of further treatment. Currently, because the assessment
+relies on experienced clinicians, it tends to be inconsistent and thus
+difficult to standardize. To address this problem, we propose to leverage
+lightly weighted automatic audio parameter extraction to increase the
+clinical relevance, reduce the complexity, and enhance the interpretability
+of voice quality assessment. The proposed method utilizes age, sex, and five
+audio parameters: jitter, absolute jitter, shimmer, harmonic-to-noise ratio
+(HNR), and zero crossing. A classical machine learning approach is employed.
+The results reveal that our approach performs similarly to state-of-the-art
+(SOTA) methods and outperforms the latent representations obtained from
+popular pre-trained audio models. This approach provides insight into the
+feasibility of different feature extraction approaches for voice evaluation.
+Audio parameters such as jitter and the HNR are shown to be suitable for
+characterizing voice quality attributes such as roughness and strain.
+Conversely, pre-trained models exhibit limitations in effectively addressing
+noise-related scorings. This study contributes toward more comprehensive and
+precise voice quality evaluations, achieved by comprehensively exploring
+diverse assessment methodologies.
+
+
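Two of the parameters listed above can be illustrated with plain numpy on a synthetic vowel-like signal: the zero-crossing rate and a crude local jitter estimate from cycle-to-cycle period variation. Clinical work would use validated extractors (e.g. Praat-based tools); this only shows the idea.

import numpy as np

rng = np.random.default_rng(0)
sr = 16000
t = np.arange(0, 1.0, 1 / sr)
phase_noise = 0.02 * rng.standard_normal(t.size).cumsum()     # slow random phase drift
signal = np.sin(2 * np.pi * 120 * t + phase_noise)            # jittery ~120 Hz tone

# Zero-crossing rate: fraction of consecutive samples with a sign change.
zcr = np.mean(np.abs(np.diff(np.signbit(signal).astype(int))))

# Jitter (local): mean absolute difference of consecutive periods over the
# mean period, with periods measured between upward zero crossings.
up = np.where((signal[:-1] < 0) & (signal[1:] >= 0))[0]
periods = np.diff(up) / sr
jitter_local = np.mean(np.abs(np.diff(periods))) / np.mean(periods)

print(f"zero-crossing rate: {zcr:.4f}, local jitter: {100 * jitter_local:.3f}%")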
+
+ comment: Published in IEEE 42th International Conference on Consumer + Electronics (ICCE 2024) +
+
+
+
+
+ + ☆ Experimental Analysis of Large-scale Learnable Vector Storage + Compression + + +
+ Learnable embedding vector is one of the most important applications in +machine learning, and is widely used in various database-related domains. +However, the high dimensionality of sparse data in recommendation tasks and the +huge volume of corpus in retrieval-related tasks lead to a large memory +consumption of the embedding table, which poses a great challenge to the +training and deployment of models. Recent research has proposed various methods +to compress the embeddings at the cost of a slight decrease in model quality or +the introduction of other overheads. Nevertheless, the relative performance of +these methods remains unclear. Existing experimental comparisons only cover a +subset of these methods and focus on limited metrics. In this paper, we perform +a comprehensive comparative analysis and experimental evaluation of embedding +compression. We introduce a new taxonomy that categorizes these techniques +based on their characteristics and methodologies, and further develop a modular +benchmarking framework that integrates 14 representative methods. Under a +uniform test environment, our benchmark fairly evaluates each approach, +presents their strengths and weaknesses under different memory budgets, and +recommends the best method based on the use case. In addition to providing +useful guidelines, our study also uncovers the limitations of current methods +and suggests potential directions for future research. + +
+
+
+
+
+ + ☆ UFDA: Universal Federated Domain Adaptation with Practical Assumptions AAAI2024 + + +
+ Conventional Federated Domain Adaptation (FDA) approaches usually demand an +abundance of assumptions, such as label set consistency, which makes them +significantly less feasible for real-world situations and introduces security +hazards. In this work, we propose a more practical scenario named Universal +Federated Domain Adaptation (UFDA). It only requires the black-box model and +the label set information of each source domain, while the label sets of +different source domains could be inconsistent and the target-domain label set +is totally blind. This relaxes the assumptions made by FDA, which are often +challenging to meet in real-world cases and diminish model security. To address +the UFDA scenario, we propose a corresponding framework called Hot-Learning +with Contrastive Label Disambiguation (HCLD), which tackles UFDA's domain +shifts and category gaps problem by using one-hot outputs from the black-box +models of various source domains. Moreover, to better distinguish the shared +and unknown classes, we further present a cluster-level strategy named +Mutual-Voting Decision (MVD) to extract robust consensus knowledge across peer +classes from both source and target domains. The extensive experiments on three +benchmarks demonstrate that our HCLD achieves comparable performance for our +UFDA scenario with much fewer assumptions, compared to the previous +methodologies with many additional assumptions. + +
+
+ comment: Submitted to AAAI2024 +
+
+
+
+
+ + ☆ SpotServe: Serving Generative Large Language Models on Preemptible + Instances ASPLOS 2024 + + +
+ The high computational and memory requirements of generative large language +models (LLMs) make it challenging to serve them cheaply. This paper aims to +reduce the monetary cost for serving LLMs by leveraging preemptible GPU +instances on modern clouds, which offer access to spare GPUs at a much +cheaper price than regular instances but may be preempted by the cloud at any +time. Serving LLMs on preemptible instances requires addressing challenges +induced by frequent instance preemptions and the necessity of migrating +instances to handle these preemptions. + This paper presents SpotServe, the first distributed LLM serving system on +preemptible instances. Several key techniques in SpotServe realize fast and +reliable serving of generative LLMs on cheap preemptible instances. First, +SpotServe dynamically adapts the LLM parallelization configuration for dynamic +instance availability and fluctuating workload, while balancing the trade-off +among the overall throughput, inference latency and monetary costs. Second, to +minimize the cost of migrating instances for dynamic reparallelization, the +task of migrating instances is formulated as a bipartite graph matching +problem, which is solved with the Kuhn-Munkres algorithm to identify an optimal migration +plan that minimizes communications. Finally, to take advantage of the grace +period offered by modern clouds, we introduce stateful inference recovery, a +new inference mechanism that commits inference progress at a much finer +granularity and allows SpotServe to cheaply resume inference upon preemption. +We evaluate SpotServe on real spot instance preemption traces and various popular LLMs +and show that SpotServe can reduce the P99 tail latency by 2.4 - 9.1x compared +with the best existing LLM serving systems. We also show that SpotServe can +leverage the price advantage of preemptible instances, saving 54% monetary cost +compared with only using on-demand instances. + +
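The bipartite-matching step can be illustrated with an off-the-shelf Kuhn-Munkres (Hungarian) solver; the cost matrix below is a made-up example of per-shard migration costs, not SpotServe's actual formulation.

```python
import numpy as np
from scipy.optimize import linear_sum_assignment

# Hypothetical migration-cost matrix: cost[i, j] is the amount of model/KV-cache
# state (in GB) that must be transferred if surviving instance j takes over the
# parallel shard previously held by logical position i.
cost = np.array([
    [0.0, 4.0, 6.0],
    [4.0, 0.0, 5.0],
    [7.0, 5.0, 0.0],
])

rows, cols = linear_sum_assignment(cost)   # Kuhn-Munkres / Hungarian algorithm
plan = list(zip(rows.tolist(), cols.tolist()))
print("migration plan (shard -> instance):", plan)
print("total data moved:", cost[rows, cols].sum(), "GB")
```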
+
+ comment: ASPLOS 2024 +
+
+
+
+
+ + ☆ Evaluating the Efficacy of Hybrid Deep Learning Models in Distinguishing + AI-Generated Text + + +
+ My research investigates the use of cutting-edge hybrid deep learning models +to accurately differentiate between AI-generated text and human writing. I +applied a robust methodology, utilising a carefully selected dataset comprising +AI and human texts from various sources, each tagged with instructions. +Advanced natural language processing techniques facilitated the analysis of +textual features. By combining sophisticated neural networks, the custom model +was able to detect nuanced differences between AI-generated and human-written content. + +
+
+
+
+
+ + ☆ Instruct2Attack: Language-Guided Semantic Adversarial Attacks + + +
+ We propose Instruct2Attack (I2A), a language-guided semantic attack that +generates semantically meaningful perturbations according to free-form language +instructions. We make use of state-of-the-art latent diffusion models, where we +adversarially guide the reverse diffusion process to search for an adversarial +latent code conditioned on the input image and text instruction. Compared to +existing noise-based and semantic attacks, I2A generates more natural and +diverse adversarial examples while providing better controllability and +interpretability. We further automate the attack process with GPT-4 to generate +diverse image-specific text instructions. We show that I2A can successfully +break state-of-the-art deep neural networks even under strong adversarial +defenses, and demonstrate great transferability among a variety of network +architectures. + +
+
+ comment: under submission, code coming soon +
+
+
+
+
+ + ☆ From Prediction to Action: The Critical Role of Proper Performance + Estimation for Machine-Learning-Driven Materials Discovery + + +
+ Materials discovery driven by statistical property models is an iterative +decision process, during which an initial data collection is extended with new +data proposed by a model-informed acquisition function--with the goal to +maximize a certain "reward" over time, such as the maximum property value +discovered so far. While the materials science community achieved much progress +in developing property models that predict well on average with respect to the +training distribution, this form of in-distribution performance measurement is +not directly coupled with the discovery reward. This is because an iterative +discovery process has a shifting reward distribution that is +over-proportionally determined by the model performance for exceptional +materials. We demonstrate this problem using the example of bulk modulus +maximization among double perovskite oxides. We find that the in-distribution +predictive performance suggests random forests as superior to Gaussian process +regression, while the results are inverse in terms of the discovery rewards. We +argue that the lack of proper performance estimation methods from pre-computed +data collections is a fundamental problem for improving data-driven materials +discovery, and we propose a novel such estimator that, in contrast to na\"ive +reward estimation, successfully predicts Gaussian processes with the "expected +improvement" acquisition function as the best out of four options in our +demonstrational study for double perovskites. Importantly, it does so without +requiring the over thousand ab initio computations that were needed to confirm +this prediction. + +
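For reference, the "expected improvement" acquisition function mentioned above has a standard closed form under a Gaussian posterior; a small sketch (with made-up predicted bulk moduli and uncertainties) is shown below.

```python
import numpy as np
from scipy.stats import norm

def expected_improvement(mu, sigma, best_so_far, xi=0.01):
    """Expected improvement of candidates with Gaussian-process posterior mean
    `mu` and standard deviation `sigma`, relative to the best property value
    observed so far (maximization convention)."""
    sigma = np.maximum(sigma, 1e-12)
    z = (mu - best_so_far - xi) / sigma
    return (mu - best_so_far - xi) * norm.cdf(z) + sigma * norm.pdf(z)

# Toy usage: pick the next candidate material to compute (values are made up).
mu = np.array([310.0, 295.0, 330.0])      # predicted bulk moduli (GPa)
sigma = np.array([5.0, 30.0, 2.0])        # predictive uncertainties
print("next candidate index:",
      int(np.argmax(expected_improvement(mu, sigma, best_so_far=325.0))))
```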
+
+
+
+
+ + ☆ Deficiency of Large Language Models in Finance: An Empirical Examination + of Hallucination + + +
+ The hallucination issue is recognized as a fundamental deficiency of large +language models (LLMs), especially when applied to fields such as finance, +education, and law. Despite the growing concerns, there has been a lack of +empirical investigation. In this paper, we provide an empirical examination of +LLMs' hallucination behaviors in financial tasks. First, we empirically +investigate LLMs' ability to explain financial concepts and +terminologies. Second, we assess LLMs' capacity to query historical +stock prices. Third, to alleviate the hallucination issue, we evaluate the +efficacy of four practical methods, including few-shot learning, Decoding by +Contrasting Layers (DoLa), Retrieval-Augmented Generation (RAG), +and prompt-based tool learning with a function that generates query +commands. Finally, our major finding is that off-the-shelf LLMs experience +serious hallucination behaviors in financial tasks. Therefore, there is an +urgent need for research efforts on mitigating LLMs' hallucination. + +
+
+
+
+
+ + ☆ Dataset Distillation in Latent Space + + +
+ Dataset distillation (DD) is a newly emerging research area aiming at +alleviating the heavy computational load in training models on large datasets. +It tries to distill a large dataset into a small and condensed one so that +models trained on the distilled dataset can perform comparably with those +trained on the full dataset when performing downstream tasks. Among the +previous works in this area, there are three key problems that hinder the +performance and availability of the existing DD methods: high time complexity, +high space complexity, and low info-compactness. In this work, we +simultaneously attempt to settle these three problems by moving the DD +processes from conventionally used pixel space to latent space. Encoded by a +pretrained generic autoencoder, latent codes in the latent space are naturally +info-compact representations of the original images in much smaller sizes. +After transferring three mainstream DD algorithms to latent space, we +significantly reduce time and space consumption while achieving similar +performance, allowing us to distill high-resolution datasets or target +greater data ratios at which previous methods have failed. Besides, within the same +storage budget, we can also quantitatively deliver more latent codes than +pixel-level images, which further boosts the performance of our methods. + +
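A toy sketch of the core idea of moving distillation into latent space: encode the real images once with a (here, stand-in) pretrained autoencoder and optimize synthetic latent codes instead of pixels. The encoder architecture and sizes below are illustrative assumptions, not the paper's.

```python
import torch
import torch.nn as nn

# Stand-in for a pretrained generic autoencoder; in practice the encoder would
# be a frozen, pretrained model (names and sizes here are illustrative).
class TinyEncoder(nn.Module):
    def __init__(self, in_dim=3 * 32 * 32, latent_dim=64):
        super().__init__()
        self.net = nn.Sequential(nn.Flatten(), nn.Linear(in_dim, latent_dim))

    def forward(self, x):
        return self.net(x)

encoder = TinyEncoder().eval()
images = torch.randn(100, 3, 32, 32)           # toy "full dataset"

with torch.no_grad():
    latents = encoder(images)                  # (100, 64) compact latent codes

# The synthetic (distilled) set now lives in latent space: far fewer values to
# optimize per example than raw 3x32x32 pixels, which is where the time and
# space savings described above come from.
synthetic = latents[:10].clone().requires_grad_(True)
print(images[0].numel(), "pixels vs", synthetic[0].numel(), "latent values per example")
```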
+
+ comment: Under review +
+
+
+
+
+ + ☆ Out-of-Distribution Generalized Dynamic Graph Neural Network for Human + Albumin Prediction + + +
+ Human albumin is essential for indicating the body's overall health. +Accurately predicting plasma albumin levels and determining appropriate doses +are urgent clinical challenges, particularly in critically ill patients, to +maintain optimal blood levels. However, human albumin prediction is non-trivial, +as it has to leverage the dynamics of biochemical markers as well as the +experience of treating patients. Moreover, the problem of distribution shift is +often encountered in real clinical data, which may lead to a decline in the +model prediction performance and reduce the reliability of the model's +application. In this paper, we propose a framework named Out-of-Distribution +Generalized Dynamic Graph Neural Network for Human Albumin Prediction +(DyG-HAP), which is able to provide accurate albumin predictions for Intensive +Care Unit (ICU) patients during hospitalization. We first model human albumin +prediction as a dynamic graph regression problem to capture the dynamics and +patient relationships. Then, we propose a disentangled dynamic graph attention +mechanism to capture and disentangle the patterns whose relationship to labels +under distribution shifts is invariant and variant respectively. Last, we +propose an invariant dynamic graph regression method to encourage the model to +rely on invariant patterns to make predictions. Moreover, we propose a dataset +named Albumin level testing and nutritional dosing data for Intensive Care +(ANIC) for evaluation. Extensive experiments demonstrate the superiority of our +method compared to several baseline methods in human albumin prediction. + +
+
+ comment: MedAI'23 +
+
+
+
+
+ + ☆ SVRDA: A Web-based Dataset Annotation Tool for Slice-to-Volume + Registration + + +
+ Background and Objective: The lack of benchmark datasets has impeded the +development of slice-to-volume registration algorithms. Such datasets are +difficult to annotate, primarily due to the dimensional difference within data +and the dearth of task-specific software. We aim to develop a user-friendly +tool to streamline dataset annotation for slice-to-volume registration. + Methods: The proposed tool, named SVRDA, is an installation-free web +application for platform-agnostic collaborative dataset annotation. It enables +efficient transformation manipulation via keyboard shortcuts and smooth case +transitions with auto-saving. SVRDA supports configuration-based data loading +and adheres to the separation of concerns, offering great flexibility and +extensibility for future research. Various supplementary features have been +implemented to facilitate slice-to-volume registration. + Results: We validated the effectiveness of SVRDA by indirectly evaluating the +post-registration segmentation quality on UK Biobank data, observing a dramatic +overall improvement (24.02% in the Dice Similarity Coefficient and 48.93% in +the 95th percentile Hausdorff distance, respectively) supported by highly +statistically significant evidence ($p<0.001$).We further showcased the +clinical usage of SVRDA by integrating it into test-retest T1 quantification on +in-house magnetic resonance images, leading to more consistent results after +registration. + Conclusions: SVRDA can facilitate collaborative annotation of benchmark +datasets while being potentially applicable to other pipelines incorporating +slice-to-volume registration. Full source code and documentation are available +at https://github.com/Roldbach/SVRDA + +
+
+ comment: 18 pages, 11 figures, In submission to Computer Methods and Programs + in Biomedicine +
+
+
+
+
+ + ☆ SSIN: Self-Supervised Learning for Rainfall Spatial Interpolation SIGMOD 2023 + + +
+ The acquisition of accurate rainfall distribution in space is an important +task in hydrological analysis and natural disaster pre-warning. However, it is +impossible to install rain gauges on every corner. Spatial interpolation is a +common way to infer rainfall distribution based on available raingauge data. +However, the existing works rely on some unrealistic pre-settings to capture +spatial correlations, which limits their performance in real scenarios. To +tackle this issue, we propose the SSIN, which is a novel data-driven +self-supervised learning framework for rainfall spatial interpolation by mining +latent spatial patterns from historical observation data. Inspired by the Cloze +task and BERT, we fully consider the characteristics of spatial interpolation +and design the SpaFormer model based on the Transformer architecture as the +core of SSIN. Our main idea is: by constructing rich self-supervision signals +via random masking, SpaFormer can learn informative embeddings for raw data and +then adaptively model spatial correlations based on rainfall spatial context. +Extensive experiments on two real-world raingauge datasets show that our method +outperforms the state-of-the-art solutions. In addition, we take traffic +spatial interpolation as another use case to further explore the performance of +our method, and SpaFormer achieves the best performance on one large real-world +traffic dataset, which further confirms the effectiveness and generality of our +method. + +
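A minimal illustration of the Cloze-style self-supervision signal described above: randomly mask a subset of gauge readings and use the masked values as free reconstruction targets. The readings and mask ratio are made up, and the actual SpaFormer model is not reproduced here.

```python
import numpy as np

rng = np.random.default_rng(0)

# Toy rainfall observations at 8 gauges for one time step (mm).
readings = np.array([1.2, 0.0, 3.4, 2.1, 0.5, 4.0, 0.0, 1.8])

# Cloze-style self-supervision: randomly hide a subset of gauges and ask the
# model to reconstruct them from the remaining spatial context.
mask = rng.random(readings.shape) < 0.25        # ~25% of gauges hidden
model_input = np.where(mask, np.nan, readings)  # masked values replaced by NaN
targets = readings[mask]                        # supervision comes for free

print("input :", model_input)
print("target:", targets)
```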
+
+ comment: SIGMOD 2023 Data-intensive Applications (DIA) Track; Code is + available at https://github.com/jlidw/SSIN +
+
+
+
+
+ + ☆ Active Foundational Models for Fault Diagnosis of Electrical Motors + + +
+ Fault detection and diagnosis of electrical motors are of utmost importance +in ensuring the safe and reliable operation of several industrial systems. +Detection and diagnosis of faults at the incipient stage allows corrective +actions to be taken in order to reduce the severity of faults. The existing +data-driven deep learning approaches for machine fault diagnosis rely +extensively on huge amounts of labeled samples, where annotations are expensive +and time-consuming. However, a major portion of unlabeled condition monitoring +data is not exploited in the training process. To overcome this limitation, we +propose a foundational model-based Active Learning framework that utilizes less +amount of labeled samples, which are most informative and harnesses a large +amount of available unlabeled data by effectively combining Active Learning and +Contrastive Self-Supervised Learning techniques. It consists of a transformer +network-based backbone model trained using an advanced nearest-neighbor +contrastive self-supervised learning method. This approach empowers the +backbone to learn improved representations of samples derived from raw, +unlabeled vibration data. Subsequently, the backbone can undergo fine-tuning to +address a range of downstream tasks, both within the same machines and across +different machines. The effectiveness of the proposed methodology has been +assessed through the fine-tuning of the backbone for multiple target tasks +using three distinct machine-bearing fault datasets. The experimental +evaluation demonstrates a superior performance as compared to existing +state-of-the-art fault diagnosis methods with less amount of labeled data. + +
+
+ comment: 30 pages, 2 figures, 7 tables +
+
+
+
+
+ + ☆ A Comparative and Experimental Study on Automatic Question Answering + Systems and its Robustness against Word Jumbling + + +
+ Question answer generation using Natural Language Processing models is +ubiquitous in the world around us. It is used in many use cases such as the +building of chatbots, suggestive prompts in Google Search, and as a way of +navigating information in mobile banking applications. It is highly +relevant because a frequently asked questions (FAQ) list can only have a finite +number of questions, but a model which can perform question answer generation +could answer completely new questions that are within the scope of +the data. This helps us answer new questions accurately as long +as they are relevant. In commercial applications, it can be used to +increase customer satisfaction and ease of usage. However, a lot of data is +generated by humans, so it is susceptible to human error, which can adversely +affect a model's performance; we investigate this through our work. + +
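A small sketch of the word-jumbling perturbation used to probe robustness: shuffle the interior letters of randomly chosen words while keeping the first and last letters fixed. The example question and jumbling probability are illustrative.

```python
import random

def jumble_word(word, rng):
    """Shuffle the interior letters of a word, keeping first/last letters fixed."""
    if len(word) <= 3:
        return word
    middle = list(word[1:-1])
    rng.shuffle(middle)
    return word[0] + "".join(middle) + word[-1]

def jumble_question(question, p=0.5, seed=0):
    """Jumble each word of a question independently with probability p."""
    rng = random.Random(seed)
    return " ".join(
        jumble_word(w, rng) if rng.random() < p else w
        for w in question.split()
    )

print(jumble_question("What is the interest rate on a fixed deposit account?"))
```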
+
+
+
+
+ + ☆ Learning with Complementary Labels Revisited: A Consistent Approach via + Negative-Unlabeled Learning + + +
+ Complementary-label learning is a weakly supervised learning problem in which +each training example is associated with one or multiple complementary labels +indicating the classes to which it does not belong. Existing consistent +approaches have relied on the uniform distribution assumption to model the +generation of complementary labels, or on an ordinary-label training set to +estimate the transition matrix. However, both conditions may not be satisfied +in real-world scenarios. In this paper, we propose a novel complementary-label +learning approach that does not rely on these conditions. We find that +complementary-label learning can be expressed as a set of negative-unlabeled +binary classification problems when using the one-versus-rest strategy. This +observation allows us to propose a risk-consistent approach with theoretical +guarantees. Furthermore, we introduce a risk correction approach to address +overfitting problems when using complex models. We also prove the statistical +consistency and convergence rate of the corrected risk estimator. Extensive +experimental results on both synthetic and real-world benchmark datasets +validate the superiority of our proposed approach over state-of-the-art +methods. + +
+
+
+
+
+ + ☆ Function-constrained Program Synthesis NeurIPS + + +
+ This work introduces (1) a technique that allows large language models (LLMs) +to leverage user-provided code when solving programming tasks and (2) a method +to iteratively generate modular sub-functions that can aid future code +generation attempts when the initial code generated by the LLM is inadequate. +Generating computer programs in general-purpose programming languages like +Python poses a challenge for LLMs when instructed to use code provided in the +prompt. Code-specific LLMs (e.g., GitHub Copilot, CodeLlama2) can generate code +completions in real-time by drawing on all code available in a development +environment. However, restricting code-specific LLMs to use only in-context +code is not straightforward, as the model is not explicitly instructed to use +the user-provided code and users cannot highlight precisely which snippets of +code the model should incorporate into its context. Moreover, current systems +lack effective recovery methods, forcing users to iteratively re-prompt the +model with modified prompts until a sufficient solution is reached. Our method +differs from traditional LLM-powered code-generation by constraining +code-generation to an explicit function set and enabling recovery from failed +attempts through automatically generated sub-functions. When the LLM cannot +produce working code, we generate modular sub-functions to aid subsequent +attempts at generating functional code. A by-product of our method is a library +of reusable sub-functions that can solve related tasks, imitating a software +team where efficiency scales with experience. We also introduce a new +"half-shot" evaluation paradigm that provides tighter estimates of LLMs' coding +abilities compared to traditional zero-shot evaluation. Our proposed evaluation +method encourages models to output solutions in a structured format, decreasing +syntax errors that can be mistaken for poor coding ability. + +
+
+ comment: 17 pages, 6 figures, 2023 NeurIPS R0-Fomo Workshop +
+
+
+
+
+ + ☆ Adaptive Image Registration: A Hybrid Approach Integrating Deep Learning + and Optimization Functions for Enhanced Precision + + +
+ Image registration has traditionally been done using two distinct approaches: +learning-based methods, relying on robust deep neural networks, and +optimization-based methods, applying complex mathematical transformations to +warp images accordingly. Both paradigms offer advantages and +disadvantages, and, in this work, we seek to combine their respective strengths +into a single streamlined framework, using the outputs of the learning-based +method as initial parameters for optimization while prioritizing computational +power for the image pairs that incur the greatest loss. Our investigations +showed an improvement of 0.3\% in testing when utilizing the best +performing state-of-the-art model as the backbone of the framework, while +maintaining the same inference time and with only a 0.8\% loss in deformation +field smoothness. + +
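A toy sketch of the hybrid idea (not the paper's models): a hypothetical network prediction serves as the initial transform, and a classical optimizer then refines it against an image-similarity cost. Here the transform is a simple 2-D translation.

```python
import numpy as np
from scipy.optimize import minimize
from scipy.ndimage import shift

def mse_after_shift(params, moving, fixed):
    """Similarity cost: mean squared error after translating the moving image."""
    return float(np.mean((shift(moving, params, order=1) - fixed) ** 2))

rng = np.random.default_rng(0)
fixed = rng.random((32, 32))
# The moving image is the fixed image shifted by (2, -3),
# so the aligning translation is approximately (-2, 3).
moving = shift(fixed, (2.0, -3.0), order=1)

# Hypothetical network output used as the starting point; the optimizer then
# refines it, i.e. "learning for initialization, optimization for precision".
network_guess = np.array([-1.5, 2.5])
result = minimize(mse_after_shift, network_guess, args=(moving, fixed),
                  method="Nelder-Mead")
print("refined translation:", result.x)
```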
+
+
+
+
+ + ☆ Global $\mathcal{L}^2$ minimization with certainty via geometrically + adapted gradient descent in Deep Learning + + +
+ We consider the gradient descent flow widely used for the minimization of the +$\mathcal{L}^2$ cost function in Deep Learning networks, and introduce two +modified versions; one adapted for the overparametrized setting, and the other +for the underparametrized setting. Both have a clear and natural invariant +geometric meaning, taking into account the pullback vector bundle structure in +the overparametrized, and the pushforward vector bundle structure in the +underparametrized setting. In the overparametrized case, we prove that, +provided that a rank condition holds, all orbits of the modified gradient +descent drive the $\mathcal{L}^2$ cost to its global minimum at a uniform +exponential convergence rate. We point out relations of the latter to +sub-Riemannian geometry. + +
+
+ comment: AMS Latex, 12 pages +
+
+
+
+
+ + ♻ ☆ FutureHuman3D: Forecasting Complex Long-Term 3D Human Behavior from + Video Observations + + +
+ We present a generative approach to forecast long-term future human behavior +in 3D, requiring only weak supervision from readily available 2D human action +data. This is a fundamental task enabling many downstream applications. The +required ground-truth data is hard to capture in 3D (mocap suits, expensive +setups) but easy to acquire in 2D (simple RGB cameras). Thus, we design our +method to only require 2D RGB data while being able to generate 3D human motion +sequences. We use a differentiable 2D projection scheme in an autoregressive +manner for weak supervision, and an adversarial loss for 3D regularization. Our +method predicts long and complex behavior sequences (e.g. cooking, assembly) +consisting of multiple sub-actions. We tackle this in a semantically +hierarchical manner, jointly predicting high-level coarse action labels +together with their low-level fine-grained realizations as characteristic 3D +human poses. We observe that these two action representations are coupled in +nature, and joint prediction benefits both action and pose forecasting. Our +experiments demonstrate the complementary nature of joint action and 3D pose +prediction: our joint approach outperforms each task treated individually, +enables robust longer-term sequence prediction, and outperforms alternative +approaches to forecast actions and characteristic 3D poses. + +
+
+ comment: Project Page: https://future-human-3d.christian-diller.de/ Video: + https://www.youtube.com/watch?v=18du85YFXL0 +
+
+
+
+
+ + ♻ ☆ Machine learning-based decentralized TDMA for VLC IoT networks + + +
+ In this paper, a machine learning-based decentralized time division multiple +access (TDMA) algorithm for visible light communication (VLC) Internet of +Things (IoT) networks is proposed. The proposed algorithm is based on +Q-learning, a reinforcement learning algorithm. This paper considers a +decentralized condition in which there is no coordinator node for sending +synchronization frames and assigning transmission time slots to other nodes. +The proposed algorithm uses a decentralized manner for synchronization, and +each node uses the Q-learning algorithm to find the optimal transmission time +slot for sending data without collisions. The proposed algorithm is implemented +on a VLC hardware system, which had been designed and implemented in our +laboratory. Average reward, convergence time, goodput, average delay, and data +packet size are evaluated parameters. The results show that the proposed +algorithm converges quickly and provides collision-free decentralized TDMA for +the network. The proposed algorithm is compared with carrier-sense multiple +access with collision avoidance (CSMA/CA) algorithm as a potential selection +for decentralized VLC IoT networks. The results show that the proposed +algorithm provides up to 61% more goodput and up to 49% less average delay than +CSMA/CA. + +
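A simplified, stateless (bandit-style) Q-learning sketch of decentralized slot selection: each node keeps its own Q-table over slots and is rewarded for collision-free transmissions. The slot counts, rates, and reward values are illustrative, not the paper's protocol parameters.

```python
import numpy as np

rng = np.random.default_rng(0)
n_slots, n_nodes = 8, 3
Q = np.zeros((n_nodes, n_slots))      # one Q-table per node, one action per slot
alpha, eps = 0.1, 0.1                 # learning rate and exploration probability

for episode in range(2000):
    # Each node picks a transmission slot epsilon-greedily, without coordination.
    choices = [
        int(rng.integers(n_slots)) if rng.random() < eps else int(np.argmax(Q[i]))
        for i in range(n_nodes)
    ]
    for i, slot in enumerate(choices):
        collided = choices.count(slot) > 1
        reward = -1.0 if collided else 1.0          # collision-free slots rewarded
        Q[i, slot] += alpha * (reward - Q[i, slot]) # stateless Q-update

# With enough episodes the nodes typically settle on distinct, collision-free slots.
print("learned slots:", [int(np.argmax(Q[i])) for i in range(n_nodes)])
```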
+
+ comment: This work has been submitted to a journal for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ♻ ☆ Self-Guided Diffusion Models CVPR 2023 + + +
+ Diffusion models have demonstrated remarkable progress in image generation +quality, especially when guidance is used to control the generative process. +However, guidance requires a large amount of image-annotation pairs for +training and is thus dependent on their availability, correctness and +unbiasedness. In this paper, we eliminate the need for such annotation by +instead leveraging the flexibility of self-supervision signals to design a +framework for self-guided diffusion models. By leveraging a feature extraction +function and a self-annotation function, our method provides guidance signals +at various image granularities: from the level of holistic images to object +boxes and even segmentation masks. Our experiments on single-label and +multi-label image datasets demonstrate that self-labeled guidance always +outperforms diffusion models without guidance and may even surpass guidance +based on ground-truth labels, especially on unbalanced data. When equipped with +self-supervised box or mask proposals, our method further generates visually +diverse yet semantically consistent images, without the need for any class, +box, or segment label annotation. Self-guided diffusion is simple, flexible and +expected to profit from deployment at scale. Source code will be at: +https://taohu.me/sgdm/ + +
+
+ comment: CVPR 2023 +
+
+
+
+
+ + ♻ ☆ A deep reinforcement learning model for predictive maintenance planning + of road assets: Integrating LCA and LCCA + + +
+ Road maintenance planning is an integral part of road asset management. One +of the main challenges in Maintenance and Rehabilitation (M&R) practices is to +determine maintenance type and timing. This research proposes a framework using +Reinforcement Learning (RL) based on the Long Term Pavement Performance (LTPP) +database to determine the type and timing of M&R practices. A predictive DNN +model is first developed in the proposed algorithm, which serves as the +Environment for the RL algorithm. For the Policy estimation of the RL model, +both DQN and PPO models are developed. However, PPO has been selected in the +end due to better convergence and higher sample efficiency. Indicators used in +this study are International Roughness Index (IRI) and Rutting Depth (RD). +Initially, we considered Cracking Metric (CM) as the third indicator, but it +was then excluded due to the much fewer data compared to other indicators, +which resulted in lower accuracy of the results. Furthermore, in +cost-effectiveness calculation (reward), we considered both the economic and +environmental impacts of M&R treatments. Costs and environmental impacts have +been evaluated with paLATE 2.0 software. Our method is tested on a hypothetical +case study of a six-lane highway with 23 kilometers length located in Texas, +which has a warm and wet climate. The results propose a 20-year M&R plan in +which road condition remains in an excellent condition range. Because the early +state of the road is at a good level of service, there is no need for heavy +maintenance practices in the first years. Later, after heavy M&R actions, there +are several 1-2 years of no need for treatments. All of these show that the +proposed plan has a logical result. Decision-makers and transportation agencies +can use this scheme to conduct better maintenance practices that can prevent +budget waste and, at the same time, minimize the environmental impacts. + +
+
+
+
+
+ + ♻ ☆ Online Estimation and Optimization of Utility-Based Shortfall Risk + + +
+ Utility-Based Shortfall Risk (UBSR) is a risk metric that is increasingly +popular in financial applications, owing to certain desirable properties that +it enjoys. We consider the problem of estimating UBSR in a recursive setting, +where samples from the underlying loss distribution are available +one-at-a-time. We cast the UBSR estimation problem as a root finding problem, +and propose stochastic approximation-based estimations schemes. We derive +non-asymptotic bounds on the estimation error in the number of samples. We also +consider the problem of UBSR optimization within a parameterized class of +random variables. We propose a stochastic gradient descent based algorithm for +UBSR optimization, and derive non-asymptotic bounds on its convergence. + +
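One common convention defines UBSR as the smallest threshold t with E[l(L - t)] <= lambda for an increasing loss function l; the recursive estimator can then be sketched as a Robbins-Monro root-finding iteration on one-at-a-time samples, as below. The loss distribution, parameters, and the clipped step (a practical stabilizer) are illustrative choices, not the paper's exact scheme.

```python
import numpy as np

rng = np.random.default_rng(0)

def l(x):
    """Exponential utility-based loss, used here purely as an example."""
    return np.exp(x)

lambda_ = 1.0

# Robbins-Monro recursion on one-at-a-time loss samples L_k ~ N(1, 2^2); for
# this choice E[l(L - t)] = exp(3 - t), so the exact root is t = 3.
t = 0.0
for k in range(1, 20001):
    loss_sample = rng.normal(1.0, 2.0)
    a_k = 1.0 / k                                  # diminishing step size
    step = a_k * (l(loss_sample - t) - lambda_)
    t += float(np.clip(step, -1.0, 1.0))           # clipped step for stability

print("UBSR estimate:", t)                          # should settle near 3
```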
+
+
+
+
+ + ♻ ☆ DeepTSF: Codeless machine learning operations for time series + forecasting + + +
+ This paper presents DeepTSF, a comprehensive machine learning operations +(MLOps) framework aiming to innovate time series forecasting through workflow +automation and codeless modeling. DeepTSF automates key aspects of the ML +lifecycle, making it an ideal tool for data scientists and MLops engineers +engaged in machine learning (ML) and deep learning (DL)-based forecasting. +DeepTSF empowers users with a robust and user-friendly solution, while it is +designed to seamlessly integrate with existing data analysis workflows, +providing enhanced productivity and compatibility. The framework offers a +front-end user interface (UI) suitable for data scientists, as well as other +higher-level stakeholders, enabling comprehensive understanding through +insightful visualizations and evaluation metrics. DeepTSF also prioritizes +security through identity management and access authorization mechanisms. The +application of DeepTSF in real-life use cases of the I-NERGY project has +already proven DeepTSF's efficacy in DL-based load forecasting, showcasing its +significant added value in the electrical power and energy systems domain. + +
+
+
+
+
+ + ♻ ☆ ManiCast: Collaborative Manipulation with Cost-Aware Human Forecasting + + +
+ Seamless human-robot manipulation in close proximity relies on accurate +forecasts of human motion. While there has been significant progress in +learning forecast models at scale, when applied to manipulation tasks, these +models accrue high errors at critical transition points leading to degradation +in downstream planning performance. Our key insight is that instead of +predicting the most likely human motion, it is sufficient to produce forecasts +that capture how future human motion would affect the cost of a robot's plan. +We present ManiCast, a novel framework that learns cost-aware human forecasts +and feeds them to a model predictive control planner to execute collaborative +manipulation tasks. Our framework enables fluid, real-time interactions between +a human and a 7-DoF robot arm across a number of real-world tasks such as +reactive stirring, object handovers, and collaborative table setting. We +evaluate both the motion forecasts and the end-to-end forecaster-planner system +against a range of learned and heuristic baselines while additionally +contributing new datasets. We release our code and datasets at +https://portal-cornell.github.io/manicast/. + +
+
+ comment: CoRL 2023 +
+
+
+
+
+ + ♻ ☆ Low-degree learning and the metric entropy of polynomials + + +
+ Let $\mathscr{F}_{n,d}$ be the class of all functions $f:\{-1,1\}^n\to[-1,1]$ +on the $n$-dimensional discrete hypercube of degree at most $d$. In the first +part of this paper, we prove that any (deterministic or randomized) algorithm +which learns $\mathscr{F}_{n,d}$ with $L_2$-accuracy $\varepsilon$ requires at +least $\Omega((1-\sqrt{\varepsilon})2^d\log n)$ queries for large enough $n$, +thus establishing the sharpness as $n\to\infty$ of a recent upper bound of +Eskenazis and Ivanisvili (2021). To do this, we show that the $L_2$-packing +numbers $\mathsf{M}(\mathscr{F}_{n,d},\|\cdot\|_{L_2},\varepsilon)$ of the +concept class $\mathscr{F}_{n,d}$ satisfy the two-sided estimate +$$c(1-\varepsilon)2^d\log n \leq \log +\mathsf{M}(\mathscr{F}_{n,d},\|\cdot\|_{L_2},\varepsilon) \leq \frac{2^{Cd}\log +n}{\varepsilon^4}$$ for large enough $n$, where $c, C>0$ are universal +constants. In the second part of the paper, we present a logarithmic upper +bound for the randomized query complexity of classes of bounded approximate +polynomials whose Fourier spectra are concentrated on few subsets. As an +application, we prove new estimates for the number of random queries required +to learn approximate juntas of a given degree, functions with rapidly decaying +Fourier tails and constant depth circuits of given size. Finally, we obtain +bounds for the number of queries required to learn the polynomial class +$\mathscr{F}_{n,d}$ without error in the query and random example models. + +
+
+
+
+
+ + ♻ ☆ Deep Calibration of Market Simulations using Neural Density Estimators + and Embedding Networks + + +
+ The ability to construct a realistic simulator of financial exchanges, +including reproducing the dynamics of the limit order book, can give insight +into many counterfactual scenarios, such as a flash crash, a margin call, or +changes in macroeconomic outlook. In recent years, agent-based models have been +developed that reproduce many features of an exchange, as summarised by a set +of stylised facts and statistics. However, the ability to calibrate simulators +to a specific period of trading remains an open challenge. In this work, we +develop a novel approach to the calibration of market simulators by leveraging +recent advances in deep learning, specifically using neural density estimators +and embedding networks. We demonstrate that our approach is able to correctly +identify high probability parameter sets, both when applied to synthetic and +historical data, and without reliance on manually selected or weighted +ensembles of stylised facts. + +
+
+ comment: 4th ACM International Conference on AI in Finance (ICAIF 2023) +
+
+
+
+
+ + ♻ ☆ Optimal Approximation Rates for Deep ReLU Neural Networks on Sobolev and + Besov Spaces + + +
+ Let $\Omega = [0,1]^d$ be the unit cube in $\mathbb{R}^d$. We study the +problem of how efficiently, in terms of the number of parameters, deep neural +networks with the ReLU activation function can approximate functions in the +Sobolev spaces $W^s(L_q(\Omega))$ and Besov spaces $B^s_r(L_q(\Omega))$, with +error measured in the $L_p(\Omega)$ norm. This problem is important when +studying the application of neural networks in a variety of fields, including +scientific computing and signal processing, and has previously been solved only +when $p=q=\infty$. Our contribution is to provide a complete solution for all +$1\leq p,q\leq \infty$ and $s > 0$ for which the corresponding Sobolev or Besov +space compactly embeds into $L_p$. The key technical tool is a novel +bit-extraction technique which gives an optimal encoding of sparse vectors. +This enables us to obtain sharp upper bounds in the non-linear regime where $p +> q$. We also provide a novel method for deriving $L_p$-approximation lower +bounds based upon VC-dimension when $p < \infty$. Our results show that very +deep ReLU networks significantly outperform classical methods of approximation +in terms of the number of parameters, but that this comes at the cost of +parameters which are not encodable. + +
+
+
+
+
+ + ♻ ☆ What If the TV Was Off? Examining Counterfactual Reasoning Abilities of + Multi-modal Language Models + + +
+ Counterfactual reasoning, a fundamental aspect of human cognition, involves +contemplating alternatives to established facts or past events, significantly +enhancing our abilities in planning and decision-making. In light of the +advancements in current multi-modal large language models, we explore their +effectiveness in counterfactual reasoning. To facilitate this investigation, we +introduce a novel dataset, C-VQA, specifically designed to test the +counterfactual reasoning capabilities of modern multi-modal large language +models. This dataset is constructed by infusing original questions with +counterfactual presuppositions, spanning various types such as numerical and +boolean queries. It encompasses a mix of real and synthetic data, representing +a wide range of difficulty levels. Our thorough evaluations of contemporary +vision-language models using this dataset have revealed substantial performance +drops, with some models showing up to a 40\% decrease, highlighting a +significant gap between current models and human-like vision reasoning +capabilities. We hope our dataset will serve as a vital benchmark for +evaluating the counterfactual reasoning capabilities of models. Code and +dataset are publicly available at https://bzhao.me/C-VQA/. + +
+
+
+
+
+ + ♻ ☆ On the Effectiveness of Log Representation for Log-based Anomaly + Detection + + +
+ Logs are an essential source of information for people to understand the +running status of a software system. Due to the evolving modern software +architecture and maintenance methods, more research efforts have been devoted +to automated log analysis. In particular, machine learning (ML) has been widely +used in log analysis tasks. In ML-based log analysis tasks, converting textual +log data into numerical feature vectors is a critical and indispensable step. +However, the impact of using different log representation techniques on the +performance of the downstream models is not clear, which limits researchers and +practitioners' opportunities of choosing the optimal log representation +techniques in their automated log analysis workflows. Therefore, this work +investigates and compares the commonly adopted log representation techniques +from previous log analysis research. Particularly, we select six log +representation techniques and evaluate them with seven ML models and four +public log datasets (i.e., HDFS, BGL, Spirit and Thunderbird) in the context of +log-based anomaly detection. We also examine the impacts of the log parsing +process and the different feature aggregation approaches when they are employed +with log representation techniques. From the experiments, we provide some +heuristic guidelines for future researchers and developers to follow when +designing an automated log analysis workflow. We believe our comprehensive +comparison of log representation techniques can help researchers and +practitioners better understand the characteristics of different log +representation techniques and provide them with guidance for selecting the most +suitable ones for their ML-based log analysis workflow. + +
+
+ comment: Accepted by Journal of Empirical Software Engineering (EMSE) +
+
+
+
+
+ + ♻ ☆ Machine learning and Topological data analysis identify unique features + of human papillae in 3D scans + + +
+ The tongue surface houses a range of papillae that are integral to the +mechanics and chemistry of taste and textural sensation. Although the gustatory +function of papillae is well investigated, the uniqueness of papillae within +and across individuals remains elusive. Here, we present the first machine +learning framework on 3D microscopic scans of human papillae (n = 2092), +uncovering the uniqueness of geometric and topological features of papillae. +The finer differences in shapes of papillae are investigated computationally +based on a number of features derived from discrete differential geometry and +computational topology. Interpretable machine learning techniques show that +persistent homology features of the papillae shape are the most effective in +predicting the biological variables. Models trained on these features with +small volumes of data samples predict the type of papillae with an accuracy of +85%. The papillae type classification models can map the spatial arrangement of +filiform and fungiform papillae on a surface. Remarkably, the papillae are +found to be distinctive across individuals, and an individual can be identified +with an accuracy of 48% among the 15 participants from a single papilla. +Collectively, this is unprecedented evidence demonstrating that +tongue papillae can serve as a unique identifier, inspiring new research +directions for food preferences and oral diagnostics. + +
+
+
+
+
+ + ♻ ☆ AST: Effective Dataset Distillation through Alignment with Smooth and + High-Quality Expert Trajectories + + +
+ Training large AI models typically requires large-scale datasets in the +machine learning process, making training and parameter-tuning process both +time-consuming and costly. Some researchers address this problem by carefully +synthesizing a very small number of highly representative and informative +samples from real-world datasets. This approach, known as Dataset Distillation +(DD), proposes a perspective for data-efficient learning. Despite recent +progress in this field, the performance of existing methods still cannot meet +expectations, and distilled datasets cannot effectively replace original +datasets. In this paper, unlike previous methods that focus solely on improving +the effectiveness of student distillation, we recognize and leverage the +important mutual influence between expert and student models. We observed that +the smoothness of expert trajectories has a significant impact on subsequent +student parameter alignment. Based on this, we propose an effective DD +framework named AST, standing for Alignment with Smooth and high-quality expert +Trajectories. We devise the integration of clipping loss and gradient penalty +to regulate the rate of parameter changes in expert trajectory generation. To +further refine the student parameter alignment with expert trajectory, we put +forward representative initialization for the synthetic dataset and balanced +inner-loop loss in response to the sensitivity exhibited towards randomly +initialized variables during distillation. We also propose two enhancement +strategies, namely intermediate matching loss and weight perturbation, to +mitigate the potential occurrence of cumulative errors. We conduct extensive +experiments on datasets of different scales, sizes, and resolutions. The +results demonstrate that the proposed method significantly outperforms prior +methods. + +
+
+
+
+
+ + ♻ ☆ Understanding plasticity in neural networks ICML 2023 + + +
+ Plasticity, the ability of a neural network to quickly change its predictions +in response to new information, is essential for the adaptability and +robustness of deep reinforcement learning systems. Deep neural networks are +known to lose plasticity over the course of training even in relatively simple +learning problems, but the mechanisms driving this phenomenon are still poorly +understood. This paper conducts a systematic empirical analysis into plasticity +loss, with the goal of understanding the phenomenon mechanistically in order to +guide the future development of targeted solutions. We find that loss of +plasticity is deeply connected to changes in the curvature of the loss +landscape, but that it often occurs in the absence of saturated units. Based on +this insight, we identify a number of parameterization and optimization design +choices which enable networks to better preserve plasticity over the course of +training. We validate the utility of these findings on larger-scale RL +benchmarks in the Arcade Learning Environment. + +
+
+ comment: Accepted to ICML 2023 (oral presentation) +
+
+
+
+
+ + ♻ ☆ From Isolated Islands to Pangea: Unifying Semantic Space for Human + Action Understanding + + +
+ Action understanding is a vital step toward intelligent agents and has +attracted long-term attention. It can be formulated +as the mapping from the action physical space to the semantic space. Typically, +researchers built action datasets according to idiosyncratic choices to define +classes and push the envelope of benchmarks respectively. Thus, datasets are +incompatible with each other like "Isolated Islands" due to semantic gaps and +various class granularities, e.g., do housework in dataset A and wash plate in +dataset B. We argue that a more principled semantic space is urgently needed to +concentrate the community efforts and enable us to use all datasets together to +pursue generalizable action learning. To this end, we design a structured +action semantic space in view of the verb taxonomy hierarchy and covering massive +actions. By aligning the classes of previous datasets to our semantic space, we +gather (image/video/skeleton/MoCap) datasets into a unified database in a +unified label system, i.e., bridging ``isolated islands'' into a "Pangea". +Accordingly, we propose a novel model mapping from the physical space to the +semantic space to fully use Pangea. In extensive experiments, our new system +shows significant superiority, especially in transfer learning. Code and data +will be made publicly available. + +
+
+ comment: Project Webpage: https://mvig-rhos.com/pangea +
+
+
+
+
+ + ♻ ☆ Bayesian Flow Networks + + +
+ This paper introduces Bayesian Flow Networks (BFNs), a new class of +generative model in which the parameters of a set of independent distributions +are modified with Bayesian inference in the light of noisy data samples, then +passed as input to a neural network that outputs a second, interdependent +distribution. Starting from a simple prior and iteratively updating the two +distributions yields a generative procedure similar to the reverse process of +diffusion models; however it is conceptually simpler in that no forward process +is required. Discrete and continuous-time loss functions are derived for +continuous, discretised and discrete data, along with sample generation +procedures. Notably, the network inputs for discrete data lie on the +probability simplex, and are therefore natively differentiable, paving the way +for gradient-based sample guidance and few-step generation in discrete domains +such as language modelling. The loss function directly optimises data +compression and places no restrictions on the network architecture. In our +experiments BFNs achieve competitive log-likelihoods for image modelling on +dynamically binarized MNIST and CIFAR-10, and outperform all known discrete +diffusion models on the text8 character-level language modelling task. + +
+
+
+
+
+ + ♻ ☆ Asymptotic Bounds for Smoothness Parameter Estimates in Gaussian Process + Interpolation + + +
+ It is common to model a deterministic response function, such as the output +of a computer experiment, as a Gaussian process with a Mat\'ern covariance +kernel. The smoothness parameter of a Mat\'ern kernel determines many important +properties of the model in the large data limit, including the rate of +convergence of the conditional mean to the response function. We prove that the +maximum likelihood estimate of the smoothness parameter cannot asymptotically +undersmooth the truth when the data are obtained on a fixed bounded subset of +$\mathbb{R}^d$. That is, if the data-generating response function has Sobolev +smoothness $\nu_0 > d/2$, then the smoothness parameter estimate cannot be +asymptotically less than $\nu_0$. The lower bound is sharp. Additionally, we +show that maximum likelihood estimation recovers the true smoothness for a +class of compactly supported self-similar functions. For cross-validation we +prove an asymptotic lower bound $\nu_0 - d/2$, which however is unlikely to be +sharp. The results are based on approximation theory in Sobolev spaces and some +general theorems that restrict the set of values that the parameter estimators +can take. + +
+
+
+
+
+ + ♻ ☆ Dimensionality Reduction and Wasserstein Stability for Kernel Regression + + +
+ In a high-dimensional regression framework, we study consequences of the +naive two-step procedure where first the dimension of the input variables is +reduced and second, the reduced input variables are used to predict the output +variable with kernel regression. In order to analyze the resulting regression +errors, a novel stability result for kernel regression with respect to the +Wasserstein distance is derived. This allows us to bound errors that occur when +perturbed input data is used to fit the regression function. We apply the +general stability result to principal component analysis (PCA). Exploiting +known estimates from the literature on both principal component analysis and +kernel regression, we deduce convergence rates for the two-step procedure. The +latter turns out to be particularly useful in a semi-supervised setting. + +
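The naive two-step procedure analyzed above is easy to reproduce with standard tooling; the sketch below (synthetic low-rank data, illustrative hyperparameters) chains PCA and kernel ridge regression.

```python
import numpy as np
from sklearn.decomposition import PCA
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

rng = np.random.default_rng(0)

# Inputs are high-dimensional but driven by a 5-dimensional latent factor Z.
n, d, k = 600, 100, 5
Z = rng.normal(size=(n, k))
W = rng.normal(size=(k, d))
X = Z @ W + 0.05 * rng.normal(size=(n, d))
y = np.sin(Z[:, 0]) + 0.5 * Z[:, 1] + 0.1 * rng.normal(size=n)

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

# The naive two-step procedure: PCA first, then kernel regression on the scores.
model = make_pipeline(
    PCA(n_components=k, whiten=True),
    KernelRidge(kernel="rbf", alpha=0.1, gamma=0.1),
)
model.fit(X_tr, y_tr)
print("test R^2:", model.score(X_te, y_te))
```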
+
+ comment: Forthcoming in JMLR +
+
+
+
+
+ + ♻ ☆ The Chosen One: Consistent Characters in Text-to-Image Diffusion Models + + +
+ Recent advances in text-to-image generation models have unlocked vast +potential for visual creativity. However, these models struggle with generation +of consistent characters, a crucial aspect for numerous real-world applications +such as story visualization, game development asset design, advertising, and +more. Current methods typically rely on multiple pre-existing images of the +target character or involve labor-intensive manual processes. In this work, we +propose a fully automated solution for consistent character generation, with +the sole input being a text prompt. We introduce an iterative procedure that, +at each stage, identifies a coherent set of images sharing a similar identity +and extracts a more consistent identity from this set. Our quantitative +analysis demonstrates that our method strikes a better balance between prompt +alignment and identity consistency compared to the baseline methods, and these +findings are reinforced by a user study. To conclude, we showcase several +practical applications of our approach. Project page is available at +https://omriavrahami.com/the-chosen-one + +
+
+ comment: Project page is available at https://omriavrahami.com/the-chosen-one +
+
+
+
+
+ + ♻ ☆ TorchRL: A data-driven decision-making library for PyTorch + + +
+ PyTorch has ascended as a premier machine learning framework, yet it lacks a +native and comprehensive library for decision and control tasks suitable for +large development teams dealing with complex real-world data and environments. +To address this issue, we propose TorchRL, a generalistic control library for +PyTorch that provides well-integrated, yet standalone components. We introduce +a new and flexible PyTorch primitive, the TensorDict, which facilitates +streamlined algorithm development across the many branches of Reinforcement +Learning (RL) and control. We provide a detailed description of the building +blocks and an extensive overview of the library across domains and tasks. +Finally, we experimentally demonstrate its reliability and flexibility and +present comparative benchmarks of its computational efficiency. TorchRL +fosters long-term support and is publicly available on GitHub for greater +reproducibility and collaboration within the research community. + +
+
+
+
+
+ + ♻ ☆ Energy Discrepancies: A Score-Independent Loss for Energy-Based Models NeurIPS 2023 + + +
+ Energy-based models are a simple yet powerful class of probabilistic models, +but their widespread adoption has been limited by the computational burden of +training them. We propose a novel loss function called Energy Discrepancy (ED) +which does not rely on the computation of scores or expensive Markov chain +Monte Carlo. We show that ED approaches the explicit score matching and +negative log-likelihood loss under different limits, effectively interpolating +between both. Consequently, minimum ED estimation overcomes the problem of +nearsightedness encountered in score-based estimation methods, while also +enjoying theoretical guarantees. Through numerical experiments, we demonstrate +that ED learns low-dimensional data distributions faster and more accurately +than explicit score matching or contrastive divergence. For high-dimensional +image data, we describe how the manifold hypothesis puts limitations on our +approach and demonstrate the effectiveness of energy discrepancy by training +the energy-based model as a prior of a variational decoder model. + +
+
+ comment: Camera Ready version for the 37th Conference on Neural Information + Processing Systems (NeurIPS 2023). Changes in this revision: Appendix A1: + Corrected proof of Theorem 1. Appendix D3: Added definition and numerical + experiments for energy discrepancy on binary discrete spaces. Minor changes + in the main text and correction of typos. Added new references +
+
+
+
+
+ + ♻ ☆ Technical Report: Large Language Models can Strategically Deceive their + Users when Put Under Pressure + + +
+ We demonstrate a situation in which Large Language Models, trained to be +helpful, harmless, and honest, can display misaligned behavior and +strategically deceive their users about this behavior without being instructed +to do so. Concretely, we deploy GPT-4 as an agent in a realistic, simulated +environment, where it assumes the role of an autonomous stock trading agent. +Within this environment, the model obtains an insider tip about a lucrative +stock trade and acts upon it despite knowing that insider trading is +disapproved of by company management. When reporting to its manager, the model +consistently hides the genuine reasons behind its trading decision. We perform +a brief investigation of how this behavior varies under changes to the setting, +such as removing model access to a reasoning scratchpad, attempting to prevent +the misaligned behavior by changing system instructions, changing the amount of +pressure the model is under, varying the perceived risk of getting caught, and +making other simple changes to the environment. To our knowledge, this is the +first demonstration of Large Language Models trained to be helpful, harmless, +and honest, strategically deceiving their users in a realistic situation +without direct instructions or training for deception. + +
+
+
+
+
+ + ♻ ☆ Assessing Deep Neural Networks as Probability Estimators + + +
+ Deep Neural Networks (DNNs) have performed admirably in classification tasks. +However, the characterization of their classification uncertainties, required +for certain applications, has been lacking. In this work, we investigate the +issue by assessing DNNs' ability to estimate conditional probabilities and +propose a framework for systematic uncertainty characterization. Denoting the +input sample as x and the category as y, the classification task of assigning a +category y to a given input x can be reduced to the task of estimating the +conditional probabilities p(y|x), as approximated by the DNN at its last layer +using the softmax function. Since softmax yields a vector whose elements all +fall in the interval (0, 1) and sum to 1, it suggests a probabilistic +interpretation of the DNN's outcome. Using synthetic and real-world datasets, +we look into the impact of various factors, e.g., probability density f(x) and +inter-categorical sparsity, on the precision of DNNs' estimations of p(y|x), +and find that the likelihood probability density and the inter-categorical +sparsity have greater impacts on DNNs' classification uncertainty than the +prior probability. + +
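The paper's question can be illustrated on synthetic data where p(y|x) is known exactly: train a small softmax-output network and compare its predicted probabilities with the ground truth. The generating model and network size below are arbitrary choices, not the paper's setup.

```python
import numpy as np
from sklearn.neural_network import MLPClassifier

rng = np.random.default_rng(0)

# Synthetic 1-D problem with a known conditional probability p(y=1 | x).
n = 5000
x = rng.uniform(-3, 3, size=(n, 1))
p_true = 1.0 / (1.0 + np.exp(-2.0 * x[:, 0]))     # ground-truth p(y=1|x)
y = (rng.random(n) < p_true).astype(int)

net = MLPClassifier(hidden_layer_sizes=(32,), max_iter=2000, random_state=0)
net.fit(x, y)

# Compare the network's softmax-style output with the known conditional probability.
x_grid = np.linspace(-3, 3, 7).reshape(-1, 1)
p_hat = net.predict_proba(x_grid)[:, 1]
p_ref = 1.0 / (1.0 + np.exp(-2.0 * x_grid[:, 0]))
for xi, ph, pr in zip(x_grid[:, 0], p_hat, p_ref):
    print(f"x={xi:+.1f}  p_hat={ph:.2f}  p_true={pr:.2f}")
```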
+
+ comment: Y. Pan, K. Kuo, M. Rilee and H. Yu, "Assessing Deep Neural Networks + as Probability Estimators," in 2021 IEEE International Conference on Big Data + (Big Data), Orlando, FL, USA, 2021 pp. 1083-1091. doi: + 10.1109/BigData52589.2021.9671328 +
+
+
+
+
+ + ♻ ☆ CALICO: Self-Supervised Camera-LiDAR Contrastive Pre-training for BEV + Perception + + +
+ Perception is crucial in the realm of autonomous driving systems, where +bird's eye view (BEV)-based architectures have recently reached +state-of-the-art performance. The desirability of self-supervised +representation learning stems from the expensive and laborious process of +annotating 2D and 3D data. Although previous research has investigated +pretraining methods for both LiDAR and camera-based 3D object detection, a +unified pretraining framework for multimodal BEV perception is missing. In this +study, we introduce CALICO, a novel framework that applies contrastive +objectives to both LiDAR and camera backbones. Specifically, CALICO +incorporates two stages: point-region contrast (PRC) and region-aware +distillation (RAD). PRC better balances the region- and scene-level +representation learning on the LiDAR modality and offers significant +performance improvement compared to existing methods. RAD effectively achieves +contrastive distillation on our self-trained teacher model. CALICO's efficacy +is substantiated by extensive evaluations on 3D object detection and BEV map +segmentation tasks, where it delivers significant performance improvements. +Notably, CALICO outperforms the baseline method by 10.5% and 8.6% on NDS and +mAP. Moreover, CALICO boosts the robustness of multimodal 3D object detection +against adversarial attacks and corruption. Additionally, our framework can be +tailored to different backbones and heads, positioning it as a promising +approach for multimodal BEV perception. + +
+
+
+
+
+ + ♻ ☆ RCT Rejection Sampling for Causal Estimation Evaluation + + +
+ Confounding is a significant obstacle to unbiased estimation of causal +effects from observational data. For settings with high-dimensional covariates +-- such as text data, genomics, or the behavioral social sciences -- +researchers have proposed methods to adjust for confounding by adapting machine +learning methods to the goal of causal estimation. However, empirical +evaluation of these adjustment methods has been challenging and limited. In +this work, we build on a promising empirical evaluation strategy that +simplifies evaluation design and uses real data: subsampling randomized +controlled trials (RCTs) to create confounded observational datasets while +using the average causal effects from the RCTs as ground-truth. We contribute a +new sampling algorithm, which we call RCT rejection sampling, and provide +theoretical guarantees that causal identification holds in the observational +data to allow for valid comparisons to the ground-truth RCT. Using synthetic +data, we show our algorithm indeed results in low bias when oracle estimators +are evaluated on the confounded samples, which is not always the case for a +previously proposed algorithm. In addition to this identification result, we +highlight several finite data considerations for evaluation designers who plan +to use RCT rejection sampling on their own datasets. As a proof of concept, we +implement an example evaluation pipeline and walk through these finite data +considerations with a novel, real-world RCT -- which we release publicly -- +consisting of approximately 70k observations and text data as high-dimensional +covariates. Together, these contributions build towards a broader agenda of +improved empirical evaluation for causal estimation. + +
+
+ comment: Code and data at https://github.com/kakeith/rct_rejection_sampling +
+
+
+
+
+ + ♻ ☆ Long-Range Neural Atom Learning for Molecular Graphs + + +
+ Graph Neural Networks (GNNs) have been widely adopted for drug discovery with +molecular graphs. Nevertheless, current GNNs are mainly good at leveraging +short-range interactions (SRI) but struggle to capture long-range interactions +(LRI), both of which are crucial for determining molecular properties. To +tackle this issue, we propose a method that implicitly projects all original +atoms into a few Neural Atoms, which abstracts the collective information of +atomic groups within a molecule. Specifically, we explicitly exchange the +information among neural atoms and project them back to the atoms' +representations as an enhancement. With this mechanism, neural atoms establish +the communication channels among distant nodes, effectively reducing the +interaction scope of arbitrary node pairs into a single hop. To provide an +inspection of our method from a physical perspective, we reveal its connection +with the traditional LRI calculation method, Ewald Summation. We conduct +extensive experiments on three long-range graph benchmarks, covering both +graph-level and link-level tasks on molecular graphs. We empirically justify +that our method can be equipped with an arbitrary GNN and help to capture LRI. + +
+
+
+
+
+ + ♻ ☆ AdaptGuard: Defending Against Universal Attacks for Model Adaptation ICCV2023 + + +
+ Model adaptation aims at solving the domain transfer problem under the +constraint of only accessing the pretrained source models. With the increasing +considerations of data privacy and transmission efficiency, this paradigm has +been gaining recent popularity. This paper studies the vulnerability to +universal attacks transferred from the source domain during model adaptation +algorithms due to the existence of malicious providers. We explore both +universal adversarial perturbations and backdoor attacks as loopholes on the +source side and discover that they still survive in the target models after +adaptation. To address this issue, we propose a model preprocessing framework, +named AdaptGuard, to improve the security of model adaptation algorithms. +AdaptGuard avoids direct use of the risky source parameters through knowledge +distillation and utilizes the pseudo adversarial samples under adjusted radius +to enhance the robustness. AdaptGuard is a plug-and-play module that requires +neither robust pretrained models nor any changes for the following model +adaptation algorithms. Extensive results on three commonly used datasets and +two popular adaptation methods validate that AdaptGuard can effectively defend +against universal attacks and maintain clean accuracy in the target domain +simultaneously. We hope this research will shed light on the safety and +robustness of transfer learning. Code is available at +https://github.com/TomSheng21/AdaptGuard. + +
+
+ comment: ICCV2023 +
+
+
+
+
+ + ♻ ☆ The Lipschitz-Variance-Margin Tradeoff for Enhanced Randomized Smoothing + + +
+ Real-life applications of deep neural networks are hindered by their unsteady
+predictions when faced with noisy inputs and adversarial attacks. The certified
+radius is, in this context, a crucial indicator of the robustness of models.
+However, how can one design an efficient classifier with a sufficiently large
+certified radius? Randomized smoothing provides a promising framework by
+relying on noise injection in inputs to obtain a smoothed and more robust
+classifier. In this paper, we first show that the variance introduced by
+randomized smoothing closely interacts with two other important properties of
+the classifier, \textit{i.e.} its Lipschitz constant and margin. More
+precisely, our work emphasizes the dual impact of the Lipschitz constant of the
+base classifier on both the smoothed classifier and the empirical variance.
+Moreover, to increase the certified robust radius, we introduce a different
+simplex projection technique for the base classifier to leverage the
+variance-margin trade-off thanks to Bernstein's concentration inequality, along
+with an enhanced Lipschitz bound. Experimental results show a significant
+improvement in certified accuracy compared to current state-of-the-art methods.
+Our novel certification procedure allows us to use pre-trained models with
+randomized smoothing, effectively improving the current certification radius in
+a zero-shot manner.
+
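+ For readers unfamiliar with the randomized-smoothing setup this abstract
+builds on, the sketch below (illustrative background only, not the paper's
+certification procedure; base_classifier is a toy stand-in) shows the standard
+Monte Carlo prediction of a smoothed classifier obtained by injecting Gaussian
+noise into the input:
+
+ import numpy as np
+
+ def smoothed_predict(base_classifier, x, sigma=0.25, n_samples=100, num_classes=10, seed=0):
+     # Vote over Gaussian-perturbed copies of x; the smoothed classifier
+     # returns the class the base classifier predicts most often.
+     rng = np.random.default_rng(seed)
+     counts = np.zeros(num_classes, dtype=int)
+     for _ in range(n_samples):
+         noisy = x + sigma * rng.standard_normal(x.shape)
+         counts[base_classifier(noisy)] += 1
+     return int(np.argmax(counts)), counts
+
+ # Toy stand-in for a base classifier (assumption, for demonstration only).
+ def base_classifier(x):
+     return int(np.argmax(x[:10]))
+
+ x = np.zeros(32); x[3] = 1.0
+ label, votes = smoothed_predict(base_classifier, x)
+ print(label, votes)
+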
+
+
+
+
+ + ♻ ☆ Emerging Trends in Federated Learning: From Model Fusion to Federated X + Learning + + +
+ Federated learning is a new learning paradigm that decouples data collection +and model training via multi-party computation and model aggregation. As a +flexible learning setting, federated learning has the potential to integrate +with other learning frameworks. We conduct a focused survey of federated +learning in conjunction with other learning algorithms. Specifically, we +explore various learning algorithms to improve the vanilla federated averaging +algorithm and review model fusion methods such as adaptive aggregation, +regularization, clustered methods, and Bayesian methods. Following the emerging +trends, we also discuss federated learning in the intersection with other +learning paradigms, termed federated X learning, where X includes multitask +learning, meta-learning, transfer learning, unsupervised learning, and +reinforcement learning. This survey reviews the state of the art, challenges, +and future directions. + +
+
+
+
+
+ + ♻ ☆ Label Differential Privacy via Aggregation + + +
+ In many real-world applications, due to recent developments in the privacy
+landscape, training data may be aggregated to preserve the privacy of sensitive
+training labels. In the learning from label proportions (LLP) framework, the
+dataset is partitioned into bags of feature-vectors which are available only
+with the sum of the labels per bag. A further restriction, which we call
+learning from bag aggregates (LBA), is where instead of individual
+feature-vectors, only the (possibly weighted) sum of the feature-vectors per
+bag is available. We study whether such aggregation techniques can provide
+privacy guarantees under the notion of label differential privacy (label-DP)
+previously studied in, e.g., [Chaudhuri-Hsu'11, Ghazi et al.'21, Esfandiari
+et al.'22].
+ It is easily seen that naive LBA and LLP do not provide label-DP. Our main
+result, however, shows that weighted LBA using iid Gaussian weights with $m$
+randomly sampled disjoint $k$-sized bags is in fact $(\varepsilon,
+\delta)$-label-DP for any $\varepsilon > 0$ with $\delta \approx
+\exp(-\Omega(\sqrt{k}))$ assuming a lower bound on the linear-mse regression
+loss. Further, the $\ell_2^2$-regressor which minimizes the loss on the
+aggregated dataset has a loss within a $\left(1 + o(1)\right)$-factor of the
+optimum on the original dataset w.p. $\approx 1 - \exp(-\Omega(m))$. We
+emphasize that no additive label noise is required.
+ The analogous weighted-LLP, however, does not admit label-DP. Nevertheless, we
+show that if additive $N(0, 1)$ noise can be added to any constant fraction of
+the instance labels, then the noisy weighted-LLP admits similar label-DP
+guarantees without assumptions on the dataset, while preserving the utility of
+Lipschitz-bounded neural mse-regression tasks.
+ Our work is the first to demonstrate that label-DP can be achieved by
+randomly weighted aggregation for regression tasks, using no or little additive
+noise.
+
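+ To make the weighted-LBA construction concrete, here is a small illustrative
+sketch (our reading of the abstract, not the authors' code): the dataset is
+partitioned into random disjoint bags of size k, and only iid-Gaussian-weighted
+sums of the feature vectors and labels per bag are released.
+
+ import numpy as np
+
+ def weighted_lba(X, y, k, seed=0):
+     # Randomly partition the n examples into disjoint bags of size k and
+     # release, per bag, the Gaussian-weighted sums of features and labels.
+     rng = np.random.default_rng(seed)
+     n = len(y)
+     idx = rng.permutation(n)[: (n // k) * k].reshape(-1, k)
+     agg_X, agg_y = [], []
+     for bag in idx:
+         w = rng.standard_normal(k)          # iid Gaussian weights
+         agg_X.append(w @ X[bag])            # weighted sum of feature vectors
+         agg_y.append(w @ y[bag])            # weighted sum of labels
+     return np.array(agg_X), np.array(agg_y)
+
+ X = np.random.randn(1000, 5)
+ y = X @ np.array([1.0, -2.0, 0.5, 0.0, 3.0]) + 0.1 * np.random.randn(1000)
+ Xa, ya = weighted_lba(X, y, k=25)
+ print(Xa.shape, ya.shape)   # (40, 5) (40,)
+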
+
+
+
+
+ + ♻ ☆ SLMIA-SR: Speaker-Level Membership Inference Attacks against Speaker + Recognition Systems NDSS + + +
+ Membership inference attacks allow adversaries to determine whether a +particular example was contained in the model's training dataset. While +previous works have confirmed the feasibility of such attacks in various +applications, none has focused on speaker recognition (SR), a promising +voice-based biometric recognition technique. In this work, we propose SLMIA-SR, +the first membership inference attack tailored to SR. In contrast to +conventional example-level attack, our attack features speaker-level membership +inference, i.e., determining if any voices of a given speaker, either the same +as or different from the given inference voices, have been involved in the +training of a model. It is particularly useful and practical since the training +and inference voices are usually distinct, and it is also meaningful +considering the open-set nature of SR, namely, the recognition speakers were +often not present in the training data. We utilize intra-similarity and +inter-dissimilarity, two training objectives of SR, to characterize the +differences between training and non-training speakers and quantify them with +two groups of features driven by carefully-established feature engineering to +mount the attack. To improve the generalizability of our attack, we propose a +novel mixing ratio training strategy to train attack models. To enhance the +attack performance, we introduce voice chunk splitting to cope with the limited +number of inference voices and propose to train attack models dependent on the +number of inference voices. Our attack is versatile and can work in both +white-box and black-box scenarios. Additionally, we propose two novel +techniques to reduce the number of black-box queries while maintaining the +attack performance. Extensive experiments demonstrate the effectiveness of +SLMIA-SR. + +
+
+ comment: In Proceedings of the 31st Network and Distributed System Security + (NDSS) Symposium, 2024 +
+
+
+
+
+ + ♻ ☆ The Map Equation Goes Neural + + +
+ Community detection and graph clustering are essential for unsupervised data +exploration and understanding the high-level organisation of networked systems. +Recently, graph clustering has received attention as a primary task for graph +neural networks. Although hierarchical graph pooling has been shown to improve +performance in graph and node classification tasks, it performs poorly in +identifying meaningful clusters. Community detection has a long history in +network science, but typically relies on optimising objective functions with +custom-tailored search algorithms, not leveraging recent advances in deep +learning, particularly from graph neural networks. In this paper, we narrow +this gap between the deep learning and network science communities. We consider +the map equation, an information-theoretic objective function for unsupervised +community detection. Expressing it in a fully differentiable tensor form that +produces soft cluster assignments, we optimise the map equation with deep +learning through gradient descent. More specifically, the reformulated map +equation is a loss function compatible with any graph neural network +architecture, enabling flexible clustering and graph pooling that clusters both +graph structure and data features in an end-to-end way, automatically finding +an optimum number of clusters without explicit regularisation by following the +minimum description length principle. We evaluate our approach experimentally +using different neural network architectures for unsupervised clustering in +synthetic and real data. Our results show that our approach achieves +competitive performance against baselines, naturally detects overlapping +communities, and avoids over-partitioning sparse graphs. + +
+
+
+
+
+ + ♻ ☆ Aggregating Capacity in FL through Successive Layer Training for + Computationally-Constrained Devices NeurIPS'23 + + +
+ Federated learning (FL) is usually performed on resource-constrained edge
+devices, e.g., with limited memory for the computation. If the required memory
+to train a model exceeds this limit, the device will be excluded from the
+training. This can lead to lower accuracy, as valuable data and computation
+resources are excluded from training, also causing bias and unfairness. The FL
+training process should be adjusted to such constraints. The state-of-the-art
+techniques propose training subsets of the FL model at constrained devices,
+reducing their resource requirements for training. But these techniques largely
+limit the co-adaptation among parameters of the model and are highly
+inefficient, as we show: it is actually better for the system to train a
+smaller (less accurate) model that all the devices can train end-to-end than to
+apply such techniques. We propose a new method that enables successive freezing
+and training of the parameters of the FL model at devices, reducing the
+training's resource requirements at the devices, while still allowing enough
+co-adaptation between parameters. We show through extensive experimental
+evaluation that our technique greatly improves the accuracy of the trained
+model (by 52.4 p.p.) compared with the state of the art, efficiently
+aggregating the computation capacity available on distributed devices.
+
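+ The general idea of successively freezing and training subsets of parameters
+on a constrained device can be illustrated with a generic PyTorch sketch (a
+rough stand-in, not the paper's exact schedule): requires_grad is toggled so
+that at any time only one block of layers receives gradient updates while the
+rest stay frozen.
+
+ import torch, torch.nn as nn
+
+ model = nn.Sequential(nn.Linear(32, 64), nn.ReLU(),
+                       nn.Linear(64, 64), nn.ReLU(),
+                       nn.Linear(64, 10))
+ blocks = [model[0], model[2], model[4]]   # blocks trained one at a time
+
+ def train_only(block_idx):
+     # Freeze everything, then unfreeze a single block to bound peak memory.
+     for p in model.parameters():
+         p.requires_grad = False
+     for p in blocks[block_idx].parameters():
+         p.requires_grad = True
+
+ x, target = torch.randn(8, 32), torch.randint(0, 10, (8,))
+ loss_fn = nn.CrossEntropyLoss()
+ for stage in range(len(blocks)):          # successive training stages
+     train_only(stage)
+     opt = torch.optim.SGD([p for p in model.parameters() if p.requires_grad], lr=0.01)
+     loss = loss_fn(model(x), target)
+     opt.zero_grad(); loss.backward(); opt.step()
+     print(f"stage {stage}: loss {loss.item():.3f}")
+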
+
+ comment: accepted at NeurIPS'23 +
+
+
+
+
+ + ♻ ☆ An efficient likelihood-free Bayesian inference method based on + sequential neural posterior estimation + + +
+ Sequential neural posterior estimation (SNPE) techniques have been recently +proposed for dealing with simulation-based models with intractable likelihoods. +Unlike approximate Bayesian computation, SNPE techniques learn the posterior +from sequential simulation using neural network-based conditional density +estimators by minimizing a specific loss function. The SNPE method proposed by +Lueckmann et al. (2017) used a calibration kernel to boost the sample weights +around the observed data, resulting in a concentrated loss function. However, +the use of calibration kernels may increase the variances of both the empirical +loss and its gradient, making the training inefficient. To improve the +stability of SNPE, this paper proposes to use an adaptive calibration kernel +and several variance reduction techniques. The proposed method greatly speeds +up the process of training, and provides a better approximation of the +posterior than the original SNPE method and some existing competitors as +confirmed by numerical experiments. + +
+
+ comment: 30 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Empirical Study of PEFT techniques for Winter Wheat Segmentation + + +
+ Parameter Efficient Fine Tuning (PEFT) techniques have recently experienced
+significant growth and have been extensively employed to adapt large vision and
+language models to various domains, enabling satisfactory model performance
+with minimal computational needs. Despite these advances, research has yet to
+fully explore potential PEFT applications in real-life scenarios, particularly
+in the critical domains of remote sensing and crop monitoring. The diversity of
+climates across different regions and the need for comprehensive large-scale
+datasets have posed significant obstacles to accurately identifying crop types
+across varying geographic locations and changing growing seasons. This study
+seeks to bridge this gap by comprehensively exploring the feasibility of
+cross-area and cross-year out-of-distribution generalization using the
+State-of-the-Art (SOTA) wheat crop monitoring model. The aim of this work is to
+explore PEFT approaches for crop monitoring. Specifically, we focus on adapting
+the SOTA TSViT model to address winter wheat field segmentation, a critical
+task for crop monitoring and food security. This adaptation process involves
+integrating different PEFT techniques, including BigFit, LoRA, Adaptformer, and
+prompt tuning. Using PEFT techniques, we achieved notable results comparable to
+those achieved using full fine-tuning methods while training a mere 0.7% of the
+parameters of the whole TSViT architecture. The in-house labeled dataset,
+referred to as the Beqaa-Lebanon dataset, comprises high-quality annotated
+polygons for wheat and non-wheat classes with a total surface of 170 km$^2$,
+over five consecutive years. Using Sentinel-2 images, our model achieved an 84%
+F1-score. We intend to publicly release the Lebanese winter wheat dataset, code
+repository, and model weights.
+
+
+
+
+
+ + ♻ ☆ Computational and Storage Efficient Quadratic Neurons for Deep Neural + Networks DATE + + +
+ Deep neural networks (DNNs) have been widely deployed across diverse domains +such as computer vision and natural language processing. However, the +impressive accomplishments of DNNs have been realized alongside extensive +computational demands, thereby impeding their applicability on +resource-constrained devices. To address this challenge, many researchers have +been focusing on basic neuron structures, the fundamental building blocks of +neural networks, to alleviate the computational and storage cost. In this work, +an efficient quadratic neuron architecture distinguished by its enhanced +utilization of second-order computational information is introduced. By virtue +of their better expressivity, DNNs employing the proposed quadratic neurons can +attain similar accuracy with fewer neurons and computational cost. Experimental +results have demonstrated that the proposed quadratic neuron structure exhibits +superior computational and storage efficiency across various tasks when +compared with both linear and non-linear neurons in prior work. + +
+
+ comment: Accepted by Design Automation and Test in Europe (DATE) 2024 +
+
+
+
+
+ + ♻ ☆ RankFeat&RankWeight: Rank-1 Feature/Weight Removal for + Out-of-distribution Detection + + +
+ The task of out-of-distribution (OOD) detection is crucial for deploying
+machine learning models in real-world settings. In this paper, we observe that
+the singular value distributions of the in-distribution (ID) and OOD features
+are quite different: the OOD feature matrix tends to have a larger dominant
+singular value than the ID feature, and the class predictions of OOD samples
+are largely determined by it. This observation motivates us to propose
+\texttt{RankFeat}, a simple yet effective \emph{post hoc} approach for OOD
+detection by removing the rank-1 matrix composed of the largest singular value
+and the associated singular vectors from the high-level feature.
+\texttt{RankFeat} achieves \emph{state-of-the-art} performance and reduces the
+average false positive rate (FPR95) by 17.90\% compared with the previous best
+method. The success of \texttt{RankFeat} motivates us to investigate whether a
+similar phenomenon would exist in the parameter matrices of neural networks. We
+thus propose \texttt{RankWeight}, which removes the rank-1 weight from the
+parameter matrices of a single deep layer. Our \texttt{RankWeight} is also
+\emph{post hoc} and only requires computing the rank-1 matrix once. As a
+standalone approach, \texttt{RankWeight} has very competitive performance
+against other methods across various backbones. Moreover, \texttt{RankWeight}
+enjoys flexible compatibility with a wide range of OOD detection methods. The
+combination of \texttt{RankWeight} and \texttt{RankFeat} sets a new
+\emph{state-of-the-art}, achieving an FPR95 as low as 16.13\% on the
+ImageNet-1k benchmark. Extensive ablation studies and comprehensive theoretical
+analyses are presented to support the empirical results.
+
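+ The rank-1 removal described above has a direct linear-algebra reading; the
+sketch below (an illustration of the stated operation, not the authors'
+implementation) subtracts the dominant singular component from a feature
+matrix via SVD:
+
+ import numpy as np
+
+ def remove_rank1(feature):
+     # Subtract the rank-1 matrix formed by the largest singular value and
+     # its singular vectors: F' = F - s1 * u1 * v1^T.
+     U, S, Vt = np.linalg.svd(feature, full_matrices=False)
+     return feature - S[0] * np.outer(U[:, 0], Vt[0])
+
+ F = np.random.randn(49, 512)   # e.g. a flattened high-level feature map
+ F_clean = remove_rank1(F)
+ # The dominant singular value of F_clean equals the second-largest of F.
+ print(np.linalg.svd(F, compute_uv=False)[0],
+       np.linalg.svd(F_clean, compute_uv=False)[0])
+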
+
+ comment: submitted to T-PAMI. arXiv admin note: substantial text overlap with + arXiv:2209.08590 +
+
+
+
+
+ + ♻ ☆ Improved identification accuracy in equation learning via comprehensive + $\boldsymbol{R^2}$-elimination and Bayesian model selection + + +
+ In the field of equation learning, exhaustively considering all possible +equations derived from a basis function dictionary is infeasible. Sparse +regression and greedy algorithms have emerged as popular approaches to tackle +this challenge. However, the presence of multicollinearity poses difficulties +for sparse regression techniques, and greedy steps may inadvertently exclude +terms of the true equation, leading to reduced identification accuracy. In this +article, we present an approach that strikes a balance between +comprehensiveness and efficiency in equation learning. Inspired by stepwise +regression, our approach combines the coefficient of determination, $R^2$, and +the Bayesian model evidence, $p(\boldsymbol y|\mathcal M)$, in a novel way. Our +procedure is characterized by a comprehensive search with just a minor +reduction of the model space at each iteration step. With two flavors of our +approach and the adoption of $p(\boldsymbol y|\mathcal M)$ for bi-directional +stepwise regression, we present a total of three new avenues for equation +learning. Through three extensive numerical experiments involving random +polynomials and dynamical systems, we compare our approach against four +state-of-the-art methods and two standard approaches. The results demonstrate +that our comprehensive search approach surpasses all other methods in terms of +identification accuracy. In particular, the second flavor of our approach +establishes an efficient overfitting penalty solely based on $R^2$, which +achieves highest rates of exact equation recovery. + +
+
+ comment: 12 pages main text and 11 pages appendix, Published in TMLR + (https://openreview.net/forum?id=0ck7hJ8EVC) +
+
+
+
+
+ + ♻ ☆ Uncovering the Hidden Cost of Model Compression + + +
+ In the era of resource-intensive foundation models, efficient adaptation in +downstream tasks has become paramount. Visual Prompting (VP), inspired by +prompting in Large Language Models (LLMs), has emerged as a key transfer +learning method in computer vision. Aligned with the growing significance of +efficiency, research in model compression has become pivotal to alleviate the +computational burden in both training and deploying over-parameterized neural +networks. A key goal in model compression is the development of sparse models +capable of matching or surpassing the performance of their over-parameterized, +dense counterparts. While prior research has explored the impact of model +sparsity on transfer learning, its effects on visual prompting-based transfer +remain unclear. This study addresses this gap, revealing that model sparsity +adversely affects the performance of visual prompting-based transfer, +particularly in low-data-volume scenarios. Furthermore, our findings highlight +the negative influence of sparsity on the calibration of downstream +visual-prompted models. This empirical exploration calls for a nuanced +understanding beyond accuracy in sparse settings, opening avenues for further +research in Visual Prompting for sparse models. Code and logs can be accessed +at https://github.com/landskape-ai/Reprogram_LT . + +
+
+ comment: Preprint +
+
+
+
+
+ + ♻ ☆ Artificial Neural Networks generated by Low Discrepancy Sequences + + +
+ Artificial neural networks can be represented by paths. Generated as random +walks on a dense network graph, we find that the resulting sparse networks +allow for deterministic initialization and even weights with fixed sign. Such +networks can be trained sparse from scratch, avoiding the expensive procedure +of training a dense network and compressing it afterwards. Although sparse, +weights are accessed as contiguous blocks of memory. In addition, enumerating +the paths using deterministic low discrepancy sequences, for example the Sobol' +sequence, amounts to connecting the layers of neural units by progressive +permutations, which naturally avoids bank conflicts in parallel computer +hardware. We demonstrate that the artificial neural networks generated by low +discrepancy sequences can achieve an accuracy within reach of their dense +counterparts at a much lower computational complexity. + +
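+ For readers who want to experiment with the building block used here, the
+sketch below (illustrative only, not the paper's path construction) draws
+points from a Sobol' low-discrepancy sequence with SciPy and uses them to pick
+a deterministic sparse subset of connections between two layers:
+
+ import numpy as np
+ from scipy.stats import qmc
+
+ n_in, n_out = 32, 16
+ sampler = qmc.Sobol(d=2, scramble=False)
+ points = sampler.random_base2(m=6)        # 2**6 = 64 quasi-random points in [0, 1)^2
+
+ # Map each low-discrepancy point to one (input, output) connection,
+ # giving a deterministic sparse connectivity pattern between two layers.
+ rows = (points[:, 0] * n_in).astype(int)
+ cols = (points[:, 1] * n_out).astype(int)
+ mask = np.zeros((n_in, n_out), dtype=bool)
+ mask[rows, cols] = True
+ print(mask.sum(), "of", mask.size, "possible connections kept")
+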
+
+
+
+
+ + ♻ ☆ Efficient Gradient Estimation via Adaptive Sampling and Importance + Sampling + + +
+ Machine learning problems rely heavily on stochastic gradient descent (SGD) +for optimization. The effectiveness of SGD is contingent upon accurately +estimating gradients from a mini-batch of data samples. Instead of the commonly +used uniform sampling, adaptive or importance sampling reduces noise in +gradient estimation by forming mini-batches that prioritize crucial data +points. Previous research has suggested that data points should be selected +with probabilities proportional to their gradient norm. Nevertheless, existing +algorithms have struggled to efficiently integrate importance sampling into +machine learning frameworks. In this work, we make two contributions. First, we +present an algorithm that can incorporate existing importance functions into +our framework. Second, we propose a simplified importance function that relies +solely on the loss gradient of the output layer. By leveraging our proposed +gradient estimation techniques, we observe improved convergence in +classification and regression tasks with minimal computational overhead. We +validate the effectiveness of our adaptive and importance-sampling approach on +image and point-cloud datasets. + +
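+ As a concrete, simplified illustration of sampling data points with
+probability proportional to a per-example importance score and correcting the
+gradient estimate so it stays unbiased, consider the sketch below; the scores
+here are a stand-in for the gradient-norm-based importance the abstract
+describes, not the paper's exact function.
+
+ import numpy as np
+
+ def importance_minibatch(scores, batch_size, rng):
+     # Sample indices with probability proportional to the importance scores
+     # and return the weights 1 / (n * p_i) that keep the estimator unbiased.
+     p = scores / scores.sum()
+     idx = rng.choice(len(scores), size=batch_size, replace=True, p=p)
+     weights = 1.0 / (len(scores) * p[idx])
+     return idx, weights
+
+ rng = np.random.default_rng(0)
+ per_example_grads = rng.standard_normal((1000, 5))
+ scores = np.linalg.norm(per_example_grads, axis=1) + 1e-8   # stand-in importance
+
+ idx, w = importance_minibatch(scores, batch_size=64, rng=rng)
+ estimate = (w[:, None] * per_example_grads[idx]).mean(axis=0)
+ print(estimate)   # approximates per_example_grads.mean(axis=0)
+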
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ A New Type Of Upper And Lower Bounds On Right-Tail Probabilities Of + Continuous Random Variables + + +
+ In this paper, I present a completely new type of upper and lower bounds on +the right-tail probabilities of continuous random variables with unbounded +support and with semi-bounded support from the left. The presented upper and +lower right-tail bounds depend only on the probability density function (PDF), +its first derivative, and two parameters that are used for tightening the +bounds. These tail bounds hold under certain conditions that depend on the PDF, +its first and second derivatives, and the two parameters. The new tail bounds +are shown to be tight for a wide range of continuous random variables via +numerical examples. + +
+
+ comment: Minor typos corrected v2 +
+
+
+
+
+ + ♻ ☆ Unleashing the Power of Pre-trained Language Models for Offline + Reinforcement Learning + + +
+ Offline reinforcement learning (RL) aims to find a near-optimal policy using +pre-collected datasets. In real-world scenarios, data collection could be +costly and risky; therefore, offline RL becomes particularly challenging when +the in-domain data is limited. Given recent advances in Large Language Models +(LLMs) and their few-shot learning prowess, this paper introduces +$\textbf{La}$nguage Models for $\textbf{Mo}$tion Control ($\textbf{LaMo}$), a +general framework based on Decision Transformers to effectively use pre-trained +Language Models (LMs) for offline RL. Our framework highlights four crucial +components: (1) Initializing Decision Transformers with sequentially +pre-trained LMs, (2) employing the LoRA fine-tuning method, in contrast to +full-weight fine-tuning, to combine the pre-trained knowledge from LMs and +in-domain knowledge effectively, (3) using the non-linear MLP transformation +instead of linear projections, to generate embeddings, and (4) integrating an +auxiliary language prediction loss during fine-tuning to stabilize the LMs and +retain their original abilities on languages. Empirical results indicate +$\textbf{LaMo}$ achieves state-of-the-art performance in sparse-reward tasks +and closes the gap between value-based offline RL methods and decision +transformers in dense-reward tasks. In particular, our method demonstrates +superior performance in scenarios with limited data samples. + +
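+ One of the listed components, LoRA fine-tuning of a pre-trained LM backbone,
+can be sketched with the Hugging Face peft library (a generic illustration of
+LoRA only, not the LaMo codebase; the target module name assumes a GPT-2-style
+backbone):
+
+ from transformers import AutoModelForCausalLM
+ from peft import LoraConfig, get_peft_model
+
+ # Load a small pre-trained LM as the sequence-model backbone.
+ backbone = AutoModelForCausalLM.from_pretrained("gpt2")
+
+ # Inject low-rank adapters into the attention projections; only the
+ # adapter weights (a small fraction of parameters) are trained.
+ config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05,
+                     target_modules=["c_attn"])
+ model = get_peft_model(backbone, config)
+ model.print_trainable_parameters()
+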
+
+ comment: 24 pages, 16 tables +
+
+
+
+
+ + ♻ ☆ A Comparison of PDF Projection with Normalizing Flows and SurVAE + + +
+ Normalizing flows (NF) recently gained attention as a way to construct +generative networks with exact likelihood calculation out of composable layers. +However, NF is restricted to dimension-preserving transformations. Surjection +VAE (SurVAE) has been proposed to extend NF to dimension-altering +transformations. Such networks are desirable because they are expressive and +can be precisely trained. We show that the approaches are a re-invention of PDF +projection, which appeared over twenty years earlier and is much further +developed. + +
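+ For context, both constructions rely on the exact change-of-variables
+likelihood for an invertible, dimension-preserving map $f$ with base density
+$p_Z$ (standard background, stated here for the reader's convenience):
+
+   p_X(x) \;=\; p_Z\bigl(f(x)\bigr)\,\left|\det \frac{\partial f(x)}{\partial x}\right|
+
+ SurVAE-style surjective layers relax the dimension-preserving requirement,
+which is the setting the comparison above concerns.
+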
+
+
+
+
+ + ♻ ☆ Token-Level Adversarial Prompt Detection Based on Perplexity Measures + and Contextual Information + + +
+ In recent years, Large Language Models (LLM) have emerged as pivotal tools in +various applications. However, these models are susceptible to adversarial +prompt attacks, where attackers can carefully curate input strings that lead to +undesirable outputs. The inherent vulnerability of LLMs stems from their +input-output mechanisms, especially when presented with intensely +out-of-distribution (OOD) inputs. This paper proposes a token-level detection +method to identify adversarial prompts, leveraging the LLM's capability to +predict the next token's probability. We measure the degree of the model's +perplexity and incorporate neighboring token information to encourage the +detection of contiguous adversarial prompt sequences. As a result, we propose +two methods: one that identifies each token as either being part of an +adversarial prompt or not, and another that estimates the probability of each +token being part of an adversarial prompt. + +
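+ A minimal sketch of the underlying signal, per-token negative log-likelihood
+under a causal LM with simple neighbour smoothing, is given below (an
+illustration of the general idea, not the authors' detector; the threshold is
+purely illustrative):
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ tok = AutoTokenizer.from_pretrained("gpt2")
+ lm = AutoModelForCausalLM.from_pretrained("gpt2")
+ lm.eval()
+
+ def token_surprisal(text):
+     # Negative log-probability of each token given its left context.
+     ids = tok(text, return_tensors="pt").input_ids
+     with torch.no_grad():
+         logits = lm(ids).logits
+     logp = torch.log_softmax(logits[:, :-1], dim=-1)
+     nll = -logp.gather(-1, ids[:, 1:].unsqueeze(-1)).squeeze(-1)[0]
+     return tok.convert_ids_to_tokens(ids[0, 1:].tolist()), nll
+
+ tokens, nll = token_surprisal("Please summarise this paragraph for me.")
+ # Smooth over neighbouring tokens so contiguous high-surprisal spans stand out.
+ smoothed = torch.nn.functional.avg_pool1d(nll.view(1, 1, -1), 3, stride=1, padding=1).view(-1)
+ flagged = [t for t, s in zip(tokens, smoothed.tolist()) if s > 8.0]
+ print(flagged)
+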
+
+
+
+
+ + ♻ ☆ The Open DAC 2023 Dataset and Challenges for Sorbent Discovery in Direct + Air Capture + + +
+ New methods for carbon dioxide removal are urgently needed to combat global +climate change. Direct air capture (DAC) is an emerging technology to capture +carbon dioxide directly from ambient air. Metal-organic frameworks (MOFs) have +been widely studied as potentially customizable adsorbents for DAC. However, +discovering promising MOF sorbents for DAC is challenging because of the vast +chemical space to explore and the need to understand materials as functions of +humidity and temperature. We explore a computational approach benefiting from +recent innovations in machine learning (ML) and present a dataset named Open +DAC 2023 (ODAC23) consisting of more than 38M density functional theory (DFT) +calculations on more than 8,400 MOF materials containing adsorbed $CO_2$ and/or +$H_2O$. ODAC23 is by far the largest dataset of MOF adsorption calculations at +the DFT level of accuracy currently available. In addition to probing +properties of adsorbed molecules, the dataset is a rich source of information +on structural relaxation of MOFs, which will be useful in many contexts beyond +specific applications for DAC. A large number of MOFs with promising properties +for DAC are identified directly in ODAC23. We also trained state-of-the-art ML +models on this dataset to approximate calculations at the DFT level. This +open-source dataset and our initial ML models will provide an important +baseline for future efforts to identify MOFs for a wide range of applications, +including DAC. + +
+
+
+
+
+ + ♻ ☆ TODM: Train Once Deploy Many Efficient Supernet-Based RNN-T Compression + For On-device ASR Models ICASSP 2024 + + +
+ Automatic Speech Recognition (ASR) models need to be optimized for specific
+hardware before they can be deployed on devices. This can be done by tuning the
+model's hyperparameters or exploring variations in its architecture.
+Re-training and re-validating models after making these changes can be a
+resource-intensive task. This paper presents TODM (Train Once Deploy Many), a
+new approach to efficiently train many sizes of hardware-friendly on-device ASR
+models with GPU-hours comparable to those of a single training job. TODM
+leverages insights from prior work on Supernet, where Recurrent Neural Network
+Transducer (RNN-T) models share weights within a Supernet. It reduces layer
+sizes and widths of the Supernet to obtain subnetworks, making them smaller
+models suitable for all hardware types. We introduce a novel combination of
+three techniques to improve the outcomes of the TODM Supernet: adaptive
+dropouts, an in-place Alpha-divergence knowledge distillation, and the use of
+the ScaledAdam optimizer. We validate our approach by comparing Supernet-trained
+versus individually tuned Multi-Head State Space Model (MH-SSM) RNN-T using
+LibriSpeech. Results demonstrate that our TODM Supernet either matches or
+surpasses the performance of manually tuned models by up to 3% relative in word
+error rate (WER), while efficiently keeping the cost of training many models at
+a small constant.
+
+
+ comment: Meta AI; Submitted to ICASSP 2024 +
+
+
+
+
+ + ♻ ☆ StratMed: Relevance Stratification between Biomedical Entities for + Sparsity on Medication Recommendation + + +
+ With the growing imbalance between limited medical resources and escalating +demands, AI-based clinical tasks have become paramount. As a sub-domain, +medication recommendation aims to amalgamate longitudinal patient history with +medical knowledge, assisting physicians in prescribing safer and more accurate +medication combinations. Existing works ignore the inherent long-tailed +distribution of medical data, have uneven learning strengths for hot and sparse +data, and fail to balance safety and accuracy. To address the above +limitations, we propose StratMed, which introduces a stratification strategy +that overcomes the long-tailed problem and achieves fuller learning of sparse +data. It also utilizes a dual-property network to address the issue of mutual +constraints on the safety and accuracy of medication combinations, +synergistically enhancing these two properties. Specifically, we construct a +pre-training method using deep learning networks to obtain medication and +disease representations. After that, we design a pyramid-like stratification +method based on relevance to strengthen the expressiveness of sparse data. +Based on this relevance, we design two graph structures to express medication +safety and precision at the same level to obtain patient representations. +Finally, the patient's historical clinical information is fitted to generate +medication combinations for the current health condition. We employed the +MIMIC-III dataset to evaluate our model against state-of-the-art methods in +three aspects comprehensively. Compared to the sub-optimal baseline model, our +model reduces safety risk by 15.08\%, improves accuracy by 0.36\%, and reduces +training time consumption by 81.66\%. + +
+
+
+
+
+ + ♻ ☆ Auto-PINN: Understanding and Optimizing Physics-Informed Neural + Architecture + + +
+ Physics-informed neural networks (PINNs) are revolutionizing science and
+engineering practice by bringing the power of deep learning to bear on
+scientific computation. In forward modeling problems, PINNs are meshless
+partial differential equation (PDE) solvers that can handle irregular,
+high-dimensional physical domains. Naturally, the neural architecture
+hyperparameters have a large impact on the efficiency and accuracy of the PINN
+solver. However, this remains an open and challenging problem because of the
+large search space and the difficulty of identifying a proper search objective
+for PDEs. Here, we propose Auto-PINN, the first systematic, automated
+hyperparameter optimization approach for PINNs, which applies Neural
+Architecture Search (NAS) techniques to PINN design. Auto-PINN avoids manually
+or exhaustively searching the hyperparameter space associated with PINNs. A
+comprehensive set of pre-experiments using standard PDE benchmarks allows us to
+probe the structure-performance relationship in PINNs. We find that the
+different hyperparameters can be decoupled, and that the training loss function
+of PINNs is a good search objective. Comparison experiments with baseline
+methods demonstrate that Auto-PINN produces neural architectures with superior
+stability and accuracy over alternative baselines.
+
+
+
+
+
+ + ♻ ☆ FedSoL: Bridging Global Alignment and Local Generality in Federated + Learning + + +
+ Federated Learning (FL) aggregates locally trained models from individual +clients to construct a global model. While FL enables learning a model with +data privacy, it often suffers from significant performance degradation when +client data distributions are heterogeneous. Many previous FL algorithms have +addressed this issue by introducing various proximal restrictions. These +restrictions aim to encourage global alignment by constraining the deviation of +local learning from the global objective. However, they inherently limit local +learning by interfering with the original local objectives. Recently, an +alternative approach has emerged to improve local learning generality. By +obtaining local models within a smooth loss landscape, this approach mitigates +conflicts among different local objectives of the clients. Yet, it does not +ensure stable global alignment, as local learning does not take the global +objective into account. In this study, we propose Federated Stability on +Learning (FedSoL), which combines both the concepts of global alignment and +local generality. In FedSoL, the local learning seeks a parameter region robust +against proximal perturbations. This strategy introduces an implicit proximal +restriction effect in local learning while maintaining the original local +objective for parameter update. Our experiments show that FedSoL consistently +achieves state-of-the-art performance on various setups. + +
+
+ comment: 19 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ A review of ensemble learning and data augmentation models for class + imbalanced problems: combination, implementation and evaluation + + +
+ Class imbalance (CI) in classification problems arises when the number of +observations belonging to one class is lower than the other. Ensemble learning +combines multiple models to obtain a robust model and has been prominently used +with data augmentation methods to address class imbalance problems. In the last +decade, a number of strategies have been added to enhance ensemble learning and +data augmentation methods, along with new methods such as generative +adversarial networks (GANs). A combination of these has been applied in many +studies, and the evaluation of different combinations would enable a better +understanding and guidance for different application domains. In this paper, we +present a computational study to evaluate data augmentation and ensemble +learning methods used to address prominent benchmark CI problems. We present a +general framework that evaluates 9 data augmentation and 9 ensemble learning +methods for CI problems. Our objective is to identify the most effective +combination for improving classification performance on imbalanced datasets. +The results indicate that combinations of data augmentation methods with +ensemble learning can significantly improve classification performance on +imbalanced datasets. We find that traditional data augmentation methods such as +the synthetic minority oversampling technique (SMOTE) and random oversampling +(ROS) are not only better in performance for selected CI problems, but also +computationally less expensive than GANs. Our study is vital for the +development of novel models for handling imbalanced datasets. + +
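+ As a concrete example of the kind of combination evaluated in the study, the
+sketch below (a generic illustration with toy data, not the paper's framework)
+pairs SMOTE oversampling with a bagging-style ensemble using scikit-learn and
+imbalanced-learn:
+
+ from sklearn.datasets import make_classification
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.model_selection import train_test_split
+ from sklearn.metrics import balanced_accuracy_score
+ from imblearn.over_sampling import SMOTE
+
+ # Imbalanced toy problem: roughly 5% minority class.
+ X, y = make_classification(n_samples=2000, weights=[0.95, 0.05], random_state=0)
+ X_tr, X_te, y_tr, y_te = train_test_split(X, y, stratify=y, random_state=0)
+
+ # Oversample only the training split, then fit an ensemble on the balanced data.
+ X_bal, y_bal = SMOTE(random_state=0).fit_resample(X_tr, y_tr)
+ clf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X_bal, y_bal)
+ print("balanced accuracy:", balanced_accuracy_score(y_te, clf.predict(X_te)))
+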
+
+
+
+
+ + ♻ ☆ Redefining Super-Resolution: Fine-mesh PDE predictions without classical + simulations NeurIPS 2023 + + +
+ In Computational Fluid Dynamics (CFD), coarse mesh simulations offer
+computational efficiency but often lack precision. Applying conventional
+super-resolution to these simulations poses a significant challenge due to the
+fundamental contrast between downsampling high-resolution images and
+authentically emulating low-resolution physics. The former method conserves
+more of the underlying physics, surpassing the usual constraints of real-world
+scenarios. We propose a novel definition of super-resolution tailored for
+PDE-based problems. Instead of simply downsampling from a high-resolution
+dataset, we use coarse-grid simulated data as our input and predict fine-grid
+simulated outcomes. Employing a physics-infused UNet upscaling method, we
+demonstrate its efficacy across various 2D-CFD problems such as discontinuity
+detection in Burgers' equation, methane combustion, and fouling in industrial
+heat exchangers. Our method enables the generation of fine-mesh solutions
+bypassing traditional simulation, ensuring considerable computational savings
+and fidelity to the original ground truth outcomes. Through diverse boundary
+conditions during training, we further establish the robustness of our method,
+paving the way for its broad applications in engineering and scientific CFD
+solvers.
+
+
+ comment: Accepted at Machine Learning and the Physical Sciences Workshop, + NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Directional Privacy for Deep Learning + + +
+ Differentially Private Stochastic Gradient Descent (DP-SGD) is a key method +for applying privacy in the training of deep learning models. It applies +isotropic Gaussian noise to gradients during training, which can perturb these +gradients in any direction, damaging utility. Metric DP, however, can provide +alternative mechanisms based on arbitrary metrics that might be more suitable +for preserving utility. In this paper, we apply \textit{directional privacy}, +via a mechanism based on the von Mises-Fisher (VMF) distribution, to perturb +gradients in terms of \textit{angular distance} so that gradient direction is +broadly preserved. We show that this provides both $\epsilon$-DP and $\epsilon +d$-privacy for deep learning training, rather than the $(\epsilon, +\delta)$-privacy of the Gaussian mechanism. Experiments on key datasets then +indicate that the VMF mechanism can outperform the Gaussian in the +utility-privacy trade-off. In particular, our experiments provide a direct +empirical comparison of privacy between the two approaches in terms of their +ability to defend against reconstruction and membership inference. + +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Real Time GAZED: Online Shot Selection and Editing of Virtual Cameras + from Wide-Angle Monocular Video Recordings + + +
+ Eliminating time-consuming post-production processes and delivering +high-quality videos in today's fast-paced digital landscape are the key +advantages of real-time approaches. To address these needs, we present Real +Time GAZED: a real-time adaptation of the GAZED framework integrated with +CineFilter, a novel real-time camera trajectory stabilization approach. It +enables users to create professionally edited videos in real-time. Comparative +evaluations against baseline methods, including the non-real-time GAZED, +demonstrate that Real Time GAZED achieves similar editing results, ensuring +high-quality video output. Furthermore, a user study confirms the aesthetic +quality of the video edits produced by the Real Time GAZED approach. With these +advancements in real-time camera trajectory optimization and video editing +presented, the demand for immediate and dynamic content creation in industries +such as live broadcasting, sports coverage, news reporting, and social media +content creation can be met more efficiently. + +
+
+
+
+
+ + ☆ EAFP-Med: An Efficient Adaptive Feature Processing Module Based on + Prompts for Medical Image Detection + + +
+ In the face of rapid advances in medical imaging, cross-domain adaptive +medical image detection is challenging due to the differences in lesion +representations across various medical imaging technologies. To address this +issue, we draw inspiration from large language models to propose EAFP-Med, an +efficient adaptive feature processing module based on prompts for medical image +detection. EAFP-Med can efficiently extract lesion features of different scales +from a diverse range of medical images based on prompts while being flexible +and not limited by specific imaging techniques. Furthermore, it serves as a +feature preprocessing module that can be connected to any model front-end to +enhance the lesion features in input images. Moreover, we propose a novel +adaptive disease detection model named EAFP-Med ST, which utilizes the Swin +Transformer V2 - Tiny (SwinV2-T) as its backbone and connects it to EAFP-Med. +We have compared our method to nine state-of-the-art methods. Experimental +results demonstrate that EAFP-Med ST achieves the best performance on all three +datasets (chest X-ray images, cranial magnetic resonance imaging images, and +skin images). EAFP-Med can efficiently extract lesion features from various +medical images based on prompts, enhancing the model's performance. This holds +significant potential for improving medical image analysis and diagnosis. + +
+
+
+
+
+ + ☆ Automatic Time Signature Determination for New Scores Using Lyrics for + Latent Rhythmic Structure + + +
+ There has recently been a sharp increase in interest in Artificial +Intelligence-Generated Content (AIGC). Despite this, musical components such as +time signatures have not been studied sufficiently to form an algorithmic +determination approach for new compositions, especially lyrical songs. This is +likely because of the neglect of musical details, which is critical for +constructing a robust framework. Specifically, time signatures establish the +fundamental rhythmic structure for almost all aspects of a song, including the +phrases and notes. In this paper, we propose a novel approach that only uses +lyrics as input to automatically generate a fitting time signature for lyrical +songs and uncover the latent rhythmic structure utilizing explainable machine +learning models. In particular, we devise multiple methods that are associated +with discovering lyrical patterns and creating new features that simultaneously +contain lyrical, rhythmic, and statistical information. In this approach, the +best of our experimental results reveal a 97.6% F1 score and a 0.996 Area Under +the Curve (AUC) of the Receiver Operating Characteristic (ROC) score. In +conclusion, our research directly generates time signatures from lyrics +automatically for new scores utilizing machine learning, which is an innovative +idea that approaches an understudied component of musicology and therefore +contributes significantly to the future of Artificial Intelligence (AI) music +generation. + +
+
+ comment: Submitted to IEEE Big Data 2023 Conference +
+
+
+
+
+ + ♻ ☆ A Closer Look at Audio-Visual Segmentation + + +
+ Audio-visual segmentation (AVS) is a complex task that involves accurately +segmenting the corresponding sounding object based on audio-visual queries. +Successful audio-visual learning requires two essential components: 1) an +unbiased dataset with high-quality pixel-level multi-class labels, and 2) a +model capable of effectively linking audio information with its corresponding +visual object. However, these two requirements are only partially addressed by +current methods, with training sets containing biased audio-visual data, and +models that generalise poorly beyond this biased training set. In this work, we +propose a new strategy to build cost-effective and relatively unbiased +audio-visual semantic segmentation benchmarks. Our strategy, called Visual +Post-production (VPO), explores the observation that it is not necessary to +have explicit audio-visual pairs extracted from single video sources to build +such benchmarks. We also refine the previously proposed AVSBench to transform +it into the audio-visual semantic segmentation benchmark AVSBench-Single+. +Furthermore, this paper introduces a new pixel-wise audio-visual contrastive +learning method to enable a better generalisation of the model beyond the +training set. We verify the validity of the VPO strategy by showing that +state-of-the-art (SOTA) models trained with datasets built by matching audio +and visual data from different sources or with datasets containing audio and +visual data from the same video source produce almost the same accuracy. Then, +using the proposed VPO benchmarks and AVSBench-Single+, we show that our method +produces more accurate audio-visual semantic segmentation than SOTA models. +Code and dataset will be available. + +
+
+
+
+
+ + ♻ ☆ HierSpeech++: Bridging the Gap between Semantic and Acoustic + Representation of Speech by Hierarchical Variational Inference for Zero-shot + Speech Synthesis + + +
+ Large language models (LLM)-based speech synthesis has been widely adopted in +zero-shot speech synthesis. However, they require a large-scale data and +possess the same limitations as previous autoregressive speech models, +including slow inference speed and lack of robustness. This paper proposes +HierSpeech++, a fast and strong zero-shot speech synthesizer for text-to-speech +(TTS) and voice conversion (VC). We verified that hierarchical speech synthesis +frameworks could significantly improve the robustness and expressiveness of the +synthetic speech. Furthermore, we significantly improve the naturalness and +speaker similarity of synthetic speech even in zero-shot speech synthesis +scenarios. For text-to-speech, we adopt the text-to-vec framework, which +generates a self-supervised speech representation and an F0 representation +based on text representations and prosody prompts. Then, HierSpeech++ generates +speech from the generated vector, F0, and voice prompt. We further introduce a +high-efficient speech super-resolution framework from 16 kHz to 48 kHz. The +experimental results demonstrated that the hierarchical variational autoencoder +could be a strong zero-shot speech synthesizer given that it outperforms +LLM-based and diffusion-based models. Moreover, we achieved the first +human-level quality zero-shot speech synthesis. Audio samples and source code +are available at https://github.com/sh-lee-prml/HierSpeechpp. + +
+
+ comment: 16 pages, 9 figures, 12 tables +
+
+
+
+
+ + ♻ ☆ SLMIA-SR: Speaker-Level Membership Inference Attacks against Speaker + Recognition Systems NDSS + + +
+ Membership inference attacks allow adversaries to determine whether a +particular example was contained in the model's training dataset. While +previous works have confirmed the feasibility of such attacks in various +applications, none has focused on speaker recognition (SR), a promising +voice-based biometric recognition technique. In this work, we propose SLMIA-SR, +the first membership inference attack tailored to SR. In contrast to +conventional example-level attack, our attack features speaker-level membership +inference, i.e., determining if any voices of a given speaker, either the same +as or different from the given inference voices, have been involved in the +training of a model. It is particularly useful and practical since the training +and inference voices are usually distinct, and it is also meaningful +considering the open-set nature of SR, namely, the recognition speakers were +often not present in the training data. We utilize intra-similarity and +inter-dissimilarity, two training objectives of SR, to characterize the +differences between training and non-training speakers and quantify them with +two groups of features driven by carefully-established feature engineering to +mount the attack. To improve the generalizability of our attack, we propose a +novel mixing ratio training strategy to train attack models. To enhance the +attack performance, we introduce voice chunk splitting to cope with the limited +number of inference voices and propose to train attack models dependent on the +number of inference voices. Our attack is versatile and can work in both +white-box and black-box scenarios. Additionally, we propose two novel +techniques to reduce the number of black-box queries while maintaining the +attack performance. Extensive experiments demonstrate the effectiveness of +SLMIA-SR. + +
+
+ comment: In Proceedings of the 31st Network and Distributed System Security + (NDSS) Symposium, 2024 +
+
+
+
+
+ + ♻ ☆ Archiving Body Movements: Collective Generation of Chinese Calligraphy + + +
+ As a communication channel, body movements have been widely explored in +behavioral studies and kinesics. Performing and visual arts share the same +interests but focus on documenting and representing human body movements, such +as for dance notation and visual work creation. This paper investigates body +movements in oriental calligraphy and how to apply calligraphy principles to +stimulate and archive body movements. Through an artwork (Wushu), the authors +experiment with an interactive and generative approach to engage the audience's +bodily participation and archive the body movements as a compendium of +generated calligraphy. The audience assumes the role of both writers and +readers; creating ("writing") and appreciating ("reading") the generated +calligraphy becomes a cyclical process within this infinite "Book," which can +motivate further attention and discussions concerning Chinese characters and +calligraphy. + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 22 + +
+
+
+ + ☆ Uncertainty-aware Language Modeling for Selective Question Answering + + +
+ We present an automatic large language model (LLM) conversion approach that +produces uncertainty-aware LLMs capable of estimating uncertainty with every +prediction. Our approach is model- and data-agnostic, is +computationally-efficient, and does not rely on external models or systems. We +evaluate converted models on the selective question answering setting -- to +answer as many questions as possible while maintaining a given accuracy, +forgoing providing predictions when necessary. As part of our results, we test +BERT and Llama 2 model variants on the SQuAD extractive QA task and the +TruthfulQA generative QA task. We show that using the uncertainty estimates +provided by our approach to selectively answer questions leads to significantly +higher accuracy over directly using model probabilities. + +
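+ The selective-answering setting itself is easy to state in code; the sketch
+below (illustrative only, independent of the paper's conversion method and
+using synthetic stand-in scores) abstains whenever an uncertainty score exceeds
+a threshold and reports the resulting coverage/accuracy trade-off:
+
+ import numpy as np
+
+ def selective_answering(correct, uncertainty, threshold):
+     # Answer only when the model's uncertainty is below the threshold;
+     # report coverage (fraction answered) and accuracy on answered items.
+     answered = uncertainty < threshold
+     coverage = answered.mean()
+     accuracy = correct[answered].mean() if answered.any() else float("nan")
+     return coverage, accuracy
+
+ rng = np.random.default_rng(0)
+ uncertainty = rng.uniform(0, 1, 1000)                  # stand-in uncertainty estimates
+ correct = rng.uniform(0, 1, 1000) > uncertainty * 0.8  # harder items are wrong more often
+
+ for t in (0.25, 0.5, 1.0):
+     cov, acc = selective_answering(correct, uncertainty, t)
+     print(f"threshold {t:.2f}: coverage {cov:.2f}, accuracy {acc:.2f}")
+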
+
+
+
+
+ + ☆ Learning to Skip for Language Modeling + + +
+ Overparameterized large-scale language models show impressive generalization performance in in-context few-shot learning. However, most language models allocate the same amount of parameters or computation to each token, disregarding the complexity or importance of the input data. We argue that in language model pretraining, a variable amount of computation should be assigned to different tokens, and that this can be achieved efficiently via a simple routing mechanism. Unlike conventional early-exit techniques, where tokens can only exit at early layers, we propose a more general method that dynamically skips the execution of a layer (or module) for any input token with a binary router. In our extensive evaluation across 24 NLP tasks, we demonstrate that the proposed method significantly improves 1-shot performance over other competitive baselines at only a mild extra inference cost.
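The sketch below illustrates the general idea of a per-token binary router that can skip a transformer layer, using a straight-through estimator so the hard decision remains trainable. It is a simplified PyTorch illustration, not the paper's implementation; in particular it still runs the block for all tokens and only masks the result, whereas an efficient implementation would gather just the kept tokens.

```python
import torch
import torch.nn as nn

class SkipRouter(nn.Module):
    """Per-token binary router: decide whether a token goes through a layer."""
    def __init__(self, d_model: int):
        super().__init__()
        self.gate = nn.Linear(d_model, 1)

    def forward(self, x):                      # x: (batch, seq, d_model)
        p = torch.sigmoid(self.gate(x))        # soft keep-probability per token
        hard = (p > 0.5).float()
        # Straight-through: hard decision forward, soft gradient backward.
        return hard + p - p.detach()

class SkippableLayer(nn.Module):
    """Wrap a transformer block so routed-out tokens bypass its computation."""
    def __init__(self, d_model: int, n_heads: int = 4):
        super().__init__()
        self.block = nn.TransformerEncoderLayer(d_model, n_heads, batch_first=True)
        self.router = SkipRouter(d_model)

    def forward(self, x):
        keep = self.router(x)                  # (batch, seq, 1)
        return keep * self.block(x) + (1.0 - keep) * x

x = torch.randn(2, 16, 64)
print(SkippableLayer(64)(x).shape)             # torch.Size([2, 16, 64])
```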
+
+
+
+
+ + ☆ Machine-Generated Text Detection using Deep Learning + + +
+ Our research focuses on the crucial challenge of discerning text produced by Large Language Models (LLMs) from human-generated text, which holds significance for various applications. With ongoing discussions about attaining a model with such functionality, we present supporting evidence regarding the feasibility of such models. We evaluated our models on multiple datasets, including Twitter Sentiment, Football Commentary, Project Gutenberg, PubMedQA, and SQuAD, confirming the efficacy of the enhanced detection approaches. These datasets were sampled under a wide range of constraints, laying the foundation for future research. We evaluate GPT-3.5-Turbo against various detectors such as SVM, RoBERTa-base, and RoBERTa-large. Our findings show that detection performance depends predominantly on the sequence length of the sentence.
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Learning Section Weights for Multi-Label Document Classification + + +
+ Multi-label document classification is a traditional task in NLP. Compared to single-label classification, each document can be assigned multiple classes. This problem is crucially important in various domains, such as tagging scientific articles. Documents are often structured into several sections, such as the abstract and title. Current approaches treat different sections equally for multi-label classification. We argue that this is not a realistic assumption, leading to sub-optimal results. Instead, we propose a new method called Learning Section Weights (LSW), which leverages the contribution of each distinct section for multi-label classification. Via multiple feed-forward layers, LSW learns to assign a weight to each section of a document and to incorporate these weights into the prediction. We demonstrate our approach on scientific articles. Experimental results on public (arXiv) and private (Elsevier) datasets confirm the superiority of LSW compared to state-of-the-art multi-label document classification methods. In particular, LSW achieves a 1.3% improvement in macro-averaged F1-score and a 1.3% improvement in macro-averaged recall on the publicly available arXiv dataset.
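A minimal PyTorch sketch of the section-weighting idea: a small feed-forward scorer assigns a weight to each section embedding, the weighted sections are pooled, and a sigmoid classifier produces multi-label probabilities. The module and dimension choices are illustrative assumptions, not the LSW code.

```python
import torch
import torch.nn as nn

class SectionWeighting(nn.Module):
    """Learn a scalar weight per section and pool section embeddings for
    multi-label classification (a rough sketch of the idea, not the paper's code)."""
    def __init__(self, d_model: int, n_labels: int):
        super().__init__()
        self.scorer = nn.Sequential(nn.Linear(d_model, d_model), nn.ReLU(),
                                    nn.Linear(d_model, 1))
        self.classifier = nn.Linear(d_model, n_labels)

    def forward(self, section_embs):           # (batch, n_sections, d_model)
        weights = torch.softmax(self.scorer(section_embs), dim=1)  # weight per section
        doc = (weights * section_embs).sum(dim=1)                  # weighted pooling
        return torch.sigmoid(self.classifier(doc))                 # multi-label probabilities

sections = torch.randn(4, 3, 128)              # e.g. title, abstract, body embeddings
print(SectionWeighting(128, 10)(sections).shape)  # torch.Size([4, 10])
```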
+
+ comment: 7 pages, 4 figures, 5 tables +
+
+
+
+
+ + ☆ Enhancing Empathetic and Emotion Support Dialogue Generation with + Prophetic Commonsense Inference + + +
+ The interest in Empathetic and Emotional Support conversations among the +public has significantly increased. To offer more sensitive and understanding +responses, leveraging commonsense knowledge has become a common strategy to +better understand psychological aspects and causality. However, such +commonsense inferences can be out of context and unable to predict upcoming +dialogue themes, resulting in responses that lack coherence and empathy. To +remedy this issue, we present Prophetic Commonsense Inference, an innovative +paradigm for inferring commonsense knowledge. By harnessing the capabilities of +Large Language Models in understanding dialogue and making commonsense +deductions, we train tunable models to bridge the gap between past and +potential future dialogues. Extensive experiments conducted on +EmpatheticDialogues and Emotion Support Conversation show that equipping +dialogue agents with our proposed prophetic commonsense inference significantly +enhances the quality of their responses. + +
+
+
+
+
+ + ☆ UHGEval: Benchmarking the Hallucination of Chinese Large Language Models + via Unconstrained Generation ICDE2024 + + +
+ Large language models (LLMs) have emerged as pivotal contributors in +contemporary natural language processing and are increasingly being applied +across a diverse range of industries. However, these large-scale probabilistic +statistical models cannot currently ensure the requisite quality in +professional content generation. These models often produce hallucinated text, +compromising their practical utility in professional contexts. To assess the +authentic reliability of LLMs in text generation, numerous initiatives have +developed benchmark evaluations for hallucination phenomena. Nevertheless, +these benchmarks frequently utilize constrained generation techniques due to +cost and temporal constraints. These techniques encompass the use of directed +hallucination induction and strategies that deliberately alter authentic text +to produce hallucinations. These approaches are not congruent with the +unrestricted text generation demanded by real-world applications. Furthermore, +a well-established Chinese-language dataset dedicated to the evaluation of +hallucinations in text generation is presently lacking. Consequently, we have +developed an Unconstrained Hallucination Generation Evaluation (UHGEval) +benchmark, designed to compile outputs produced with minimal restrictions by +LLMs. Concurrently, we have established a comprehensive benchmark evaluation +framework to aid subsequent researchers in undertaking scalable and +reproducible experiments. We have also executed extensive experiments, +evaluating prominent Chinese language models and the GPT series models to +derive professional performance insights regarding hallucination challenges. + +
+
+ comment: 13 Pages, submitted to ICDE2024 +
+
+
+
+
+ + ☆ Dataset for Stock Market Forecasting Based on Quantitative Analysis and + Qualitative Data + + +
+ The application of machine learning to finance has become a familiar approach, even more so in stock market forecasting. The stock market is highly volatile, and huge amounts of data are generated every minute globally. The extraction of effective intelligence from this data is of critical importance. However, combining numerical stock data with qualitative text data can be a challenging task. In this work, we accomplish this and provide an unprecedented, publicly available dataset with technical and fundamental data, together with sentiment gathered from news archives, TV news captions, radio transcripts, tweets, daily financial newspapers, and other sources. The text data entries used for sentiment extraction total more than 1.4 million. The dataset comprises daily entries from January 2018 to December 2022 for eight different companies and the Dow Jones Index as a whole. Holistic fundamental and technical data are provided in a training-ready form for model learning and deployment. The predictive power of deep learning models is strongly determined by the training data provided. This dataset will benefit research globally that incorporates qualitative intelligence for stock market forecasting. The dataset is made available at https://github.com/batking24/Huge-Stock-Dataset.
+
+
+
+
+ + ☆ Probabilistic Transformer: A Probabilistic Dependency Model for + Contextual Word Representation ACL2023 + + +
+ Syntactic structures used to play a vital role in natural language processing +(NLP), but since the deep learning revolution, NLP has been gradually dominated +by neural models that do not consider syntactic structures in their design. One +vastly successful class of neural models is transformers. When used as an +encoder, a transformer produces contextual representation of words in the input +sentence. In this work, we propose a new model of contextual word +representation, not from a neural perspective, but from a purely syntactic and +probabilistic perspective. Specifically, we design a conditional random field +that models discrete latent representations of all words in a sentence as well +as dependency arcs between them; and we use mean field variational inference +for approximate inference. Strikingly, we find that the computation graph of +our model resembles transformers, with correspondences between dependencies and +self-attention and between distributions over latent representations and +contextual embeddings of words. Experiments show that our model performs +competitively to transformers on small to medium sized datasets. We hope that +our work could help bridge the gap between traditional syntactic and +probabilistic approaches and cutting-edge neural approaches to NLP, and inspire +more linguistically-principled neural approaches in the future. + +
+
+ comment: Accepted to ACL2023 Findings +
+
+
+
+
+ + ☆ LongStory: Coherent, Complete and Length Controlled Long story + Generation + + +
+ A human author can write a story of any length without losing coherence, and can always bring it to a proper ending, an ability that current language models lack. In this work, we present LongStory for coherent, complete, and length-controlled long story generation. LongStory introduces two novel methodologies: (1) the long- and short-term contexts weight calibrator (CWC) and (2) long story structural positions (LSP). The CWC adjusts weights for long-term context Memory and short-term context Cheating, acknowledging their distinct roles. The LSP employs discourse tokens to convey the structural positions of a long story. Trained on three datasets with varied average story lengths, LongStory outperforms other baselines, including the strong story generator Plotmachine, in coherence, completeness, relevance, and repetitiveness. We also perform zero-shot tests on each dataset to assess the model's ability to predict outcomes beyond its training data, and we validate our methodology by comparing its performance with variants of our model.
+
+
+
+
+ + ☆ ChatGPT and Beyond: The Generative AI Revolution in Education + + +
+ The wide adoption and usage of generative artificial intelligence (AI) +models, particularly ChatGPT, has sparked a surge in research exploring their +potential applications in the educational landscape. This survey examines +academic literature published between November, 2022, and July, 2023, +specifically targeting high-impact research from Scopus-indexed Q1 and Q2 +journals. This survey delves into the practical applications and implications +of generative AI models across a diverse range of educational contexts. Through +a comprehensive and rigorous evaluation of recent academic literature, this +survey seeks to illuminate the evolving role of generative AI models, +particularly ChatGPT, in education. By shedding light on the potential +benefits, challenges, and emerging trends in this dynamic field, the survey +endeavors to contribute to the understanding of the nexus between artificial +intelligence and education. The findings of this review will empower educators, +researchers, and policymakers to make informed decisions about the integration +of AI technologies into learning environments. + +
+
+
+
+
+ + ☆ Benchmarking Large Language Model Volatility + + +
+ The impact of non-deterministic outputs from Large Language Models (LLMs) is +not well examined for financial text understanding tasks. Through a compelling +case study on investing in the US equity market via news sentiment analysis, we +uncover substantial variability in sentence-level sentiment classification +results, underscoring the innate volatility of LLM outputs. These uncertainties +cascade downstream, leading to more significant variations in portfolio +construction and return. While tweaking the temperature parameter in the +language model decoder presents a potential remedy, it comes at the expense of +stifled creativity. Similarly, while ensembling multiple outputs mitigates the +effect of volatile outputs, it demands a notable computational investment. This +work furnishes practitioners with invaluable insights for adeptly navigating +uncertainty in the integration of LLMs into financial decision-making, +particularly in scenarios dictated by non-deterministic information. + +
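A small sketch of the ensembling remedy mentioned above: call a non-deterministic classifier several times, majority-vote the labels, and report how often the runs agree. The `noisy_classifier` stand-in and its probabilities are purely illustrative, not from the study.

```python
import random
from collections import Counter
from typing import Callable, List

def ensemble_sentiment(classify: Callable[[str], str], text: str, n_runs: int = 5):
    """Query a non-deterministic classifier several times and majority-vote,
    also reporting how often the runs agree."""
    labels: List[str] = [classify(text) for _ in range(n_runs)]
    counts = Counter(labels)
    label, votes = counts.most_common(1)[0]
    return label, votes / n_runs

# Toy stand-in for a sampled LLM call (temperature > 0 makes real calls vary).
def noisy_classifier(text: str) -> str:
    return random.choices(["positive", "neutral", "negative"], [0.6, 0.3, 0.1])[0]

print(ensemble_sentiment(noisy_classifier, "Shares rallied after strong earnings."))
```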
+
+ comment: 7 pages, 2 figures, Workshop on AI Safety and Robustness In Finance, + ICAIF 2023 +
+
+
+
+
+ + ♻ ☆ Conditional Adapters: Parameter-efficient Transfer Learning with Fast + Inference NeurIPS + + +
+ We propose Conditional Adapter (CoDA), a parameter-efficient transfer +learning method that also improves inference efficiency. CoDA generalizes +beyond standard adapter approaches to enable a new way of balancing speed and +accuracy using conditional computation. Starting with an existing dense +pretrained model, CoDA adds sparse activation together with a small number of +new parameters and a light-weight training phase. Our experiments demonstrate +that the CoDA approach provides an unexpectedly efficient way to transfer +knowledge. Across a variety of language, vision, and speech tasks, CoDA +achieves a 2x to 8x inference speed-up compared to the state-of-the-art Adapter +approaches with moderate to no accuracy loss and the same parameter efficiency. + +
+
+ comment: NeurIPS camera ready version +
+
+
+
+
+ + ♻ ☆ Sequential Monte Carlo Steering of Large Language Models using + Probabilistic Programs + + +
+ Even after fine-tuning and reinforcement learning, large language models +(LLMs) can be difficult, if not impossible, to control reliably with prompts +alone. We propose a new inference-time approach to enforcing syntactic and +semantic constraints on the outputs of LLMs, called sequential Monte Carlo +(SMC) steering. The key idea is to specify language generation tasks as +posterior inference problems in a class of discrete probabilistic sequence +models, and replace standard decoding with sequential Monte Carlo inference. +For a computational cost similar to that of beam search, SMC can steer LLMs to +solve diverse tasks, including infilling, generation under syntactic +constraints, and prompt intersection. To facilitate experimentation with SMC +steering, we present a probabilistic programming library, LLaMPPL +(https://github.com/probcomp/hfppl), for concisely specifying new generation +tasks as language model probabilistic programs, and automating steering of +LLaMA-family Transformers. + +
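The following is a toy sequential Monte Carlo loop over token sequences, meant only to convey the propose/weight/resample pattern behind SMC steering; it is not the LLaMPPL API. The proposal and the constraint potential shown are illustrative assumptions.

```python
import math
import random

def smc_decode(propose, potential, n_particles=8, max_len=12):
    """Minimal sequential Monte Carlo over token sequences.

    propose(prefix) -> (token, logprob) samples the next token from a proposal.
    potential(prefix) -> weight in [0, 1] scoring how well a prefix satisfies
    the desired constraint (0 kills a particle).
    """
    particles = [([], 0.0) for _ in range(n_particles)]   # (tokens, log-weight)
    for _ in range(max_len):
        extended = []
        for tokens, logw in particles:
            tok, _ = propose(tokens)
            new_tokens = tokens + [tok]
            w = potential(new_tokens)
            extended.append((new_tokens, logw + math.log(w) if w > 0 else -math.inf))
        # Resample particles in proportion to their weights.
        weights = [math.exp(lw) if lw > -math.inf else 0.0 for _, lw in extended]
        total = sum(weights)
        if total > 0:
            extended = random.choices(extended, [w / total for w in weights], k=n_particles)
        particles = [(t, 0.0) for t, _ in extended]        # reset weights after resampling
    return max(particles, key=lambda p: potential(p[0]))[0]

# Toy usage: steer random word generation toward sequences that avoid the word "bad".
vocab = ["good", "bad", "fine", "."]
propose = lambda prefix: (random.choice(vocab), 0.0)
potential = lambda prefix: 0.0 if "bad" in prefix else 1.0
print(smc_decode(propose, potential))
```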
+
+ comment: Minor typo fixes +
+
+
+
+
+ + ♻ ☆ In-Context Impersonation Reveals Large Language Models' Strengths and + Biases NeurIPS 2023 + + +
+ In everyday conversations, humans can take on different roles and adapt their +vocabulary to their chosen roles. We explore whether LLMs can take on, that is +impersonate, different roles when they generate text in-context. We ask LLMs to +assume different personas before solving vision and language tasks. We do this +by prefixing the prompt with a persona that is associated either with a social +identity or domain expertise. In a multi-armed bandit task, we find that LLMs +pretending to be children of different ages recover human-like developmental +stages of exploration. In a language-based reasoning task, we find that LLMs +impersonating domain experts perform better than LLMs impersonating non-domain +experts. Finally, we test whether LLMs' impersonations are complementary to +visual information when describing different categories. We find that +impersonation can improve performance: an LLM prompted to be a bird expert +describes birds better than one prompted to be a car expert. However, +impersonation can also uncover LLMs' biases: an LLM prompted to be a man +describes cars better than one prompted to be a woman. These findings +demonstrate that LLMs are capable of taking on diverse roles and that this +in-context impersonation can be used to uncover their hidden strengths and +biases. + +
+
+ comment: Published in NeurIPS 2023 (Spotlight) +
+
+
+
+
+ + ♻ ☆ InferEM: Inferring the Speaker's Intention for Empathetic Dialogue + Generation + + +
+ Current approaches to empathetic response generation typically encode the entire dialogue history directly and feed the output into a decoder to generate friendly feedback. These methods focus on modelling contextual information but neglect the direct intention of the speaker. We argue that the last utterance in the dialogue empirically conveys the intention of the speaker. Consequently, we propose a novel model named InferEM for empathetic response generation. We separately encode the last utterance and fuse it with the entire dialogue through a multi-head attention based intention fusion module to capture the speaker's intention. Besides, we utilize previous utterances to predict the last utterance, simulating the human tendency to guess in advance what the interlocutor may say. To balance the optimization rates of utterance prediction and response generation, a multi-task learning strategy is designed for InferEM. Experimental results demonstrate the plausibility and validity of InferEM in improving empathetic expression.
+
+ comment: Accepted by the 45th Annual Meeting of the Cognitive Science Society + (CogSci 2023) +
+
+
+
+
+ + ♻ ☆ Mirror: A Universal Framework for Various Information Extraction Tasks EMNLP23 + + +
+ Sharing knowledge between information extraction tasks has always been a +challenge due to the diverse data formats and task variations. Meanwhile, this +divergence leads to information waste and increases difficulties in building +complex applications in real scenarios. Recent studies often formulate IE tasks +as a triplet extraction problem. However, such a paradigm does not support +multi-span and n-ary extraction, leading to weak versatility. To this end, we +reorganize IE problems into unified multi-slot tuples and propose a universal +framework for various IE tasks, namely Mirror. Specifically, we recast existing +IE tasks as a multi-span cyclic graph extraction problem and devise a +non-autoregressive graph decoding algorithm to extract all spans in a single +step. It is worth noting that this graph structure is incredibly versatile, and +it supports not only complex IE tasks, but also machine reading comprehension +and classification tasks. We manually construct a corpus containing 57 datasets +for model pretraining, and conduct experiments on 30 datasets across 8 +downstream tasks. The experimental results demonstrate that our model has +decent compatibility and outperforms or reaches competitive performance with +SOTA systems under few-shot and zero-shot settings. The code, model weights, +and pretraining corpus are available at https://github.com/Spico197/Mirror . + +
+
+ comment: Accepted to EMNLP23 main conference +
+
+
+
+
+ + ♻ ☆ AI-Augmented Surveys: Leveraging Large Language Models and Surveys for + Opinion Prediction + + +
+ Large language models (LLMs) that produce human-like responses have begun to +revolutionize research practices in the social sciences. This paper shows how +we can integrate LLMs and social surveys to accurately predict individual +responses to survey questions that were not asked before. We develop a novel +methodological framework to personalize LLMs by considering the meaning of +survey questions derived from their text, the latent beliefs of individuals +inferred from their response patterns, and the temporal contexts across +different survey periods through fine-tuning LLMs with survey data. Using the +General Social Survey from 1972 to 2021, we show that the fine-tuned model +based on Alpaca-7b can predict individual responses to survey questions that +are partially missing as well as entirely missing. The remarkable prediction +capabilities allow us to fill in missing trends with high confidence and +pinpoint when public attitudes changed, such as the rising support for same-sex +marriage. We discuss practical constraints, socio-demographic representation, +and ethical concerns regarding individual autonomy and privacy when using LLMs +for opinion prediction. This study demonstrates that LLMs and surveys can +mutually enhance each other's capabilities: LLMs broaden survey potential, +while surveys improve the alignment of LLMs. + +
+
+
+
+
+ + ♻ ☆ Unveiling Public Perceptions: Machine Learning-Based Sentiment Analysis + of COVID-19 Vaccines in India + + +
+ In March 2020, the World Health Organisation declared COVID-19 a global pandemic as it spread to nearly every country. By mid-2021, India had introduced three vaccines: Covishield, Covaxin, and Sputnik. To ensure successful vaccination in a densely populated country like India, understanding public sentiment was crucial. Social media, particularly Reddit with over 430 million users, played a vital role in disseminating information. This study employs data mining techniques to analyze Reddit data and gauge Indian sentiments towards COVID-19 vaccines. Using Python's TextBlob library, comments are annotated to assess general sentiments. Results show that most Reddit users in India expressed neutrality about vaccination, posing a challenge for the Indian government's efforts to vaccinate a significant portion of the population.
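A short sketch of the annotation step described above, assuming the `textblob` package is installed: TextBlob's polarity score is mapped to positive/neutral/negative labels. The example comments and the 0.05 margin are illustrative choices, not taken from the study.

```python
from textblob import TextBlob

def label_comment(text: str, margin: float = 0.05) -> str:
    """Map TextBlob polarity (-1..1) to a coarse sentiment label."""
    polarity = TextBlob(text).sentiment.polarity
    if polarity > margin:
        return "positive"
    if polarity < -margin:
        return "negative"
    return "neutral"

comments = [
    "Got my second dose today, feeling great!",
    "Still waiting for a slot, the portal keeps crashing.",
    "Covishield second dose is scheduled for next week.",
]
for c in comments:
    print(label_comment(c), "-", c)
```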
+
+
+
+
+ + ♻ ☆ Multimodal Document Analytics for Banking Process Automation + + +
+ Traditional banks face increasing competition from FinTechs in the rapidly evolving financial ecosystem. Raising operational efficiency is vital to address this challenge. Our study aims to improve the efficiency of document-intensive business processes in banking. To that end, we first review the landscape of business documents in the retail segment. Banking documents often contain text, layout, and visuals, suggesting that document analytics and process automation require more than plain natural language processing (NLP). To verify this and assess the incremental value of visual cues when processing business documents, we compare a recently proposed multimodal model called LayoutXLM to powerful text classifiers (e.g., BERT) and large language models (e.g., GPT) in a case study related to processing company register extracts. The results confirm that incorporating layout information in a model substantially increases its performance. Interestingly, we also observe that more than 75% of the best model performance (in terms of the F1 score) can be achieved with as little as 30% of the training data. This shows that the demand for labeled data to set up a multimodal model can be moderate, which simplifies real-world applications of multimodal document analytics. Our study also sheds light on more specific practices in the scope of calibrating a multimodal banking document classifier, including the need for fine-tuning. In sum, the paper contributes original empirical evidence on the effectiveness and efficiency of multimodal models for document processing in the banking business and offers practical guidance on how to unlock this potential in day-to-day operations.
+
+ comment: A Preprint +
+
+
+
+
+ + ♻ ☆ DocAsRef: An Empirical Study on Repurposing Reference-Based Summary + Quality Metrics Reference-Freely EMNLP 2023 + + +
+ Automated summary quality assessment falls into two categories: +reference-based and reference-free. Reference-based metrics, historically +deemed more accurate due to the additional information provided by +human-written references, are limited by their reliance on human input. In this +paper, we hypothesize that the comparison methodologies used by some +reference-based metrics to evaluate a system summary against its corresponding +reference can be effectively adapted to assess it against its source document, +thereby transforming these metrics into reference-free ones. Experimental +results support this hypothesis. After being repurposed reference-freely, the +zero-shot BERTScore using the pretrained DeBERTa-large-MNLI model of <0.5B +parameters consistently outperforms its original reference-based version across +various aspects on the SummEval and Newsroom datasets. It also excels in +comparison to most existing reference-free metrics and closely competes with +zero-shot summary evaluators based on GPT-3.5. + +
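A minimal sketch of the repurposing idea using the `bert-score` package: the summary is scored against its source document in place of a human-written reference. The example texts are illustrative; the paper's DeBERTa-large-MNLI backbone could be requested via the `model_type` argument if the installed version supports that checkpoint.

```python
# pip install bert-score
from bert_score import score

documents = ["The central bank raised interest rates by 25 basis points on Tuesday, "
             "citing persistent inflation, and signalled further hikes this year."]
summaries = ["The central bank hiked rates and hinted at more increases."]

# Repurposed reference-freely: score each summary against its *source document*
# instead of a reference summary. (Pass model_type="microsoft/deberta-large-mnli"
# to mirror the paper's backbone, if supported by your bert-score version.)
P, R, F1 = score(summaries, documents, lang="en", verbose=False)
print(f"precision={P.item():.3f} recall={R.item():.3f} f1={F1.item():.3f}")
```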
+
+ comment: Accepted into Findings of EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Autoregressive Language Models For Estimating the Entropy of Epic EHR + Audit Logs ML4H + + +
+ EHR audit logs are a highly granular stream of events that capture clinician activities, and they are a significant area of interest for research characterizing clinician workflow on the electronic health record (EHR). Existing techniques for measuring the complexity of workflow through EHR audit logs involve time- or frequency-based cross-sectional aggregations that are unable to capture the full complexity of an EHR session. We briefly evaluate the use of transformer-based tabular language models (tabular LMs) for measuring the entropy, or disorderedness, of action sequences within workflow, and we release the evaluated models publicly.
+
+ comment: Extended Abstract presented at Machine Learning for Health (ML4H) + symposium 2023, December 10th, 2023, New Orleans, United States, 10 pages +
+
+
+
+
+ + ♻ ☆ PlanBench: An Extensible Benchmark for Evaluating Large Language Models + on Planning and Reasoning about Change NeurIPS 2023 + + +
+ Generating plans of action, and reasoning about change have long been +considered a core competence of intelligent agents. It is thus no surprise that +evaluating the planning and reasoning capabilities of large language models +(LLMs) has become a hot topic of research. Most claims about LLM planning +capabilities are however based on common sense tasks-where it becomes hard to +tell whether LLMs are planning or merely retrieving from their vast world +knowledge. There is a strong need for systematic and extensible planning +benchmarks with sufficient diversity to evaluate whether LLMs have innate +planning capabilities. Motivated by this, we propose PlanBench, an extensible +benchmark suite based on the kinds of domains used in the automated planning +community, especially in the International Planning Competition, to test the +capabilities of LLMs in planning or reasoning about actions and change. +PlanBench provides sufficient diversity in both the task domains and the +specific planning capabilities. Our studies also show that on many critical +capabilities-including plan generation-LLM performance falls quite short, even +with the SOTA models. PlanBench can thus function as a useful marker of +progress of LLMs in planning and reasoning. + +
+
+ comment: NeurIPS 2023 Track on Datasets and Benchmarks +
+
+
+
+
+
+
+
+ + Information Retrieval 5 + +
+
+
+ + ☆ Data Augmentation for Sample Efficient and Robust Document Ranking + + +
+ Contextual ranking models have delivered impressive performance improvements +over classical models in the document ranking task. However, these highly +over-parameterized models tend to be data-hungry and require large amounts of +data even for fine-tuning. In this paper, we propose data-augmentation methods +for effective and robust ranking performance. One of the key benefits of using +data augmentation is in achieving sample efficiency or learning effectively +when we have only a small amount of training data. We propose supervised and +unsupervised data augmentation schemes by creating training data using parts of +the relevant documents in the query-document pairs. We then adapt a family of +contrastive losses for the document ranking task that can exploit the augmented +data to learn an effective ranking model. Our extensive experiments on subsets +of the MS MARCO and TREC-DL test sets show that data augmentation, along with +the ranking-adapted contrastive losses, results in performance improvements +under most dataset sizes. Apart from sample efficiency, we conclusively show +that data augmentation results in robust models when transferred to +out-of-domain benchmarks. Our performance improvements in in-domain and more +prominently in out-of-domain benchmarks show that augmentation regularizes the +ranking model and improves its robustness and generalization capability. + +
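A compact PyTorch sketch of an in-batch contrastive (InfoNCE-style) loss for query-passage pairs, the kind of ranking-adapted contrastive objective referred to above. The encoder outputs are simulated with random tensors; the temperature and dimensions are illustrative assumptions.

```python
import torch
import torch.nn.functional as F

def in_batch_contrastive_loss(q_emb, d_emb, temperature=0.05):
    """InfoNCE-style loss over a batch of (query, relevant-passage) pairs.
    Other passages in the batch (including augmented variants of relevant
    documents) act as negatives."""
    q = F.normalize(q_emb, dim=-1)
    d = F.normalize(d_emb, dim=-1)
    logits = q @ d.t() / temperature            # (batch, batch) similarity matrix
    targets = torch.arange(q.size(0))           # the i-th passage matches the i-th query
    return F.cross_entropy(logits, targets)

# Toy usage with random "encoder" outputs.
queries, passages = torch.randn(8, 256), torch.randn(8, 256)
print(in_batch_contrastive_loss(queries, passages))
```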
+
+
+
+
+ + ☆ Query-LIFE: Query-aware Language Image Fusion Embedding for E-Commerce + Relevance + + +
+ Relevance modules play a fundamental role in e-commerce search, as they are responsible for selecting relevant products from thousands of items based on user queries, thereby enhancing user experience and efficiency. The traditional approach models relevance based on product titles and queries, but the information in titles alone may be insufficient to describe products completely. A more general optimization approach is to further leverage product image information. In recent years, vision-language pre-training models have achieved impressive results in many scenarios by leveraging contrastive learning to map both textual and visual features into a joint embedding space. In e-commerce, a common practice is to fine-tune the pre-trained model on e-commerce data. However, the performance is sub-optimal because vision-language pre-training models lack alignment specifically designed for queries. In this paper, we propose a method called Query-LIFE (Query-aware Language Image Fusion Embedding) to address these challenges. Query-LIFE utilizes query-based multimodal fusion to effectively incorporate the image and title based on the product type. Additionally, it employs query-aware modal alignment to enhance the accuracy of the comprehensive representation of products. Furthermore, we design GenFilt, which utilizes the generation capability of large models to filter out false negative samples and further improve the overall performance of the contrastive learning task in the model. Experiments demonstrate that Query-LIFE outperforms existing baselines. We have conducted ablation studies and human evaluations to validate the effectiveness of each module within Query-LIFE. Moreover, Query-LIFE has been deployed on Miravia Search, improving both relevance and conversion efficiency.
+
+
+
+
+ + ♻ ☆ Typos-aware Bottlenecked Pre-Training for Robust Dense Retrieval SIGIR + + +
+ Current dense retrievers (DRs) are limited in their ability to effectively process misspelled queries, which constitute a significant portion of query traffic in commercial search engines. The main issue is that the pre-trained language model-based encoders used by DRs are typically trained and fine-tuned using clean, well-curated text data. Misspelled queries are typically not found in the data used for training these models, and thus misspelled queries observed at inference time are out-of-distribution compared to the data used for training and fine-tuning. Previous efforts to address this issue have focused on fine-tuning strategies, but their effectiveness on misspelled queries remains lower than that of pipelines that employ separate state-of-the-art spell-checking components. To address this challenge, we propose ToRoDer (TypOs-aware bottlenecked pre-training for RObust DEnse Retrieval), a novel re-training strategy for DRs that increases their robustness to misspelled queries while preserving their effectiveness in downstream retrieval tasks. ToRoDer utilizes an encoder-decoder architecture where the encoder takes misspelled text with masked tokens as input and outputs bottlenecked information to the decoder. The decoder then takes as input the bottlenecked embeddings, along with token embeddings of the original text with the misspelled tokens masked out. The pre-training task is to recover the masked tokens for both the encoder and decoder. Our extensive experimental results and detailed ablation studies show that DRs pre-trained with ToRoDer exhibit significantly higher effectiveness on misspelled queries, sensibly closing the gap with pipelines that use a separate, complex spell-checker component, while retaining their effectiveness on correctly spelled queries.
+
+ comment: 10 pages, accepted at SIGIR-AP +
+
+
+
+
+ + ♻ ☆ FLIP: Towards Fine-grained Alignment between ID-based Models and + Pretrained Language Models for CTR Prediction + + +
+ Click-through rate (CTR) prediction serves as a core function module in various personalized online services. Traditional ID-based models for CTR prediction take as inputs the one-hot encoded ID features of the tabular modality, which capture collaborative signals via feature interaction modeling. But one-hot encoding discards the semantic information contained in the original feature texts. Recently, the emergence of Pretrained Language Models (PLMs) has given rise to another paradigm, which takes as inputs sentences of the textual modality obtained through hard prompt templates and adopts PLMs to extract semantic knowledge. However, PLMs generally tokenize the input text into subword tokens and ignore field-wise collaborative signals. Therefore, these two lines of research focus on different characteristics of the same input data (i.e., the textual and tabular modalities), forming a distinct complementary relationship with each other. In this paper, we propose to conduct Fine-grained feature-level ALignment between ID-based Models and Pretrained Language Models (FLIP) for CTR prediction. We design a novel joint reconstruction pretraining task for both masked language and tabular modeling. Specifically, the masked data of one modality (i.e., tokens or features) has to be recovered with the help of the other modality, which establishes feature-level interaction and alignment via sufficient mutual information extraction between the two modalities. Moreover, we propose to jointly finetune the ID-based model and the PLM for downstream CTR prediction tasks, thus achieving superior performance by combining the advantages of both models. Extensive experiments on three real-world datasets demonstrate that FLIP outperforms SOTA baselines and is highly compatible with various ID-based models and PLMs.
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ A Topology-aware Analysis of Graph Collaborative Filtering + + +
+ The successful integration of graph neural networks into recommender systems +(RSs) has led to a novel paradigm in collaborative filtering (CF), graph +collaborative filtering (graph CF). By representing user-item data as an +undirected, bipartite graph, graph CF utilizes short- and long-range +connections to extract collaborative signals that yield more accurate user +preferences than traditional CF methods. Although the recent literature +highlights the efficacy of various algorithmic strategies in graph CF, the +impact of datasets and their topological features on recommendation performance +is yet to be studied. To fill this gap, we propose a topology-aware analysis of +graph CF. In this study, we (i) take some widely-adopted recommendation +datasets and use them to generate a large set of synthetic sub-datasets through +two state-of-the-art graph sampling methods, (ii) measure eleven of their +classical and topological characteristics, and (iii) estimate the accuracy +calculated on the generated sub-datasets considering four popular and recent +graph-based RSs (i.e., LightGCN, DGCF, UltraGCN, and SVD-GCN). Finally, the +investigation presents an explanatory framework that reveals the linear +relationships between characteristics and accuracy measures. The results, +statistically validated under different graph sampling settings, confirm the +existence of solid dependencies between topological characteristics and +accuracy in the graph-based recommendation, offering a new perspective on how +to interpret graph CF. + +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ GAIA: Zero-shot Talking Avatar Generation + + +
+ Zero-shot talking avatar generation aims at synthesizing natural talking +videos from speech and a single portrait image. Previous methods have relied on +domain-specific heuristics such as warping-based motion representation and 3D +Morphable Models, which limit the naturalness and diversity of the generated +avatars. In this work, we introduce GAIA (Generative AI for Avatar), which +eliminates the domain priors in talking avatar generation. In light of the +observation that the speech only drives the motion of the avatar while the +appearance of the avatar and the background typically remain the same +throughout the entire video, we divide our approach into two stages: 1) +disentangling each frame into motion and appearance representations; 2) +generating motion sequences conditioned on the speech and reference portrait +image. We collect a large-scale high-quality talking avatar dataset and train +the model on it with different scales (up to 2B parameters). Experimental +results verify the superiority, scalability, and flexibility of GAIA as 1) the +resulting model beats previous baseline models in terms of naturalness, +diversity, lip-sync quality, and visual quality; 2) the framework is scalable +since larger models yield better results; 3) it is general and enables +different applications like controllable talking avatar generation and +text-instructed avatar generation. + +
+
+ comment: Project page: https://microsoft.github.io/GAIA/ +
+
+
+
+
+ + ♻ ☆ Adding Conditional Control to Text-to-Image Diffusion Models + + +
+ We present ControlNet, a neural network architecture for adding spatial conditioning controls to large, pretrained text-to-image diffusion models. ControlNet locks the production-ready large diffusion models and reuses their deep and robust encoding layers, pretrained on billions of images, as a strong backbone for learning a diverse set of conditional controls. The neural architecture is connected with "zero convolutions" (zero-initialized convolution layers) that progressively grow the parameters from zero and ensure that no harmful noise can affect the finetuning. We test various conditioning controls (e.g., edges, depth, segmentation, human pose) with Stable Diffusion, using single or multiple conditions, with or without prompts. We show that the training of ControlNets is robust on small (<50k) and large (>1M) datasets. Extensive results show that ControlNet may facilitate a wider range of applications for controlling image diffusion models.
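A tiny PyTorch sketch of a zero convolution as described above: a 1x1 convolution whose weights and bias start at zero, so the control branch contributes nothing at the start of finetuning. This illustrates the initialization trick only; it is not taken from the ControlNet codebase.

```python
import torch
import torch.nn as nn

def zero_conv(channels_in: int, channels_out: int) -> nn.Conv2d:
    """1x1 convolution initialized to zero so the control branch initially
    contributes nothing and its influence grows during finetuning."""
    conv = nn.Conv2d(channels_in, channels_out, kernel_size=1)
    nn.init.zeros_(conv.weight)
    nn.init.zeros_(conv.bias)
    return conv

block_out = torch.randn(1, 64, 32, 32)          # frozen backbone feature map
control = torch.randn(1, 64, 32, 32)            # output of the trainable control branch
merged = block_out + zero_conv(64, 64)(control) # identical to block_out at initialization
print(torch.allclose(merged, block_out))        # True before any training step
```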
+
+ comment: Codes and Supplementary Material: + https://github.com/lllyasviel/ControlNet +
+
+
+
+
+ + ♻ ☆ WavJourney: Compositional Audio Creation with Large Language Models + + +
+ Despite breakthroughs in audio generation models, their capabilities are +often confined to domain-specific conditions such as speech transcriptions and +audio captions. However, real-world audio creation aims to generate harmonious +audio containing various elements such as speech, music, and sound effects with +controllable conditions, which is challenging to address using existing audio +generation systems. We present WavJourney, a novel framework that leverages +Large Language Models (LLMs) to connect various audio models for audio +creation. WavJourney allows users to create storytelling audio content with +diverse audio elements simply from textual descriptions. Specifically, given a +text instruction, WavJourney first prompts LLMs to generate an audio script +that serves as a structured semantic representation of audio elements. The +audio script is then converted into a computer program, where each line of the +program calls a task-specific audio generation model or computational operation +function. The computer program is then executed to obtain a compositional and +interpretable solution for audio creation. Experimental results suggest that +WavJourney is capable of synthesizing realistic audio aligned with +textually-described semantic, spatial and temporal conditions, achieving +state-of-the-art results on text-to-audio generation benchmarks. Additionally, +we introduce a new multi-genre story benchmark. Subjective evaluations +demonstrate the potential of WavJourney in crafting engaging storytelling audio +content from text. We further demonstrate that WavJourney can facilitate +human-machine co-creation in multi-round dialogues. To foster future research, +the code and synthesized audio are available at: +https://audio-agi.github.io/WavJourney_demopage/. + +
+
+ comment: GitHub: https://github.com/Audio-AGI/WavJourney +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 23 + +
+
+
+ + ☆ Localizing Lying in Llama: Understanding Instructed Dishonesty on + True-False Questions Through Prompting, Probing, and Patching + + +
+ Large language models (LLMs) demonstrate significant knowledge through their +outputs, though it is often unclear whether false outputs are due to a lack of +knowledge or dishonesty. In this paper, we investigate instructed dishonesty, +wherein we explicitly prompt LLaMA-2-70b-chat to lie. We perform prompt +engineering to find which prompts best induce lying behavior, and then use +mechanistic interpretability approaches to localize where in the network this +behavior occurs. Using linear probing and activation patching, we localize five +layers that appear especially important for lying. We then find just 46 +attention heads within these layers that enable us to causally intervene such +that the lying model instead answers honestly. We show that these interventions +work robustly across many prompts and dataset splits. Overall, our work +contributes a greater understanding of dishonesty in LLMs so that we may hope +to prevent it. + +
+
+ comment: 14 pages, 12 figures +
+
+
+
+
+ + ☆ Relevance feedback strategies for recall-oriented neural information + retrieval + + +
+ In a number of information retrieval applications (e.g., patent search, literature review, due diligence), preventing false negatives is more important than preventing false positives. However, approaches designed to reduce review effort (such as "technology assisted review") can create false negatives, since they are often based on active learning systems that exclude documents automatically based on user feedback. This research therefore proposes a more recall-oriented approach to reducing review effort: iteratively re-ranking the relevance rankings based on user feedback, also referred to as relevance feedback. In our proposed method, the relevance rankings are produced by a BERT-based dense-vector search, and the relevance feedback is based on cumulatively summing the queried and selected embeddings. Our results show that this method can reduce review effort by between 17.85% and 59.04% compared to a baseline approach with no feedback, given a fixed recall target.
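A minimal sketch of the cumulative-embedding relevance feedback described above: the query embedding plus the embeddings of documents marked relevant form a profile vector, and the corpus is re-ranked by cosine similarity to it. The NumPy arrays, dimensions, and function name are illustrative assumptions rather than the paper's code.

```python
import numpy as np

def feedback_rerank(query_emb, doc_embs, relevant_idx):
    """Re-rank by cosine similarity to the cumulative sum of the query embedding
    and the embeddings of documents the reviewer has marked relevant so far."""
    profile = query_emb + doc_embs[relevant_idx].sum(axis=0)
    profile = profile / np.linalg.norm(profile)
    docs = doc_embs / np.linalg.norm(doc_embs, axis=1, keepdims=True)
    scores = docs @ profile
    return np.argsort(-scores)                  # best-first ranking of all documents

# Toy usage with random embeddings for a query and a 100-document corpus.
rng = np.random.default_rng(1)
query, corpus = rng.normal(size=128), rng.normal(size=(100, 128))
print(feedback_rerank(query, corpus, relevant_idx=[3, 17])[:10])
```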
+
+
+
+
+ + ☆ Solving the Right Problem is Key for Translational NLP: A Case Study in + UMLS Vocabulary Insertion EMNLP 2023 + + +
+ As the immense opportunities enabled by large language models become more +apparent, NLP systems will be increasingly expected to excel in real-world +settings. However, in many instances, powerful models alone will not yield +translational NLP solutions, especially if the formulated problem is not well +aligned with the real-world task. In this work, we study the case of UMLS +vocabulary insertion, an important real-world task in which hundreds of +thousands of new terms, referred to as atoms, are added to the UMLS, one of the +most comprehensive open-source biomedical knowledge bases. Previous work aimed +to develop an automated NLP system to make this time-consuming, costly, and +error-prone task more efficient. Nevertheless, practical progress in this +direction has been difficult to achieve due to a problem formulation and +evaluation gap between research output and the real-world task. In order to +address this gap, we introduce a new formulation for UMLS vocabulary insertion +which mirrors the real-world task, datasets which faithfully represent it and +several strong baselines we developed through re-purposing existing solutions. +Additionally, we propose an effective rule-enhanced biomedical language model +which enables important new model behavior, outperforms all strong baselines +and provides measurable qualitative improvements to editors who carry out the +UVI task. We hope this case study provides insight into the considerable +importance of problem formulation for the success of translational NLP +solutions. + +
+
+ comment: EMNLP 2023 Findings; Code is available at + https://github.com/OSU-NLP-Group/UMLS-Vocabulary-Insertion +
+
+
+
+
+ + ☆ Multilingual self-supervised speech representations improve the speech + recognition of low-resource African languages with codeswitching EMNLP 2023 + + +
+ While many speakers of low-resource languages regularly code-switch between +their languages and other regional languages or English, datasets of +codeswitched speech are too small to train bespoke acoustic models from scratch +or do language model rescoring. Here we propose finetuning self-supervised +speech representations such as wav2vec 2.0 XLSR to recognize code-switched +data. We find that finetuning self-supervised multilingual representations and +augmenting them with n-gram language models trained from transcripts reduces +absolute word error rates by up to 20% compared to baselines of hybrid models +trained from scratch on code-switched data. Our findings suggest that in +circumstances with limited training data finetuning self-supervised +representations is a better performing and viable solution. + +
+
+ comment: 5 pages, 1 figure. Computational Approaches to Linguistic + Code-Switching, CALCS 2023 (co-located with EMNLP 2023) +
+
+
+
+
+ + ☆ Automatically Finding and Categorizing Replication Studies + + +
+ In many fields of experimental science, papers that failed to replicate +continue to be cited as a result of the poor discoverability of replication +studies. As a first step to creating a system that automatically finds +replication studies for a given paper, 334 replication studies and 344 +replicated studies were collected. Replication studies could be identified in +the dataset based on text content at a higher rate than chance (AUROC = 0.886). + Additionally, successful replication studies could be distinguished from +failed replication studies at a higher rate than chance (AUROC = 0.664). + +
+
+
+
+
+ + ☆ Detection of developmental language disorder in Cypriot Greek children + using a machine learning neural network algorithm + + +
+ Children with developmental language disorder (DLD) encounter difficulties in acquiring various language structures. Early identification and intervention are crucial to prevent negative long-term outcomes impacting the academic, social, and emotional development of children. This study aims to develop an automated method for the identification of DLD using artificial intelligence, specifically a neural network machine learning algorithm. This protocol is applied for the first time to Cypriot Greek children, a population generally considered under-researched in the context of DLD. The neural network model was trained using perceptual and production data elicited from children with DLD and healthy controls. The k-fold technique was used to cross-validate the algorithm. The performance of the model was evaluated using metrics such as accuracy, precision, recall, F1 score, and the ROC/AUC curve to assess its ability to make accurate predictions on a set of unseen data. The results demonstrated high classification values for all metrics (between 0.92 and 0.98), indicating the high accuracy of the neural model in classifying children with DLD. Additionally, a variable importance analysis revealed that the language production skills of children had a more significant impact on the performance of the model than perception skills. Neural networks represent powerful tools for detecting DLD, providing early and quick assessments of the disorder, and have the potential to improve clinical outcomes.
+
+ comment: 13 pages, 3 figures, journal article +
+
+
+
+
+ + ☆ nlpBDpatriots at BLP-2023 Task 2: A Transfer Learning Approach to Bangla + Sentiment Analysis + + +
+ In this paper, we discuss the nlpBDpatriots entry to the shared task on +Sentiment Analysis of Bangla Social Media Posts organized at the first workshop +on Bangla Language Processing (BLP) co-located with EMNLP. The main objective +of this task is to identify the polarity of social media content using a Bangla +dataset annotated with positive, neutral, and negative labels provided by the +shared task organizers. Our best system for this task is a transfer learning +approach with data augmentation which achieved a micro F1 score of 0.71. Our +best system ranked 12th among 30 teams that participated in the competition. + +
+
+
+
+
+ + ☆ nlpBDpatriots at BLP-2023 Task 1: A Two-Step Classification for Violence + Inciting Text Detection in Bangla + + +
+ In this paper, we discuss the nlpBDpatriots entry to the shared task on Violence Inciting Text Detection (VITD) organized as part of the first workshop on Bangla Language Processing (BLP) co-located with EMNLP. The aim of this task is to identify and classify violent threats that provoke further unlawful violent acts. Our best-performing approach for the task is a two-step classification using back-translation and multilinguality, which ranked 6th out of 27 teams with a macro F1 score of 0.74.
+
+
+
+
+ + ☆ Offensive Language Identification in Transliterated and Code-Mixed + Bangla + + +
+ Identifying offensive content in social media is vital for creating safe +online communities. Several recent studies have addressed this problem by +creating datasets for various languages. In this paper, we explore offensive +language identification in texts with transliterations and code-mixing, +linguistic phenomena common in multilingual societies, and a known challenge +for NLP systems. We introduce TB-OLID, a transliterated Bangla offensive +language dataset containing 5,000 manually annotated comments. We train and +fine-tune machine learning models on TB-OLID, and we evaluate their results on +this dataset. Our results show that English pre-trained transformer-based +models, such as fBERT and HateBERT achieve the best performance on this +dataset. + +
+
+
+
+
+ + ☆ E-CORE: Emotion Correlation Enhanced Empathetic Dialogue Generation + + +
+ Achieving empathy is a crucial step toward humanized dialogue systems. +Current approaches for empathetic dialogue generation mainly perceive an +emotional label to generate an empathetic response conditioned on it, which +simply treat emotions independently, but ignore the intrinsic emotion +correlation in dialogues, resulting in inaccurate emotion perception and +unsuitable response generation. In this paper, we propose a novel emotion +correlation enhanced empathetic dialogue generation framework, which +comprehensively realizes emotion correlation learning, utilization, and +supervising. Specifically, a multi-resolution emotion graph is devised to +capture context-based emotion interactions from different resolutions, further +modeling emotion correlation. Then we propose an emotion correlation enhanced +decoder, with a novel correlation-aware aggregation and soft/hard strategy, +respectively improving the emotion perception and response generation. +Experimental results on the benchmark dataset demonstrate the superiority of +our model in both empathetic perception and expression. + +
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ☆ Walking a Tightrope -- Evaluating Large Language Models in High-Risk + Domains EMNLP 2023 + + +
+ High-risk domains pose unique challenges that require language models to +provide accurate and safe responses. Despite the great success of large +language models (LLMs), such as ChatGPT and its variants, their performance in +high-risk domains remains unclear. Our study delves into an in-depth analysis +of the performance of instruction-tuned LLMs, focusing on factual accuracy and +safety adherence. To comprehensively assess the capabilities of LLMs, we +conduct experiments on six NLP datasets including question answering and +summarization tasks within two high-risk domains: legal and medical. Further +qualitative analysis highlights the existing limitations inherent in current +LLMs when evaluating in high-risk domains. This underscores the essential +nature of not only improving LLM capabilities but also prioritizing the +refinement of domain-specific metrics, and embracing a more human-centric +approach to enhance safety and factual reliability. Our findings advance the +field toward the concerns of properly evaluating LLMs in high-risk domains, +aiming to steer the adaptability of LLMs in fulfilling societal obligations and +aligning with forthcoming regulations, such as the EU AI Act. + +
+
+ comment: EMNLP 2023 Workshop on Benchmarking Generalisation in NLP (GenBench) +
+
+
+
+
+ + ☆ Vector-Quantized Prompt Learning for Paraphrase Generation EMNLP + + +
+ Deep generative modeling of natural languages has achieved many successes, such as producing fluent sentences and translating from one language into another. However, the development of generative modeling techniques for paraphrase generation still lags behind, largely due to the challenges of addressing the complex conflict between expression diversity and semantic preservation. This paper proposes to generate diverse and high-quality paraphrases by exploiting pre-trained models with instance-dependent prompts. To learn generalizable prompts, we assume that the number of abstract transforming patterns of paraphrase generation (governed by prompts) is finite and usually not large. Therefore, we present vector-quantized prompts as the cues to control the generation of pre-trained models. Extensive experiments demonstrate that the proposed method achieves new state-of-the-art results on three benchmark datasets, including Quora, Wikianswers, and MSCOCO. We will release all the code upon acceptance.
+
+ comment: EMNLP Findings, 2023 +
+
+
+
+
+ + ☆ Faster Minimum Bayes Risk Decoding with Confidence-based Pruning EMNLP 2023 + + +
+ Minimum Bayes risk (MBR) decoding outputs the hypothesis with the highest +expected utility over the model distribution for some utility function. It has +been shown to improve accuracy over beam search in conditional language +generation problems and especially neural machine translation, in both human +and automatic evaluations. However, the standard sampling-based algorithm for +MBR is substantially more computationally expensive than beam search, requiring +a large number of samples as well as a quadratic number of calls to the utility +function, limiting its applicability. We describe an algorithm for MBR which +gradually grows the number of samples used to estimate the utility while +pruning hypotheses that are unlikely to have the highest utility according to +confidence estimates obtained with bootstrap sampling. Our method requires +fewer samples and drastically reduces the number of calls to the utility +function compared to standard MBR while being statistically indistinguishable +in terms of accuracy. We demonstrate the effectiveness of our approach in +experiments on three language pairs, using chrF++ and COMET as +utility/evaluation metrics. + +
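The sketch below conveys the grow-and-prune pattern described above: the pseudo-reference sample grows in stages, and bootstrap estimates of each hypothesis's chance of having the highest expected utility prune unlikely candidates. It is a simplification under stated assumptions (hypotheses double as pseudo-references, and a token-overlap utility stands in for chrF++ or COMET), not the authors' algorithm verbatim.

```python
import numpy as np

def mbr_with_pruning(hyps, utility, sample_sizes=(8, 16, 32), alpha=0.1, n_boot=200, rng=None):
    """MBR decoding that grows the pseudo-reference sample and prunes hypotheses
    unlikely to win, using bootstrap confidence estimates (a simplified sketch)."""
    rng = rng or np.random.default_rng(0)
    alive = list(range(len(hyps)))
    best = 0
    for n in sample_sizes:
        sample = rng.choice(len(hyps), size=n, replace=True)   # pseudo-references
        # Utility of each surviving hypothesis against the sampled references.
        U = np.array([[utility(hyps[h], hyps[int(r)]) for r in sample] for h in alive])
        # Bootstrap the probability that each hypothesis has the highest mean utility.
        wins = np.zeros(len(alive))
        for _ in range(n_boot):
            idx = rng.integers(0, n, size=n)
            wins[int(np.argmax(U[:, idx].mean(axis=1)))] += 1
        probs = wins / n_boot
        best = alive[int(np.argmax(probs))]
        alive = [h for h, p in zip(alive, probs) if p >= alpha] or [best]
        if len(alive) == 1:
            break
    return hyps[best]

# Toy usage: token-overlap utility over a handful of candidate outputs.
hyps = ["the cat sat", "a cat sat", "the dog ran", "the cat sits"]
overlap = lambda a, b: len(set(a.split()) & set(b.split())) / len(set(a.split()) | set(b.split()))
print(mbr_with_pruning(hyps, overlap))
```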
+
+ comment: Updated from EMNLP 2023 version: typo fix, minor math notation + change, updated citation +
+
+
+
+
+ + ☆ Code Search Debiasing:Improve Search Results beyond Overall Ranking + Performance EMNLP 2023 + + +
+ Code search engine is an essential tool in software development. Many code +search methods have sprung up, focusing on the overall ranking performance of +code search. In this paper, we study code search from another perspective by +analyzing the bias of code search models. Biased code search engines provide +poor user experience, even though they show promising overall performance. Due +to different development conventions (e.g., prefer long queries or +abbreviations), some programmers will find the engine useful, while others may +find it hard to get desirable search results. To mitigate biases, we develop a +general debiasing framework that employs reranking to calibrate search results. +It can be easily plugged into existing engines and handle new code search +biases discovered in the future. Experiments show that our framework can +effectively reduce biases. Meanwhile, the overall ranking performance of code +search gets improved after debiasing. + +
+
+ comment: Accepted to Findings of EMNLP 2023. 11 pages +
+
+
+
+
+ + ♻ ☆ IRFL: Image Recognition of Figurative Language + + +
+ Figures of speech such as metaphors, similes, and idioms are integral parts +of human communication. They are ubiquitous in many forms of discourse, +allowing people to convey complex, abstract ideas and evoke emotion. As +figurative forms are often conveyed through multiple modalities (e.g., both +text and images), understanding multimodal figurative language is an important +AI challenge, weaving together profound vision, language, commonsense and +cultural knowledge. In this work, we develop the Image Recognition of +Figurative Language (IRFL) dataset. We leverage human annotation and an +automatic pipeline we created to generate a multimodal dataset, and introduce +two novel tasks as a benchmark for multimodal figurative language +understanding. We experimented with state-of-the-art vision and language models +and found that the best (22%) performed substantially worse than humans (97%). +We release our dataset, benchmark, and code, in hopes of driving the +development of models that can better understand figurative language. + +
+
+
+
+
+ + ♻ ☆ The Impact of Data Corruption on Named Entity Recognition for + Low-resourced Languages + + +
+ Data availability and quality are major challenges in natural language +processing for low-resourced languages. In particular, there is significantly +less data available than for higher-resourced languages. This data is also +often of low quality, rife with errors, invalid text or incorrect annotations. +Many prior works focus on dealing with these problems, either by generating +synthetic data, or filtering out low-quality parts of datasets. We instead +investigate these factors more deeply, by systematically measuring the effect +of data quantity and quality on the performance of pre-trained language models +in a low-resourced setting. Our results show that having fewer +completely-labelled sentences is significantly better than having more +sentences with missing labels; and that models can perform remarkably well with +only 10% of the training data. Importantly, these results are consistent across +ten low-resource languages, English, and four pre-trained models. + +
+
+
+
+
+ + ♻ ☆ Evaluating Large Language Models: A Comprehensive Survey + + +
+ Large language models (LLMs) have demonstrated remarkable capabilities across +a broad spectrum of tasks. They have attracted significant attention and been +deployed in numerous downstream applications. Nevertheless, akin to a +double-edged sword, LLMs also present potential risks. They could suffer from +private data leaks or yield inappropriate, harmful, or misleading content. +Additionally, the rapid progress of LLMs raises concerns about the potential +emergence of superintelligent systems without adequate safeguards. To +effectively capitalize on LLM capacities as well as ensure their safe and +beneficial development, it is critical to conduct a rigorous and comprehensive +evaluation of LLMs. + This survey endeavors to offer a panoramic perspective on the evaluation of +LLMs. We categorize the evaluation of LLMs into three major groups: knowledge +and capability evaluation, alignment evaluation and safety evaluation. In +addition to the comprehensive review on the evaluation methodologies and +benchmarks on these three aspects, we collate a compendium of evaluations +pertaining to LLMs' performance in specialized domains, and discuss the +construction of comprehensive evaluation platforms that cover LLM evaluations +on capabilities, alignment, safety, and applicability. + We hope that this comprehensive overview will stimulate further research +interests in the evaluation of LLMs, with the ultimate goal of making +evaluation serve as a cornerstone in guiding the responsible development of +LLMs. We envision that this will channel their evolution into a direction that +maximizes societal benefit while minimizing potential risks. A curated list of +related papers has been publicly available at +https://github.com/tjunlp-lab/Awesome-LLMs-Evaluation-Papers. + +
+
+ comment: 111 pages +
+
+
+
+
+ + ♻ ☆ Text2Cohort: Facilitating Intuitive Access to Biomedical Data with + Natural Language Cohort Discovery + + +
+ The Imaging Data Commons (IDC) is a cloud-based database that provides +researchers with open access to cancer imaging data, with the goal of +facilitating collaboration. However, cohort discovery within the IDC database +has a significant technical learning curve. Recently, large language models +(LLM) have demonstrated exceptional utility for natural language processing +tasks. We developed Text2Cohort, a LLM-powered toolkit to facilitate +user-friendly natural language cohort discovery in the IDC. Our method +translates user input into IDC queries using grounding techniques and returns +the query's response. We evaluate Text2Cohort on 50 natural language inputs, +from information extraction to cohort discovery. Our toolkit successfully +generated responses with an 88% accuracy and 0.94 F1 score. We demonstrate that +Text2Cohort can enable researchers to discover and curate cohorts on IDC with +high levels of accuracy using natural language in a more intuitive and +user-friendly way. + +
+
+ comment: 5 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Trainable Noise Model as an XAI evaluation method: application on Sobol + for remote sensing image segmentation + + +
+ eXplainable Artificial Intelligence (XAI) has emerged as an essential +requirement when dealing with mission-critical applications, ensuring +transparency and interpretability of the employed black box AI models. The +significance of XAI spans various domains, from healthcare to finance, where +understanding the decision-making process of deep learning algorithms is +essential. Most AI-based computer vision models are often black boxes; hence, +providing explainability of deep neural networks in image processing is crucial +for their wide adoption and deployment in medical image analysis, autonomous +driving, and remote sensing applications. Recently, several XAI methods for +image classification tasks have been introduced. On the contrary, image +segmentation has received comparatively less attention in the context of +explainability, although it is a fundamental task in computer vision +applications, especially in remote sensing. Only some research proposes +gradient-based XAI algorithms for image segmentation. This paper adapts the +recent gradient-free Sobol XAI method for semantic segmentation. To measure the +performance of the Sobol method for segmentation, we propose a quantitative XAI +evaluation method based on a learnable noise model. The main objective of this +model is to induce noise on the explanation maps, where higher induced noise +signifies low accuracy and vice versa. A benchmark analysis is conducted to +evaluate and compare performance of three XAI methods, including Seg-Grad-CAM, +Seg-Grad-CAM++ and Seg-Sobol using the proposed noise-based evaluation +technique. This constitutes the first attempt to run and evaluate XAI methods +using high-resolution satellite images. + +
+
+
+
+
+ + ♻ ☆ OffMix-3L: A Novel Code-Mixed Dataset in Bangla-English-Hindi for + Offensive Language Identification + + +
+ Code-mixing is a well-studied linguistic phenomenon when two or more +languages are mixed in text or speech. Several works have been conducted on +building datasets and performing downstream NLP tasks on code-mixed data. +Although it is not uncommon to observe code-mixing of three or more languages, +most available datasets in this domain contain code-mixed data from only two +languages. In this paper, we introduce OffMix-3L, a novel offensive language +identification dataset containing code-mixed data from three different +languages. We experiment with several models on this dataset and observe that +BanglishBERT outperforms other transformer-based models and GPT-3.5. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2310.18023 +
+
+
+
+
+ + ♻ ☆ GRDD: A Dataset for Greek Dialectal NLP + + +
+ In this paper, we present a dataset for the computational study of a number +of Modern Greek dialects. It consists of raw text data from four dialects of +Modern Greek: Cretan, Pontic, Northern Greek and Cypriot Greek. The dataset is +of considerable size, albeit imbalanced, and presents the first attempt to +create large-scale dialectal resources of this type for Modern Greek dialects. +We then use the dataset to perform dialect identification. We experiment with +traditional ML algorithms, as well as simple DL architectures. The results show +very good performance on the task, potentially revealing that the dialects in +question have distinct enough characteristics allowing even simple ML models to +perform well on the task. Error analysis is performed for the top-performing +algorithms, showing that in a number of cases the errors are due to insufficient +dataset cleaning. + +
+
+
+
+
+ + ♻ ☆ Semantic Parsing by Large Language Models for Intricate Updating + Strategies of Zero-Shot Dialogue State Tracking EMNLP 2023 + + +
+ Zero-shot Dialogue State Tracking (DST) addresses the challenge of acquiring +and annotating task-oriented dialogues, which can be time-consuming and costly. +However, DST extends beyond simple slot-filling and requires effective updating +strategies for tracking dialogue state as conversations progress. In this +paper, we propose ParsingDST, a new In-Context Learning (ICL) method, to +introduce additional intricate updating strategies in zero-shot DST. Our +approach reformulates the DST task by leveraging powerful Large Language Models +(LLMs) and translating the original dialogue text to JSON through semantic +parsing as an intermediate state. We also design a novel framework that +includes more modules to ensure the effectiveness of updating strategies in the +text-to-JSON process. Experimental results demonstrate that our approach +outperforms existing zero-shot DST methods on MultiWOZ, exhibiting significant +improvements in Joint Goal Accuracy (JGA) and slot accuracy compared to +existing ICL methods. Our code has been released. + +
+
+ comment: Accepted to the Findings of EMNLP 2023 (Short Paper) +
+
+
+
+
+ + ♻ ☆ Evaluating the Instruction-Following Robustness of Large Language Models + to Prompt Injection + + +
+ Large Language Models (LLMs) have demonstrated exceptional proficiency in +instruction-following, becoming increasingly crucial across various +applications. However, this capability brings with it the risk of prompt +injection attacks, where attackers inject instructions into LLMs' input to +elicit undesirable actions or content. Understanding the robustness of LLMs +against such attacks is vital for their safe implementation. In this work, we +establish a benchmark to evaluate the robustness of instruction-following LLMs +against prompt injection attacks. Our objective is to determine the extent to +which LLMs can be influenced by injected instructions and their ability to +differentiate between these injected and original target instructions. Through +extensive experiments with leading instruction-following LLMs, we uncover +significant vulnerabilities in their robustness to such attacks. Our results +indicate that some models are overly tuned to follow any embedded instructions +in the prompt, overly focusing on the latter parts of the prompt without fully +grasping the entire context. By contrast, models with a better grasp of the +context and instruction-following capabilities will potentially be more +susceptible to compromise by injected instructions. This underscores the need +to shift the focus from merely enhancing LLMs' instruction-following +capabilities to improving their overall comprehension of prompts and +discernment of instructions that are appropriate to follow. We hope our +in-depth analysis offers insights into the underlying causes of these +vulnerabilities, aiding in the development of future solutions. Code and data +are available at +https://github.com/Leezekun/instruction-following-robustness-eval + +
+
+ comment: The data and code can be found at + https://github.com/Leezekun/instruction-following-robustness-eval +
+
+
+
+
+
+
+
+ + Information Retrieval 4 + +
+
+
+ + ☆ Hide Your Model: A Parameter Transmission-free Federated Recommender + System + + +
+ With the growing concerns regarding user data privacy, Federated Recommender +System (FedRec) has garnered significant attention recently due to its +privacy-preserving capabilities. Existing FedRecs generally adhere to a +learning protocol in which a central server shares a global recommendation +model with clients, and participants achieve collaborative learning by +frequently communicating the model's public parameters. Nevertheless, this +learning framework has two drawbacks that limit its practical usability: (1) It +necessitates a globally shared recommendation model; however, in real-world +scenarios, information related to the recommender model, including its +algorithm and parameters, constitutes the platforms' intellectual property. +Hence, service providers are unlikely to release such information actively. (2) +The communication costs of model parameter transmission are expensive since the +model parameters are usually high-dimensional matrices. As the model size +increases, the communication burden becomes the bottleneck for such +traditional FedRecs. + Given the above limitations, this paper introduces a novel parameter +transmission-free federated recommendation framework that balances the +protection between users' data privacy and platforms' model privacy, namely +PTF-FedRec. Specifically, participants in PTF-FedRec collaboratively exchange +knowledge by sharing their predictions within a privacy-preserving mechanism. +In this way, the central server can learn a recommender model without +disclosing its model parameters or accessing clients' raw data, preserving both +the server's model privacy and users' data privacy. Besides, since clients and +the central server only need to communicate prediction scores, which are just a +few real numbers, the overhead is significantly reduced compared to traditional +FedRecs. + +
+
+
+
+
+ + ♻ ☆ Text2Cohort: Facilitating Intuitive Access to Biomedical Data with + Natural Language Cohort Discovery + + +
+ The Imaging Data Commons (IDC) is a cloud-based database that provides +researchers with open access to cancer imaging data, with the goal of +facilitating collaboration. However, cohort discovery within the IDC database +has a significant technical learning curve. Recently, large language models +(LLM) have demonstrated exceptional utility for natural language processing +tasks. We developed Text2Cohort, a LLM-powered toolkit to facilitate +user-friendly natural language cohort discovery in the IDC. Our method +translates user input into IDC queries using grounding techniques and returns +the query's response. We evaluate Text2Cohort on 50 natural language inputs, +from information extraction to cohort discovery. Our toolkit successfully +generated responses with an 88% accuracy and 0.94 F1 score. We demonstrate that +Text2Cohort can enable researchers to discover and curate cohorts on IDC with +high levels of accuracy using natural language in a more intuitive and +user-friendly way. + +
+
+ comment: 5 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ LFG: A Generative Network for Real-Time Recommendation + + +
+ Recommender systems are essential information technologies today, and +recommendation algorithms combined with deep learning have become a research +hotspot in this field. The recommendation model known as LFM (Latent Factor +Model), which captures latent features through matrix factorization and +gradient descent to fit user preferences, has given rise to various +recommendation algorithms that bring new improvements in recommendation +accuracy. However, collaborative filtering recommendation models based on LFM +lack flexibility and have shortcomings for real-time recommendations, as they +need to redo the matrix factorization and retrain using gradient descent when +new users arrive. In response to this, this paper innovatively proposes a +Latent Factor Generator (LFG) network, and takes movie recommendation as its +research theme. The LFG dynamically generates user latent factors through deep +neural networks without the need for re-factorization or retraining. Experimental +results indicate that the LFG recommendation model outperforms traditional +matrix factorization algorithms in recommendation accuracy, providing an +effective solution to the challenges of real-time recommendations with LFM. + +
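+ A minimal sketch of the idea of generating user latent factors with a neural network instead of re-running matrix factorization for new users; the feature dimensions and MLP architecture below are illustrative assumptions, not the paper's LFG implementation.
+ import torch
+ import torch.nn as nn
+
+ class LatentFactorGenerator(nn.Module):
+     """Toy sketch: produce a user's latent factor vector from user features
+     with an MLP, instead of looking it up in a factorized user matrix."""
+     def __init__(self, n_user_features, n_factors=32):
+         super().__init__()
+         self.net = nn.Sequential(
+             nn.Linear(n_user_features, 64), nn.ReLU(),
+             nn.Linear(64, n_factors),
+         )
+
+     def forward(self, user_features, item_factors):
+         user_factors = self.net(user_features)       # (batch, n_factors)
+         return user_factors @ item_factors.T         # predicted scores (batch, n_items)
+
+ # a new user can be scored immediately, without re-factorization or retraining
+ model = LatentFactorGenerator(n_user_features=10)
+ item_factors = torch.randn(100, 32)                  # e.g. taken from a trained LFM
+ new_user = torch.randn(1, 10)
+ scores = model(new_user, item_factors)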
+
+ comment: 9 pages, 1 figure, 4 tables. Source code would be uploaded to github + soon +
+
+
+
+
+ + ♻ ☆ Intent Contrastive Learning with Cross Subsequences for Sequential + Recommendation WSDM2024 + + +
+ The user purchase behaviors are mainly influenced by their intentions (e.g., +buying clothes for decoration, buying brushes for painting, etc.). Modeling a +user's latent intention can significantly improve the performance of +recommendations. Previous works model users' intentions by considering the +predefined label in auxiliary information or introducing stochastic data +augmentation to learn purposes in the latent space. However, the auxiliary +information is sparse and not always available for recommender systems, and +introducing stochastic data augmentation may introduce noise and thus change +the intentions hidden in the sequence. Therefore, leveraging user intentions +for sequential recommendation (SR) can be challenging because they are +frequently varied and unobserved. In this paper, Intent contrastive learning +with Cross Subsequences for sequential Recommendation (ICSRec) is proposed to +model users' latent intentions. Specifically, ICSRec first segments a user's +sequential behaviors into multiple subsequences by using a dynamic sliding +operation and takes these subsequences into the encoder to generate the +representations for the user's intentions. To tackle the problem of no explicit +labels for purposes, ICSRec assumes different subsequences with the same target +item may represent the same intention and proposes a coarse-grain intent +contrastive learning to push these subsequences closer. Then, fine-grain intent +contrastive learning is introduced to capture the fine-grain intentions of +subsequences in sequential behaviors. Extensive experiments conducted on four +real-world datasets demonstrate the superior performance of the proposed ICSRec +model compared with baseline methods. + +
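+ A toy sketch of the subsequence-splitting step described above, assuming a simple prefix-based sliding operation; the exact dynamic sliding rule used by ICSRec may differ.
+ def sliding_subsequences(sequence, min_len=3):
+     """Split one user's behavior sequence into multiple (prefix, target) pairs
+     with a sliding operation; subsequences ending in the same target item can
+     then be treated as sharing a latent intention."""
+     subsequences = []
+     for end in range(min_len, len(sequence) + 1):
+         prefix, target = sequence[:end - 1], sequence[end - 1]
+         subsequences.append((prefix, target))
+     return subsequences
+
+ pairs = sliding_subsequences([10, 42, 7, 42, 99])   # item ids are illustrative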
+
+ comment: 10pages, 5figures, WSDM2024. arXiv admin note: text overlap with + arXiv:2304.07763 +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ☆ Weakly-Supervised Audio-Visual Segmentation + + +
+ Audio-visual segmentation is a challenging task that aims to predict +pixel-level masks for sound sources in a video. Previous work applied a +comprehensive manually designed architecture with countless pixel-wise accurate +masks as supervision. However, these pixel-level masks are expensive and not +available in all cases. In this work, we aim to simplify the supervision as the +instance-level annotation, i.e., weakly-supervised audio-visual segmentation. +We present a novel Weakly-Supervised Audio-Visual Segmentation framework, +namely WS-AVS, that can learn multi-scale audio-visual alignment with +multi-scale multiple-instance contrastive learning for audio-visual +segmentation. Extensive experiments on AVSBench demonstrate the effectiveness +of our WS-AVS in the weakly-supervised audio-visual segmentation of +single-source and multi-source scenarios. + +
+
+
+
+
+ + ☆ Incorporating granularity bias as the margin into contrastive loss for + video captioning + + +
+ Video captioning models easily suffer from the long-tail distribution of phrases, +which makes captioning models prone to generate vague sentences instead of +accurate ones. However, existing debiasing strategies tend to export external +knowledge to build dependency trees of words or refine frequency distribution +by complex losses and extra input features, which lack interpretability and are +hard to train. To mitigate the impact of granularity bias on the model, we +introduce a statistics-based bias extractor. This extractor quantifies the +information content within sentences and videos, providing an estimate of the +likelihood that a video-sentence pair is affected by granularity bias. +Furthermore, with the growing trend of integrating contrastive learning methods +into video captioning tasks, we use a bidirectional triplet loss to get more +negative samples in a batch. Subsequently, we incorporate the margin score into +the contrastive learning loss, establishing distinct training objectives for +head and tail sentences. This approach facilitates the model's training +effectiveness on tail samples. Our simple yet effective loss, incorporating +granularity bias, is referred to as the Margin-Contrastive Loss (GMC Loss). The +proposed model demonstrates state-of-the-art performance on MSRVTT with a CIDEr +of 57.17, and MSVD, where CIDEr reaches up to 138.68. + +
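+ A hedged sketch of how a per-pair margin can be folded into a bidirectional contrastive loss, assuming normalized video/text embeddings and illustrative margin and temperature values; the statistical bias extractor that would produce the margins is not shown, and the formulation may differ from the paper's GMC Loss.
+ import torch
+ import torch.nn.functional as F
+
+ def margin_contrastive_loss(video_emb, text_emb, margin, temperature=0.07):
+     """InfoNCE-style loss where a per-pair margin (e.g. derived from a
+     granularity-bias estimate) is subtracted from the positive logit, so
+     low-bias (tail) pairs face a stricter training objective."""
+     logits = video_emb @ text_emb.T / temperature          # (B, B) similarity matrix
+     logits = logits - torch.diag(margin / temperature)     # penalize only the positives
+     targets = torch.arange(logits.size(0))
+     return (F.cross_entropy(logits, targets) + F.cross_entropy(logits.T, targets)) / 2
+
+ B, d = 8, 128
+ v = F.normalize(torch.randn(B, d), dim=-1)
+ t = F.normalize(torch.randn(B, d), dim=-1)
+ loss = margin_contrastive_loss(v, t, margin=torch.rand(B) * 0.1)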
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Vision-Language Instruction Tuning: A Review and Analysis + + +
+ Instruction tuning is a crucial supervised training phase in Large Language +Models (LLMs), aiming to enhance the LLM's ability to generalize instruction +execution and adapt to user preferences. With the increasing integration of +multi-modal data into LLMs, there is growing interest in Vision-Language +Instruction Tuning (VLIT), which presents more complex characteristics compared +to pure text instruction tuning. In this paper, we systematically review the +latest VLIT settings and corresponding datasets in multi-modal LLMs and provide +insights into the intrinsic motivations behind their design. For the first +time, we offer a detailed multi-perspective categorization for existing VLIT +datasets and identify the characteristics that high-quality VLIT data should +possess. By incorporating these characteristics as guiding principles into the +existing VLIT data construction process, we conduct extensive experiments and +verify their positive impact on the performance of tuned multi-modal LLMs. +Furthermore, we discuss the current challenges and future research directions +of VLIT, providing insights for the continuous development of this field. The +code and dataset related to this paper have been open-sourced at +https://github.com/palchenli/VL-Instruction-Tuning. + +
+
+ comment: 34 pages, 6 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 50 + +
+
+
+ + ☆ One Pass Streaming Algorithm for Super Long Token Attention + Approximation in Sublinear Space + + +
+ Deploying Large Language Models (LLMs) in streaming applications that involve +long contexts, particularly for extended dialogues and text analysis, is of +paramount importance but presents two significant challenges. Firstly, the +memory consumption is substantial during the decoding phase due to the caching +of Key and Value states (KV) of previous tokens. Secondly, attention +computation is time-consuming with a time complexity of $O(n^2)$ for the +generation of each token. At the recent OpenAI DevDay (Nov 6, 2023), OpenAI +released a new model that is able to support a 128K-long document. In this +paper, we focus on the memory-efficiency issue when the context length $n$ is much +greater than 128K ($n \gg 2^d$). Considering a single-layer self-attention with +Query, Key, and Value matrices $Q, K, V \in \mathbb{R}^{n \times d}$, the +polynomial method approximates the attention output $T \in \mathbb{R}^{n \times +d}$. It accomplishes this by constructing $U_1, U_2 \in \mathbb{R}^{n \times +t}$ to expedite attention ${\sf Attn}(Q, K, V)$ computation within $n^{1+o(1)}$ +time executions. Despite this, storing the Key and Value matrices $K, V \in +\mathbb{R}^{n \times d}$ still necessitates $O( n d)$ space, leading to +significant memory usage. In response to these challenges, we introduce a new +algorithm that reads the data in a single streaming pass. This +method employs sublinear space $o(n)$ to store three sketch matrices, +alleviating the need for exact $K, V$ storage. Notably, our algorithm exhibits +exceptional memory-efficient performance with super-long tokens. As the token +length $n$ increases, our error guarantee diminishes while the memory usage +remains nearly constant. This unique attribute underscores the potential of our +technique in efficiently handling LLMs in streaming applications. + +
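+ The following sketch does not reproduce the paper's sketch-matrix algorithm; under the assumption of a generic feature map, it only illustrates how a one-pass streaming attention variant can avoid caching the full $K, V$ matrices by keeping running sums whose size is independent of the token length $n$.
+ import numpy as np
+
+ def streaming_linear_attention(tokens_q, tokens_k, tokens_v, feature_map=np.exp):
+     """One-pass, memory-light attention variant: instead of caching all K, V
+     (O(n d) memory), keep running sums S (d' x d_v) and z (d') that are updated
+     per token. This is generic linear attention, shown only to illustrate the
+     streaming / sublinear-memory idea, not the paper's sketching algorithm."""
+     d_feat = feature_map(tokens_k[0]).shape[0]
+     d_v = tokens_v.shape[1]
+     S = np.zeros((d_feat, d_v))
+     z = np.zeros(d_feat)
+     outputs = []
+     for q, k, v in zip(tokens_q, tokens_k, tokens_v):   # causal, single pass
+         phi_k = feature_map(k)
+         S += np.outer(phi_k, v)
+         z += phi_k
+         phi_q = feature_map(q)
+         outputs.append(phi_q @ S / (phi_q @ z + 1e-9))
+     return np.stack(outputs)
+
+ n, d = 1000, 16
+ Q = np.random.randn(n, d) * 0.1
+ out = streaming_linear_attention(Q, Q, np.random.randn(n, d))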
+
+
+
+
+ + ☆ Calibrated Language Models Must Hallucinate + + +
+ Recent language models have a mysterious tendency to generate false but +plausible-sounding text. Such "hallucinations" are an obstacle to the usability +of language-based AI systems and can harm people who rely upon their outputs. +This work shows that there is an inherent statistical reason that +pretrained language models hallucinate certain types of facts, having nothing +to do with the transformer LM architecture or data quality. For "arbitrary" +facts whose veracity cannot be determined from the training data, we show that +hallucination is necessary for language models that satisfy a statistical +calibration condition appropriate for generative language models. Specifically, +if the maximum probability of any fact is bounded, we show that the probability +of generating a hallucination is close to the fraction of facts that occur +exactly once in the training data (a "Good-Turing" estimate), even assuming +ideal training data without errors. + One conclusion is that models pretrained to be sufficiently good predictors +(i.e., calibrated) may require post-training to mitigate hallucinations on the +type of arbitrary facts that tend to appear once in the training set. However, +our analysis also suggests that there is no statistical reason that pretraining +will lead to hallucination on facts that tend to appear more than once in the +training data (like references to publications such as articles and books, +whose hallucinations have been particularly notable and problematic) or on +systematic facts (like arithmetic calculations). Therefore, different +architectures and learning algorithms may mitigate these latter types of +hallucinations. + +
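+ A small worked example of the "Good-Turing" quantity referenced above: the fraction of facts that occur exactly once in the training data, which the abstract ties to the hallucination rate of a calibrated model. The toy "facts" below are invented purely for illustration.
+ from collections import Counter
+
+ def singleton_fraction(facts):
+     """Fraction of fact occurrences that are singletons (appear exactly once),
+     i.e. the Good-Turing-style estimate the abstract refers to."""
+     counts = Counter(facts)
+     n_singletons = sum(1 for c in counts.values() if c == 1)
+     return n_singletons / len(facts)
+
+ facts = ["a cites b", "a cites b", "c born 1970", "c born 1970",
+          "d lives in e", "f wrote g"]
+ print(singleton_fraction(facts))   # 2 singletons out of 6 occurrences -> ~0.33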
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ GPT Struct Me: Probing GPT Models on Narrative Entity Extraction + + +
+ The importance of systems that can extract structured information from +textual data becomes increasingly pronounced given the ever-increasing volume +of text produced on a daily basis. Having a system that can effectively extract +such information in an interoperable manner would be an asset for several +domains, be it finance, health, or legal. Recent developments in natural +language processing led to the production of powerful language models that can, +to some degree, mimic human intelligence. Such effectiveness raises a pertinent +question: Can these models be leveraged for the extraction of structured +information? In this work, we address this question by evaluating the +capabilities of two state-of-the-art language models -- GPT-3 and GPT-3.5, +commonly known as ChatGPT -- in the extraction of narrative entities, namely +events, participants, and temporal expressions. This study is conducted on the +Text2Story Lusa dataset, a collection of 119 Portuguese news articles whose +annotation framework includes a set of entity structures along with several +tags and attribute values. We first select the best prompt template through an +ablation study over prompt components that provide varying degrees of +information on a subset of documents of the dataset. Subsequently, we use the +best templates to evaluate the effectiveness of the models on the remaining +documents. The results obtained indicate that GPT models are competitive with +out-of-the-box baseline systems, presenting an all-in-one alternative for +practitioners with limited resources. By studying the strengths and limitations +of these models in the context of information extraction, we offer insights +that can guide future improvements and avenues to explore in this field. + +
+
+
+
+
+ + ☆ Data-Efficient Alignment of Large Language Models with Human Feedback + Through Natural Language NeurIPS 2023 + + +
+ Learning from human feedback is a prominent technique to align the output of +large language models (LLMs) with human expectations. Reinforcement learning +from human feedback (RLHF) leverages human preference signals that are in the +form of ranking of response pairs to perform this alignment. However, human +preference on LLM outputs can come in much richer forms including natural +language, which may provide detailed feedback on strengths and weaknesses of a +given response. In this work we investigate data efficiency of modeling human +feedback that is in natural language. Specifically, we fine-tune an open-source +LLM, e.g., Falcon-40B-Instruct, on a relatively small amount (1000 records or +even less) of human feedback in natural language in the form of critiques and +revisions of responses. We show that this model is able to improve the quality +of responses from even some of the strongest LLMs such as ChatGPT, BARD, and +Vicuna, through critique and revision of those responses. For instance, through +one iteration of revision of ChatGPT responses, the revised responses have +56.6% win rate over the original ones, and this win rate can be further +improved to 65.9% after applying the revision for five iterations. + +
+
+ comment: Accepted by Workshop on Instruction Tuning and Instruction Following + at NeurIPS 2023, Submitted to AAAI 2024 +
+
+
+
+
+ + ☆ CMed-GPT: Prompt Tuning for Entity-Aware Chinese Medical Dialogue + Generation + + +
+ Medical dialogue generation relies on natural language generation techniques +to enable online medical consultations. Recently, the widespread adoption of +large-scale models in the field of natural language processing has facilitated +rapid advancements in this technology. Existing medical dialogue models are +mostly based on BERT and pre-trained on English corpora, but there is a lack of +high-performing models on the task of Chinese medical dialogue generation. To +solve the above problem, this paper proposes CMed-GPT, which is the GPT +pre-training language model based on Chinese medical domain text. The model is +available in two versions, namely, base and large, with corresponding +perplexity values of 8.64 and 8.01. Additionally, we incorporate lexical and +entity embeddings into the dialogue text in a uniform manner to meet the +requirements of downstream dialogue generation tasks. By applying both +fine-tuning and p-tuning to CMed-GPT, we lowered the PPL from 8.44 to 7.35. +This study not only confirms the exceptional performance of the CMed-GPT model +in generating Chinese biomedical text but also highlights the advantages of +p-tuning over traditional fine-tuning with prefix prompts. Furthermore, we +validate the significance of incorporating external information in medical +dialogue generation, which enhances the quality of dialogue generation. + +
+
+
+
+
+ + ☆ Machine Translation for Ge'ez Language + + +
+ Machine translation (MT) for low-resource languages such as Ge'ez, an ancient +language that is no longer spoken in daily life, faces challenges such as +out-of-vocabulary words, domain mismatches, and lack of sufficient labeled +training data. In this work, we explore various methods to improve Ge'ez MT, +including transfer-learning from related languages, optimizing shared +vocabulary and token segmentation approaches, finetuning large pre-trained +models, and using large language models (LLMs) for few-shot translation with +fuzzy matches. We develop a multilingual neural machine translation (MNMT) +model based on language relatedness, which brings an average performance +improvement of about 4 BLEU compared to standard bilingual models. We also +attempt to finetune the NLLB-200 model, one of the most advanced translation +models available today, but find that it performs poorly with only 4k training +samples for Ge'ez. Furthermore, we experiment with using GPT-3.5, a +state-of-the-art LLM, for few-shot translation with fuzzy matches, which +leverages embedding similarity-based retrieval to find context examples from a +parallel corpus. We observe that GPT-3.5 achieves a remarkable BLEU score of +9.2 with no initial knowledge of Ge'ez, but still lower than the MNMT baseline +of 15.2. Our work provides insights into the potential and limitations of +different approaches for low-resource and ancient language MT. + +
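+ A hedged sketch of few-shot prompting with fuzzy matches as described above: retrieve the most similar source sentences from a parallel corpus by embedding cosine similarity and prepend them as in-context examples. The `toy_embed` function and the prompt wording are illustrative assumptions, not the authors' setup.
+ import numpy as np
+
+ def toy_embed(text, dim=64):
+     """Toy character-trigram hashing embedding, only to make the sketch runnable."""
+     vec = np.zeros(dim)
+     for i in range(len(text) - 2):
+         vec[hash(text[i:i + 3]) % dim] += 1.0
+     return vec
+
+ def build_fuzzy_match_prompt(source, parallel_corpus, embed=toy_embed, k=3):
+     """Pick the k parallel pairs whose source side is closest to the input and
+     format them as few-shot examples followed by the sentence to translate."""
+     src_vec = embed(source)
+     def cosine(a, b):
+         return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9))
+     scored = sorted(parallel_corpus, key=lambda p: cosine(embed(p[0]), src_vec),
+                     reverse=True)
+     lines = [f"Ge'ez: {s}\nEnglish: {t}" for s, t in scored[:k]]
+     lines.append(f"Ge'ez: {source}\nEnglish:")
+     return "\n\n".join(lines)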
+
+ comment: 10 pages, 1 figure +
+
+
+
+
+ + ☆ tinyCLAP: Distilling Contrastive Language-Audio Pretrained Models + + +
+ Contrastive Language-Audio Pretraining (CLAP) became of crucial importance in +the field of audio and speech processing. Its employment ranges from sound +event detection to text-to-audio generation. However, one of the main +limitations is the considerable amount of data required in the training process +and the overall computational complexity during inference. This paper +investigates how we can reduce the complexity of contrastive language-audio +pre-trained models, yielding an efficient model that we call tinyCLAP. We +derive an unimodal distillation loss from first principles and explore how the +dimensionality of the shared, multimodal latent space can be reduced via +pruning. TinyCLAP uses only 6% of the original Microsoft CLAP parameters with a +minimal reduction (less than 5%) in zero-shot classification performance across +the three sound event detection datasets on which it was tested. + +
+
+
+
+
+ + ☆ Analysing the Impact of Removing Infrequent Words on Topic Quality in + LDA Models + + +
+ An initial procedure in text-as-data applications is text preprocessing. One +of the typical steps, which can substantially facilitate computations, consists +in removing infrequent words believed to provide limited information about the +corpus. Despite popularity of vocabulary pruning, not many guidelines on how to +implement it are available in the literature. The aim of the paper is to fill +this gap by examining the effects of removing infrequent words for the quality +of topics estimated using Latent Dirichlet Allocation. The analysis is based on +Monte Carlo experiments taking into account different criteria for infrequent +terms removal and various evaluation metrics. The results indicate that pruning +is beneficial and that the share of vocabulary which might be eliminated can be +quite considerable. + +
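+ A minimal sketch of the vocabulary-pruning step being studied, using an illustrative absolute-frequency threshold; the paper's experiments compare several removal criteria.
+ from collections import Counter
+
+ def prune_infrequent_words(tokenized_docs, min_count=5):
+     """Drop words occurring fewer than `min_count` times across the corpus
+     before fitting LDA."""
+     freq = Counter(token for doc in tokenized_docs for token in doc)
+     return [[tok for tok in doc if freq[tok] >= min_count] for doc in tokenized_docs]
+
+ docs = [["the", "cat", "sat"], ["the", "dog", "ran"], ["a", "rare", "word"]]
+ pruned = prune_infrequent_words(docs, min_count=2)   # keeps only "the"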
+
+
+
+
+ + ☆ StableSSM: Alleviating the Curse of Memory in State-space Models through + Stable Reparameterization + + +
+ In this paper, we investigate the long-term memory learning capabilities of +state-space models (SSMs) from the perspective of parameterization. We prove +that state-space models without any reparameterization exhibit a memory +limitation similar to that of traditional RNNs: the target relationships that +can be stably approximated by state-space models must have an exponential +decaying memory. Our analysis identifies this "curse of memory" as a result of +the recurrent weights converging to a stability boundary, suggesting that a +reparameterization technique can be effective. To this end, we introduce a +class of reparameterization techniques for SSMs that effectively lift their +memory limitations. Besides improving approximation capabilities, we further +illustrate that a principled choice of reparameterization scheme can also +enhance optimization stability. We validate our findings using synthetic +datasets and language models. + +
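+ A toy sketch of one possible stable reparameterization for a scalar linear recurrence, mapping an unconstrained parameter into (0, 1) so the recurrent weight stays away from the stability boundary; the class of reparameterizations studied in the paper may differ.
+ import numpy as np
+
+ def stable_recurrent_weight(w_raw):
+     """Map an unconstrained parameter to a weight strictly inside (0, 1):
+     exp(-softplus(w_raw)), one common way to keep a recurrence stable."""
+     return np.exp(-np.logaddexp(0.0, w_raw))   # logaddexp(0, w) == softplus(w)
+
+ def run_ssm(inputs, w_raw, b=1.0):
+     """Scalar linear state-space recurrence h_t = a * h_{t-1} + b * x_t."""
+     a = stable_recurrent_weight(w_raw)
+     h, outputs = 0.0, []
+     for x in inputs:
+         h = a * h + b * x
+         outputs.append(h)
+     return np.array(outputs)
+
+ ys = run_ssm(np.random.randn(100), w_raw=0.3)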
+
+
+
+
+ + ☆ SER_AMPEL: A multi-source dataset for SER of Italian older adults + + +
+ In this paper, SER_AMPEL, a multi-source dataset for speech emotion +recognition (SER), is presented. The peculiarity of the dataset is that it is +collected with the aim of providing a reference for speech emotion recognition +in the case of Italian older adults. The dataset is collected following different +protocols, in particular considering acted conversations, extracted from movies +and TV series, and recording natural conversations where the emotions are +elicited by proper questions. The evidence of the need for such a dataset +emerges from the analysis of the state of the art. Preliminary considerations +on the critical issues of SER are reported, analyzing the classification results +on a subset of the proposed dataset. + +
+
+ comment: 11 pages, 1 Figure, 7 Tables, submitted to ForItAAL 2023 (12{\deg} + Forum Italiano Ambient Assisted Living) +
+
+
+
+
+ + ☆ Controlled Text Generation via Language Model Arithmetic + + +
+ As Large Language Models (LLMs) are deployed more widely, customization with +respect to vocabulary, style and character becomes more important. In this work +we introduce model arithmetic, a novel inference framework for composing and +biasing LLMs without the need for model (re)training or highly specific +datasets. In addition, the framework allows for more precise control of +generated text than direct prompting and prior controlled text generation (CTG) +techniques. Using model arithmetic, we can express prior CTG techniques as +simple formulas and naturally extend them to new and more effective +formulations. Further, we show that speculative sampling, a technique for +efficient LLM sampling, extends to our setting. This enables highly efficient +text generation with multiple composed models with only marginal overhead over +a single model. Our empirical evaluation demonstrates that model arithmetic +allows fine-grained control of generated text while outperforming +state-of-the-art on the task of toxicity reduction. + +
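+ A hedged sketch of the general idea of composing models at inference time by arithmetic over their next-token outputs, assuming the per-model logits are already computed and using a simple signed weighted sum; the paper's model-arithmetic operators are richer than this.
+ import numpy as np
+
+ def combined_next_token_distribution(logits_by_model, weights):
+     """Mix next-token logits of several models with signed weights, e.g.
+     1.0 * base + 0.5 * persona - 0.6 * unwanted_attribute_model, then
+     renormalize with a softmax."""
+     mixed = sum(w * l for w, l in zip(weights, logits_by_model))
+     exp = np.exp(mixed - mixed.max())
+     return exp / exp.sum()
+
+ vocab = 50
+ base, persona, unwanted = (np.random.randn(vocab) for _ in range(3))
+ probs = combined_next_token_distribution([base, persona, unwanted], [1.0, 0.5, -0.6])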
+
+
+
+
+ + ☆ DP-NMT: Scalable Differentially-Private Machine Translation + + +
+ Neural machine translation (NMT) is a widely popular text generation task, +yet there is a considerable research gap in the development of +privacy-preserving NMT models, despite significant data privacy concerns for +NMT systems. Differentially private stochastic gradient descent (DP-SGD) is a +popular method for training machine learning models with concrete privacy +guarantees; however, the implementation specifics of training a model with +DP-SGD are not always clarified in existing models, with differing software +libraries used and code bases not always being public, leading to +reproducibility issues. To tackle this, we introduce DP-NMT, an open-source +framework for carrying out research on privacy-preserving NMT with DP-SGD, +bringing together numerous models, datasets, and evaluation metrics in one +systematic software package. Our goal is to provide a platform for researchers +to advance the development of privacy-preserving NMT systems, keeping the +specific details of the DP-SGD algorithm transparent and intuitive to +implement. We run a set of experiments on datasets from both general and +privacy-related domains to demonstrate our framework in use. We make our +framework publicly available and welcome feedback from the community. + +
+
+
+
+
+ + ☆ Universal Jailbreak Backdoors from Poisoned Human Feedback + + +
+ Reinforcement Learning from Human Feedback (RLHF) is used to align large +language models to produce helpful and harmless responses. Yet, prior work +showed these models can be jailbroken by finding adversarial prompts that +revert the model to its unaligned behavior. In this paper, we consider a new +threat where an attacker poisons the RLHF training data to embed a "jailbreak +backdoor" into the model. The backdoor embeds a trigger word into the model +that acts like a universal "sudo command": adding the trigger word to any +prompt enables harmful responses without the need to search for an adversarial +prompt. Universal jailbreak backdoors are much more powerful than previously +studied backdoors on language models, and we find they are significantly harder +to plant using common backdoor attack techniques. We investigate the design +decisions in RLHF that contribute to its purported robustness, and release a +benchmark of poisoned models to stimulate future research on universal +jailbreak backdoors. + +
+
+
+
+
+ + ☆ ÚFAL CorPipe at CRAC 2023: Larger Context Improves Multilingual + Coreference Resolution + + +
+ We present CorPipe, the winning entry to the CRAC 2023 Shared Task on +Multilingual Coreference Resolution. Our system is an improved version of our +earlier multilingual coreference pipeline, and it surpasses other participants +by a large margin of 4.5 percentage points. CorPipe first performs mention +detection, followed by coreference linking via an antecedent-maximization +approach on the retrieved spans. Both tasks are trained jointly on all +available corpora using a shared pretrained language model. Our main +improvements comprise inputs larger than 512 subwords and changing the mention +decoding to support ensembling. The source code is available at +https://github.com/ufal/crac2023-corpipe. + +
+
+ comment: Accepted to CRAC 2023 (the Sixth Workshop on Computational Models of + Reference, Anaphora and Coreference) +
+
+
+
+
+ + ☆ Average Token Delay: A Duration-aware Latency Metric for Simultaneous + Translation INTERSPEECH 2023 + + +
+ Simultaneous translation is a task in which the translation begins before the +end of an input speech segment. Its evaluation should be conducted based on +latency in addition to quality, and for users, the smallest possible amount of +latency is preferable. Most existing metrics measure latency based on the start +timings of partial translations and ignore their duration. This means such +metrics do not penalize the latency caused by long translation output, which +delays the comprehension of users and subsequent translations. In this work, we +propose a novel latency evaluation metric for simultaneous translation called +\emph{Average Token Delay} (ATD) that focuses on the duration of partial +translations. We demonstrate its effectiveness through analyses simulating +user-side latency based on Ear-Voice Span (EVS). In our experiment, ATD had the +highest correlation with EVS among baseline latency metrics under most +conditions. + +
+
+ comment: Extended version of the paper (doi: 10.21437/Interspeech.2023-933) + which appeared in INTERSPEECH 2023 +
+
+
+
+
+ + ☆ Large Language Models as Topological Structure Enhancers for + Text-Attributed Graphs + + +
+ The latest advancements in large language models (LLMs) have revolutionized +the field of natural language processing (NLP). Inspired by the success of LLMs +in NLP tasks, some recent work has begun investigating the potential of +applying LLMs in graph learning tasks. However, most of the existing work +focuses on utilizing LLMs as powerful node feature augmenters, leaving +employing LLMs to enhance graph topological structures an understudied problem. +In this work, we explore how to leverage the information retrieval and text +generation capabilities of LLMs to refine/enhance the topological structure of +text-attributed graphs (TAGs) under the node classification setting. First, we +propose using LLMs to help remove unreliable edges and add reliable ones in the +TAG. Specifically, we first let the LLM output the semantic similarity between +node attributes through delicate prompt designs, and then perform edge deletion +and edge addition based on the similarity. Second, we propose using +pseudo-labels generated by the LLM to improve graph topology, that is, we +introduce the pseudo-label propagation as a regularization to guide the graph +neural network (GNN) in learning proper edge weights. Finally, we incorporate +the two aforementioned LLM-based methods for graph topological refinement into +the process of GNN training, and perform extensive experiments on four +real-world datasets. The experimental results demonstrate the effectiveness of +LLM-based graph topology refinement (achieving a 0.15%--2.47% performance gain +on public benchmarks). + +
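+ A toy sketch of the edge-refinement step described above, assuming a hypothetical `llm_similarity` callable (e.g. a prompted LLM returning a score in [0, 1]) and illustrative thresholds; candidate non-edges would in practice be restricted, and the pseudo-label propagation regularizer is not shown.
+ def refine_topology(edges, candidate_pairs, node_text, llm_similarity,
+                     delete_below=0.2, add_above=0.8):
+     """Keep an existing edge only if the LLM judges the endpoint texts similar
+     enough, and add a candidate (non-edge) pair if the judged similarity is
+     high enough."""
+     kept = [(u, v) for u, v in edges
+             if llm_similarity(node_text[u], node_text[v]) >= delete_below]
+     added = [(u, v) for u, v in candidate_pairs
+              if llm_similarity(node_text[u], node_text[v]) >= add_above]
+     return kept + added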
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Tracing Influence at Scale: A Contrastive Learning Approach to Linking + Public Comments and Regulator Responses + + +
+ U.S. Federal Regulators receive over one million comment letters each year +from businesses, interest groups, and members of the public, all advocating for +changes to proposed regulations. These comments are believed to have +wide-ranging impacts on public policy. However, measuring the impact of +specific comments is challenging because regulators are required to respond to +comments but they do not have to specify which comments they are addressing. In +this paper, we propose a simple yet effective solution to this problem by using +an iterative contrastive method to train a neural model aimed at matching +text from public comments to responses written by regulators. We demonstrate +that our proposal substantially outperforms a set of selected text-matching +baselines on a human-annotated test set. Furthermore, it delivers performance +comparable to the most advanced gigantic language model (i.e., GPT-4), and is +more cost-effective when matching comments and regulator responses at a +larger scale. + +
+
+ comment: Accepted to the Natural Legal Language Processing Workshop 2023 (NLLP + 2023) +
+
+
+
+
+ + ☆ Improving Cross-Domain Hate Speech Generalizability with Emotion + Knowledge ACL + + +
+ Reliable automatic hate speech (HS) detection systems must adapt to the +in-flow of diverse new data to curtail hate speech. However, hate speech +detection systems commonly lack generalizability in identifying hate speech +dissimilar to data used in training, impeding their robustness in real-world +deployments. In this work, we propose a hate speech generalization framework +that leverages emotion knowledge in a multitask architecture to improve the +generalizability of hate speech detection in a cross-domain setting. We +investigate emotion corpora with varying emotion categorical scopes to +determine the best corpus scope for supplying emotion knowledge to foster +generalized hate speech detection. We further assess the relationship between +using pretrained Transformers models adapted for hate speech and its effect on +our emotion-enriched hate speech generalization model. We perform extensive +experiments on six publicly available datasets sourced from different online +domains and show that our emotion-enriched HS detection generalization method +demonstrates consistent generalization improvement in cross-domain evaluation, +increasing generalization performance up to 18.1% and average cross-domain +performance up to 8.5%, according to the F1 measure. + +
+
+ comment: Accepted to Pacific Asia Conference on Language, Information and + Computation (PACLIC 37) +
+
+
+
+
+ + ☆ OpusCleaner and OpusTrainer, open source toolkits for training Machine + Translation and Large language models + + +
+ Developing high-quality machine translation systems is a labour-intensive, +challenging and confusing process for newcomers to the field. We present a pair +of tools, OpusCleaner and OpusTrainer, that aim to simplify the process, reduce +the amount of work and lower the entry barrier for newcomers. + OpusCleaner is a data downloading, cleaning, and preprocessing toolkit. It is +designed to allow researchers to quickly download, visualise and preprocess +bilingual (or monolingual) data that comes from many different sources, each of +them with different quality, issues, and unique filtering/preprocessing +requirements. + OpusTrainer is a data scheduling and data augmenting tool aimed at building +large-scale, robust machine translation systems and large language models. It +features deterministic data mixing from many different sources, on-the-fly data +augmentation and more. + Using these tools, we showcase how to create high-quality machine translation +models robust to noisy user input, multilingual models, and terminology-aware +models. + +
+
+ comment: Code on Github: https://github.com/hplt-project/OpusCleaner and + https://github.com/hplt-project/OpusTrainer +
+
+
+
+
+ + ☆ Custom Data Augmentation for low resource ASR using Bark and + Retrieval-Based Voice Conversion + + +
+ This paper proposes two innovative methodologies to construct customized +Common Voice datasets for low-resource languages like Hindi. The first +methodology leverages Bark, a transformer-based text-to-audio model developed +by Suno, and incorporates Meta's enCodec and a pre-trained HuBert model to +enhance Bark's performance. The second methodology employs Retrieval-Based +Voice Conversion (RVC) and uses the Ozen toolkit for data preparation. Both +methodologies contribute to the advancement of ASR technology and offer +valuable insights into addressing the challenges of constructing customized +Common Voice datasets for under-resourced languages. Furthermore, they provide +a pathway to achieving high-quality, personalized voice generation for a range +of applications. + +
+
+
+
+
+ + ☆ Weak Alignment Supervision from Hybrid Model Improves End-to-end ASR + + +
+ In this paper, we aim to create weak alignment supervision to aid the +end-to-end modeling. Towards this end, we use the existing hybrid ASR system to +produce triphone alignments of the training audios. We then create a +cross-entropy loss at a certain layer of the encoder using the derived +alignments. In contrast to the general one-hot cross-entropy losses with or +without loss weighting, here we use a cross-entropy loss with a label smoothing +parameter to regularize the supervision. As a comparison, we also conduct the +experiments with one-hot cross-entropy losses and CTC losses with loss +weighting. The results show that placing the weak alignment supervision with +the label smoothing parameter of 0.5 at the third encoder layer outperforms the +other two approaches and leads to about 5% relative WER reduction on the +TED-LIUM 2 dataset over the baseline. We see similar improvements when applying +the method out-of-the-box on a Tagalog end-to-end ASR system. + +
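+ A minimal sketch of the auxiliary supervision described above: frame-level triphone alignments from a hybrid system supervise an intermediate encoder layer through a label-smoothed cross-entropy. The projection layer, tensor shapes, and triphone inventory size are illustrative assumptions.
+ import torch
+ import torch.nn.functional as F
+
+ def weak_alignment_loss(encoder_layer3_out, proj, triphone_targets, smoothing=0.5):
+     """Project an intermediate encoder layer to triphone logits and apply
+     cross-entropy with a large label-smoothing value, so the hybrid-model
+     alignments act only as weak guidance."""
+     logits = proj(encoder_layer3_out)                 # (B, T, n_triphones)
+     return F.cross_entropy(
+         logits.transpose(1, 2),                       # (B, n_triphones, T)
+         triphone_targets,                             # (B, T) frame-level alignments
+         label_smoothing=smoothing,
+     )
+
+ B, T, d, n_triphones = 4, 50, 256, 4000
+ proj = torch.nn.Linear(d, n_triphones)
+ loss = weak_alignment_loss(torch.randn(B, T, d), proj,
+                            torch.randint(0, n_triphones, (B, T)))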
+
+ comment: 7 pages, 7 figures, and 5 tables +
+
+
+
+
+ + ☆ Data-to-Text Bilingual Generation + + +
+ This document illustrates the use of pyrealb for generating two parallel +texts (English and French) from a single source of data. The data selection and +text organisation processes are shared between the two languages; only language-dependent +word and phrasing choices are distinct processes. The realized texts +thus convey identical information in both languages without the risk of being +lost in translation. This is especially important in cases where strict and +simultaneous bilingualism is required. We first present the types of +applications targeted by this approach and how the pyrealb English and French +realizer can be used for achieving this goal in a natural way. We describe an +object-oriented organization to ensure a convenient realization in both +languages. To illustrate the process, different types of applications are then +briefly sketched with links to the source code. A brief comparison of the +generated texts with the output of a GPT instance is given. + +
+
+ comment: 30 pages +
+
+
+
+
+ + ☆ Evaluating Large Language Models through Gender and Racial Stereotypes + + +
+ Language models have ushered in a new age of AI, gaining traction within the NLP +community as well as amongst the general population. AI's ability to make +predictions and generations, and its applications in sensitive decision-making +scenarios, make it even more important to study these models for possible +biases that may exist and that can be exaggerated. We conduct a quality +comparative study and establish a framework to evaluate language models under +the premise of two kinds of biases: gender and race, in a professional setting. +We find that while gender bias has reduced immensely in newer models, as +compared to older ones, racial bias still exists. + +
+
+ comment: 8 pages, 12 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ She had Cobalt Blue Eyes: Prompt Testing to Create Aligned and + Sustainable Language Models + + +
+ As the use of large language models (LLMs) increases within society, so does +the risk of their misuse. Appropriate safeguards must be in place to ensure LLM +outputs uphold the ethical standards of society, highlighting the positive role +that artificial intelligence technologies can have. Recent events indicate +ethical concerns around conventionally trained LLMs, leading to overall unsafe +user experiences. This motivates our research question: how do we ensure LLM +alignment? In this work, we introduce a test suite of unique prompts to foster +the development of aligned LLMs that are fair, safe, and robust. We show that +prompting LLMs at every step of the development pipeline, including data +curation, pre-training, and fine-tuning, will result in an overall more +responsible model. Our test suite evaluates outputs from four state-of-the-art +language models: GPT-3.5, GPT-4, OPT, and LLaMA-2. The assessment presented in +this paper highlights a gap between societal alignment and the capabilities of +current LLMs. Additionally, implementing a test suite such as ours lowers the +environmental overhead of making models safe and fair. + +
+
+
+
+
+ + ♻ ☆ CrossGET: Cross-Guided Ensemble of Tokens for Accelerating + Vision-Language Transformers + + +
+ Recent vision-language models have achieved tremendous progress far beyond +what we ever expected. However, their computational costs are also dramatically +growing with rapid development, especially for the large models. It makes model +acceleration exceedingly critical in a scenario of limited resources. Although +extensively studied for unimodal models, the acceleration for multimodal +models, especially the vision-language Transformers, is relatively +under-explored. To pursue more efficient and accessible vision-language +Transformers, this paper introduces \textbf{Cross}-\textbf{G}uided +\textbf{E}nsemble of \textbf{T}okens (\textbf{\emph{CrossGET}}), a universal +acceleration framework for vision-language Transformers. This framework +adaptively combines tokens through real-time, cross-modal guidance, thereby +achieving substantial acceleration while keeping high performance. +\textit{CrossGET} has two key innovations: 1) \textit{Cross-Guided Matching and +Ensemble}. \textit{CrossGET} incorporates cross-modal guided token matching and +ensemble to exploit cross-modal information effectively, only introducing +cross-modal tokens with negligible extra parameters. 2) \textit{Complete-Graph +Soft Matching}. In contrast to the existing bipartite soft matching approach, +\textit{CrossGET} introduces a complete-graph soft matching policy to achieve +more reliable token-matching results while maintaining parallelizability and +high efficiency. Extensive experiments are conducted on various vision-language +tasks, including image-text retrieval, visual reasoning, image captioning, and +visual question answering. Performance on both classic multimodal architectures +and emerging multimodal LLMs demonstrate the effectiveness and versatility of +the proposed \textit{CrossGET} framework. The code will be at +\url{https://github.com/sdc17/CrossGET}. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ Backdoor Activation Attack: Attack Large Language Models using + Activation Steering for Safety-Alignment + + +
+ To ensure AI safety, instruction-tuned Large Language Models (LLMs) are +specifically trained to ensure alignment, which refers to making models behave +in accordance with human intentions. While these models have demonstrated +commendable results on various safety benchmarks, the vulnerability of their +safety alignment has not been extensively studied. This is particularly +troubling given the potential harm that LLMs can inflict. Existing attack +methods on LLMs often rely on poisoned training data or the injection of +malicious prompts. These approaches compromise the stealthiness and +generalizability of the attacks, making them susceptible to detection. +Additionally, these models often demand substantial computational resources for +implementation, making them less practical for real-world applications. +Inspired by recent success in modifying model behavior through steering vectors +without the need for optimization, and drawing on its effectiveness in +red-teaming LLMs, we conducted experiments employing activation steering to +target four key aspects of LLMs: truthfulness, toxicity, bias, and harmfulness +- across a varied set of attack settings. To establish a universal attack +strategy applicable to diverse target alignments without depending on manual +analysis, we automatically select the intervention layer based on contrastive +layer search. Our experiment results show that activation attacks are highly +effective and add little or no overhead to attack efficiency. Additionally, we +discuss potential countermeasures against such activation attacks. Our code and +data are available at https://github.com/wang2226/Backdoor-Activation-Attack +Warning: this paper contains content that can be offensive or upsetting. + +
+
+
+
+
+ + ♻ ☆ Monkey: Image Resolution and Text Label Are Important Things for Large + Multi-modal Models + + +
+ Large Multimodal Models (LMMs) have shown promise in vision-language tasks
+but struggle with high-resolution input and detailed scene understanding.
+Addressing these challenges, we introduce Monkey to enhance LMM capabilities.
+Firstly, Monkey processes input images by dividing them into uniform patches,
+each matching the size (e.g., 448x448) used in the original training of the
+well-trained vision encoder. Equipped with an individual adapter for each
+patch, Monkey can handle higher resolutions up to 1344x896 pixels, enabling the
+detailed capture of complex visual information. Secondly, it employs a
+multi-level description generation method, enriching the context for
+scene-object associations. This two-part strategy ensures more effective
+learning from generated data: the higher resolution allows for a more detailed
+capture of visuals, which in turn enhances the effectiveness of comprehensive
+descriptions. Extensive ablative results validate the effectiveness of our
+designs. Additionally, experiments on 18 datasets further demonstrate that
+Monkey surpasses existing LMMs in many tasks like Image Captioning and various
+Visual Question Answering formats. Notably, in qualitative tests focused on
+dense text question answering, Monkey has exhibited encouraging results
+compared with GPT4V. Code is available at
+https://github.com/Yuliang-Liu/Monkey.
+
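+ As a rough illustration of the patch-splitting step described above (this is
+not Monkey's actual preprocessing code; the grid layout, the resize policy, and
+the input file name are assumptions), a 1344x896 input can be cut into six
+encoder-sized 448x448 crops:

```python
from PIL import Image

def split_into_patches(path, patch=448, grid=(3, 2)):
    """Resize an image to a grid of encoder-sized crops (e.g. 1344x896 -> 3x2
    patches of 448x448) and return the crops left-to-right, top-to-bottom."""
    img = Image.open(path).convert("RGB").resize((patch * grid[0], patch * grid[1]))
    return [
        img.crop((x * patch, y * patch, (x + 1) * patch, (y + 1) * patch))
        for y in range(grid[1]) for x in range(grid[0])
    ]

crops = split_into_patches("document_photo.jpg")   # hypothetical input file
print(len(crops), crops[0].size)                   # 6 (448, 448)
```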
+
+
+
+
+ + ♻ ☆ tieval: An Evaluation Framework for Temporal Information Extraction + Systems + + +
+ Temporal information extraction (TIE) has attracted a great deal of interest
+over the last two decades, leading to the development of a significant number
+of datasets. Despite its benefits, having access to a large volume of corpora
+makes it difficult to benchmark TIE systems. On the one hand, different
+datasets have different annotation schemes, thus hindering the comparison
+between competitors across different corpora. On the other hand, the fact that
+each corpus is commonly disseminated in a different format requires a
+considerable engineering effort for a researcher/practitioner to develop
+parsers for all of them. This constraint forces researchers to select a limited
+number of datasets to evaluate their systems, which consequently limits the
+comparability of the systems. Yet another obstacle that hinders the
+comparability of TIE systems is the evaluation metric employed. While most
+research works adopt traditional metrics such as precision, recall, and $F_1$,
+a few others prefer temporal awareness -- a metric tailored to be more
+comprehensive for the evaluation of temporal systems. Although the reason for
+the absence of temporal awareness in the evaluation of most systems is not
+clear, one of the factors that certainly weighs on this decision is the
+necessity to implement the temporal closure algorithm in order to compute
+temporal awareness, which is neither straightforward to implement nor currently
+easily available. All in all, these problems have limited the fair comparison
+between approaches and, consequently, the development of temporal extraction
+systems. To mitigate these problems, we have developed tieval, a Python library
+that provides a concise interface for importing different corpora and
+facilitates system evaluation. In this paper, we present the first public
+release of tieval and highlight its most relevant features.
+
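+ Since the abstract singles out the temporal closure algorithm as a barrier,
+here is a deliberately minimal closure sketch restricted to BEFORE links; it is
+not tieval's API, and a real closure must handle the full interval algebra of
+temporal relations, not just transitivity of one relation.

```python
def temporal_closure(before_links):
    """Transitive closure over (a, b) pairs meaning "a is BEFORE b"."""
    closed = set(before_links)
    changed = True
    while changed:
        changed = False
        for a, b in list(closed):
            for c, d in list(closed):
                if b == c and (a, d) not in closed:
                    closed.add((a, d))   # a BEFORE b and b BEFORE d => a BEFORE d
                    changed = True
    return closed

print(temporal_closure({("e1", "e2"), ("e2", "t1")}))
# {('e1', 'e2'), ('e2', 't1'), ('e1', 't1')}
```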
+
+ comment: 10 pages +
+
+
+
+
+ + ♻ ☆ GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio + Pretraining for Accurate Speech Emotion Recognition + + +
+ Contrastive cross-modality pretraining has recently exhibited impressive
+success in diverse fields, whereas there is limited research on its merits in
+speech emotion recognition (SER). In this paper, we propose GEmo-CLAP, a
+gender-attribute-enhanced contrastive language-audio pretraining (CLAP) method
+for SER. Specifically, we first construct an effective emotion CLAP (Emo-CLAP)
+for SER, using pre-trained text and audio encoders. Second, given the
+significance of gender information in SER, we further propose two novel models,
+a multi-task-learning-based GEmo-CLAP (ML-GEmo-CLAP) and a soft-label-based
+GEmo-CLAP (SL-GEmo-CLAP), to incorporate the gender information of speech
+signals, forming more reasonable objectives. Experiments on IEMOCAP indicate
+that our two proposed GEmo-CLAP models consistently outperform Emo-CLAP with
+different pre-trained models. Remarkably, the proposed WavLM-based SL-GEmo-CLAP
+obtains the best UAR of 81.43\% and WAR of 83.16\%, outperforming
+state-of-the-art SER methods.
+
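+ For readers unfamiliar with CLAP-style training, the following is a generic
+symmetric contrastive loss between paired audio and text embeddings. It is only
+a baseline sketch: it omits the gender-attribute multi-task and soft-label
+objectives that GEmo-CLAP adds, and the temperature value is an assumption.

```python
import torch
import torch.nn.functional as F

def clap_contrastive_loss(audio_emb, text_emb, temperature=0.07):
    """Symmetric InfoNCE over a batch of paired audio/text embeddings (B, D)."""
    a = F.normalize(audio_emb, dim=-1)
    t = F.normalize(text_emb, dim=-1)
    logits = a @ t.T / temperature          # (B, B) similarity matrix
    targets = torch.arange(a.size(0))       # matched pairs lie on the diagonal
    return (F.cross_entropy(logits, targets) + F.cross_entropy(logits.T, targets)) / 2

loss = clap_contrastive_loss(torch.randn(8, 512), torch.randn(8, 512))
print(loss.item())
```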
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ Revisiting Large Language Models as Zero-shot Relation Extractors EMNLP 2023 + + +
+ Relation extraction (RE) consistently involves a certain degree of labeled or
+unlabeled data even under the zero-shot setting. Recent studies have shown that
+large language models (LLMs) transfer well to new tasks out-of-the-box simply
+given a natural language prompt, which provides the possibility of extracting
+relations from text without any data or parameter tuning. This work focuses on
+exploring LLMs, such as ChatGPT, as zero-shot relation extractors. On the one
+hand, we analyze the drawbacks of existing RE prompts and attempt to
+incorporate recent prompt techniques such as chain-of-thought (CoT) to improve
+zero-shot RE. We propose summarize-and-ask (\textsc{SumAsk}) prompting, a
+simple prompt that recursively uses LLMs to transform RE inputs into an
+effective question answering (QA) format. On the other hand, we conduct
+comprehensive experiments on various benchmarks and settings to investigate the
+capabilities of LLMs on zero-shot RE. Specifically, we have the following
+findings: (i) \textsc{SumAsk} consistently and significantly improves LLM
+performance across different model sizes, benchmarks and settings; (ii)
+Zero-shot prompting with ChatGPT achieves competitive or superior results
+compared with zero-shot and fully supervised methods; (iii) LLMs deliver
+promising performance in extracting overlapping relations; (iv) The performance
+varies greatly across different relations. Unlike small language models, LLMs
+are effective in handling the challenging none-of-the-above (NoTA) relation.
+
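+ A hypothetical sketch of the summarize-then-ask idea follows; the prompt
+wording and the `llm` callable are assumptions for illustration only and do not
+reproduce the paper's exact \textsc{SumAsk} templates.

```python
def sumask_zero_shot_re(llm, sentence, head, tail, relation):
    """Two-step prompting sketch: summarize how the entity pair interacts,
    then ask a yes/no question about the candidate relation.

    llm: any callable mapping a prompt string to a completion string.
    """
    summary = llm(
        f'Summarize in one sentence how "{head}" relates to "{tail}" '
        f"in the following text:\n{sentence}"
    )
    answer = llm(
        f'Given the summary: "{summary}"\n'
        f'Question: does the relation "{relation}" hold between '
        f'"{head}" and "{tail}"? Answer yes or no.'
    )
    return answer.strip().lower().startswith("yes")

# usage: sumask_zero_shot_re(my_chat_wrapper, text, "Marie Curie", "Warsaw", "born_in")
```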
+
+ comment: Findings of EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ MAUD: An Expert-Annotated Legal NLP Dataset for Merger Agreement + Understanding EMNLP 2023 + + +
+ Reading comprehension of legal text can be a particularly challenging task +due to the length and complexity of legal clauses and a shortage of +expert-annotated datasets. To address this challenge, we introduce the Merger +Agreement Understanding Dataset (MAUD), an expert-annotated reading +comprehension dataset based on the American Bar Association's 2021 Public +Target Deal Points Study, with over 39,000 examples and over 47,000 total +annotations. Our fine-tuned Transformer baselines show promising results, with +models performing well above random on most questions. However, on a large +subset of questions, there is still room for significant improvement. As the +only expert-annotated merger agreement dataset, MAUD is valuable as a benchmark +for both the legal profession and the NLP community. + +
+
+ comment: EMNLP 2023. 5 pages + appendix. Code and dataset are available at + https://github.com/TheAtticusProject/maud +
+
+
+
+
+ + ♻ ☆ Scalable and Transferable Black-Box Jailbreaks for Language Models via + Persona Modulation + + +
+ Despite efforts to align large language models to produce harmless responses, +they are still vulnerable to jailbreak prompts that elicit unrestricted +behaviour. In this work, we investigate persona modulation as a black-box +jailbreaking method to steer a target model to take on personalities that are +willing to comply with harmful instructions. Rather than manually crafting +prompts for each persona, we automate the generation of jailbreaks using a +language model assistant. We demonstrate a range of harmful completions made +possible by persona modulation, including detailed instructions for +synthesising methamphetamine, building a bomb, and laundering money. These +automated attacks achieve a harmful completion rate of 42.5% in GPT-4, which is +185 times larger than before modulation (0.23%). These prompts also transfer to +Claude 2 and Vicuna with harmful completion rates of 61.0% and 35.9%, +respectively. Our work reveals yet another vulnerability in commercial large +language models and highlights the need for more comprehensive safeguards. + +
+
+
+
+
+ + ♻ ☆ VISIT: Visualizing and Interpreting the Semantic Information Flow of + Transformers EMNLP + + +
+ Recent advances in interpretability suggest we can project weights and hidden +states of transformer-based language models (LMs) to their vocabulary, a +transformation that makes them more human interpretable. In this paper, we +investigate LM attention heads and memory values, the vectors the models +dynamically create and recall while processing a given input. By analyzing the +tokens they represent through this projection, we identify patterns in the +information flow inside the attention mechanism. Based on our discoveries, we +create a tool to visualize a forward pass of Generative Pre-trained +Transformers (GPTs) as an interactive flow graph, with nodes representing +neurons or hidden states and edges representing the interactions between them. +Our visualization simplifies huge amounts of data into easy-to-read plots that +can reflect the models' internal processing, uncovering the contribution of +each component to the models' final prediction. Our visualization also unveils +new insights about the role of layer norms as semantic filters that influence +the models' output, and about neurons that are always activated during forward +passes and act as regularization vectors. + +
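+ In the same spirit as the projection described above, here is a minimal
+sketch of projecting intermediate hidden states onto the vocabulary with GPT-2.
+Applying the final layer norm before the unembedding is a common convention in
+this line of work, not necessarily the paper's exact procedure.

```python
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

text = "The Eiffel Tower is located in the city of"
with torch.no_grad():
    out = model(**tok(text, return_tensors="pt"), output_hidden_states=True)

# Project the last-token hidden state of every layer to vocabulary space and
# print the top token, showing how the prediction forms across layers.
for layer, h in enumerate(out.hidden_states):
    logits = model.lm_head(model.transformer.ln_f(h[0, -1]))
    print(layer, tok.decode([int(logits.argmax())]))
```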
+
+ comment: EMNLP Findings 2023 +
+
+
+
+
+ + ♻ ☆ CultureBERT: Measuring Corporate Culture With Transformer-Based Language + Models + + +
+ This paper introduces supervised machine learning to the literature measuring
+corporate culture from text documents. We compile a unique data set of employee
+reviews that were labeled by human evaluators with respect to the information
+the reviews reveal about the firms' corporate culture. Using this data set, we
+fine-tune state-of-the-art transformer-based language models to perform the
+same classification task. In out-of-sample predictions, our language models
+classify 16 to 28 percentage points more of the employee reviews in line with
+human evaluators than traditional approaches of text classification. We make
+our models publicly available.
+
+
+ comment: 19 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ PACuna: Automated Fine-Tuning of Language Models for Particle + Accelerators + + +
+ Navigating the landscape of particle accelerators has become increasingly
+challenging with recent surges in contributions. These intricate devices
+challenge comprehension, even within individual facilities. To address this, we
+introduce PACuna, a fine-tuned language model refined through publicly
+available accelerator resources like conferences, pre-prints, and books. We
+automated data collection and question generation to minimize expert
+involvement and make the data publicly available. PACuna demonstrates
+proficiency in addressing intricate accelerator questions, as validated by
+experts. Our approach shows that adapting language models to scientific domains
+by fine-tuning on technical texts and auto-generated corpora that capture the
+latest developments can produce models which answer intricate questions that
+commercially available assistants cannot, and which can serve as intelligent
+assistants for individual facilities.
+
+
+
+
+
+ + ♻ ☆ InstructERC: Reforming Emotion Recognition in Conversation with a + Retrieval Multi-task LLMs Framework + + +
+ The development of emotion recognition in conversation (ERC) has been
+consistently hindered by the complexity of pipeline designs, leading to ERC
+models that often overfit to specific datasets and dialogue patterns. In this
+study, we propose a novel approach, namely InstructERC, to reformulate the ERC
+task from a discriminative framework to a generative framework based on Large
+Language Models (LLMs). InstructERC has two significant contributions: Firstly,
+InstructERC introduces a simple yet effective retrieval template module, which
+helps the model explicitly integrate multi-granularity dialogue supervision
+information by concatenating the historical dialogue content, label statement,
+and emotional domain demonstrations with high semantic similarity. Furthermore,
+we introduce two additional emotion alignment tasks, namely speaker
+identification and emotion prediction, to implicitly model the dialogue role
+relationships and future emotional tendencies in conversations. Our LLM-based
+plug-and-play plugin framework significantly outperforms all previous models
+and achieves comprehensive SOTA on three commonly used ERC datasets. Extensive
+analysis of parameter-efficient and data-scaling experiments provides empirical
+guidance for applying InstructERC in practical scenarios. Our code will be
+released after blind review.
+
+
+
+
+
+ + ♻ ☆ ÚFAL CorPipe at CRAC 2022: Effectivity of Multilingual Models for + Coreference Resolution + + +
+ We describe the winning submission to the CRAC 2022 Shared Task on
+Multilingual Coreference Resolution. Our system first solves mention detection
+and then coreference linking on the retrieved spans with an
+antecedent-maximization approach, and both tasks are fine-tuned jointly with
+shared Transformer weights. We report the results of fine-tuning a wide range
+of pretrained models. At the center of this contribution are the fine-tuned
+multilingual models. We found that one large multilingual model with a
+sufficiently large encoder increases performance on all datasets across the
+board, with the benefit not limited to the underrepresented languages or groups
+of typologically related languages. The source code is available at
+https://github.com/ufal/crac2022-corpipe.
+
+
+ comment: Accepted to CRAC 2022 (Fifth Workshop on Computational Models of + Reference, Anaphora and Coreference) +
+
+
+
+
+ + ♻ ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
+
+ comment: This paper integrates the works arXiv:2306.01129 and arXiv:2308.16271 + into a complete story. In this paper, we improve the writing and + organization, and also add conceptual, empirical, and theoretical + improvements over the previous work. V2: small typo fixes and formatting + improvements +
+
+
+
+
+ + ♻ ☆ DUMA: a Dual-Mind Conversational Agent with Fast and Slow Thinking + + +
+ Inspired by the dual-process theory of human cognition, we introduce DUMA, a
+novel conversational agent framework that embodies a dual-mind mechanism
+through the utilization of two generative Large Language Models (LLMs)
+dedicated to fast and slow thinking, respectively. The fast thinking model
+serves as the primary interface for external interactions and initial response
+generation, evaluating the necessity of engaging the slow thinking model based
+on the complexity of the complete response. When invoked, the slow thinking
+model takes over the conversation, engaging in meticulous planning, reasoning,
+and tool utilization to provide a well-analyzed response. This dual-mind
+configuration allows for a seamless transition between intuitive responses and
+deliberate problem-solving processes based on the situation. We have
+constructed a conversational agent to handle online inquiries in the real
+estate industry. Experiments show that our method balances effectiveness and
+efficiency, and achieves a significant improvement over the baseline.
+
+
+
+
+
+ + ♻ ☆ Graph of Thoughts: Solving Elaborate Problems with Large Language Models + + +
+ We introduce Graph of Thoughts (GoT): a framework that advances prompting
+capabilities in large language models (LLMs) beyond those offered by paradigms
+such as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary
+advantage of GoT is the ability to model the information generated by an LLM as
+an arbitrary graph, where units of information ("LLM thoughts") are vertices,
+and edges correspond to dependencies between these vertices. This approach
+enables combining arbitrary LLM thoughts into synergistic outcomes, distilling
+the essence of whole networks of thoughts, or enhancing thoughts using feedback
+loops. We illustrate that GoT offers advantages over the state of the art on
+different tasks, for example increasing the quality of sorting by 62% over ToT,
+while simultaneously reducing costs by >31%. We ensure that GoT is extensible
+with new thought transformations and thus can be used to spearhead new
+prompting schemes. This work brings LLM reasoning closer to human thinking
+or brain mechanisms such as recurrence, both of which form complex networks.
+
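+ The following toy data structure illustrates the thought-graph abstraction
+(vertices are LLM outputs, edges are dependencies) with one aggregation
+transformation. Class and method names are illustrative assumptions and are not
+the official GoT framework API.

```python
class ThoughtGraph:
    """Toy thought graph: vertices are LLM outputs, edges record dependencies."""

    def __init__(self):
        self.thoughts, self.edges = {}, []

    def add(self, name, content, parents=()):
        self.thoughts[name] = content
        self.edges += [(p, name) for p in parents]
        return name

    def aggregate(self, llm, parents, name):
        """Merge several thoughts into one new vertex (e.g. merging sorted sublists)."""
        joined = "\n".join(self.thoughts[p] for p in parents)
        return self.add(name, llm(f"Combine these partial results:\n{joined}"), parents)

g = ThoughtGraph()
g.add("a", "[1, 4, 7]")
g.add("b", "[2, 3, 9]")
# g.aggregate(my_llm, ["a", "b"], "merged")  # my_llm: any prompt -> str callable
```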
+
+
+
+
+ + ♻ ☆ Think-on-Graph: Deep and Responsible Reasoning of Large Language Model + on Knowledge Graph + + +
+ Although large language models (LLMs) have achieved significant success in +various tasks, they often struggle with hallucination problems, especially in +scenarios requiring deep and responsible reasoning. These issues could be +partially addressed by introducing external knowledge graphs (KG) in LLM +reasoning. In this paper, we propose a new LLM-KG integrating paradigm +``$\hbox{LLM}\otimes\hbox{KG}$'' which treats the LLM as an agent to +interactively explore related entities and relations on KGs and perform +reasoning based on the retrieved knowledge. We further implement this paradigm +by introducing a new approach called Think-on-Graph (ToG), in which the LLM +agent iteratively executes beam search on KG, discovers the most promising +reasoning paths, and returns the most likely reasoning results. We use a number +of well-designed experiments to examine and illustrate the following advantages +of ToG: 1) compared with LLMs, ToG has better deep reasoning power; 2) ToG has +the ability of knowledge traceability and knowledge correctability by +leveraging LLMs reasoning and expert feedback; 3) ToG provides a flexible +plug-and-play framework for different LLMs, KGs and prompting strategies +without any additional training cost; 4) the performance of ToG with small LLM +models could exceed large LLM such as GPT-4 in certain scenarios and this +reduces the cost of LLM deployment and application. As a training-free method +with lower computational cost and better generality, ToG achieves overall SOTA +in 6 out of 9 datasets where most previous SOTAs rely on additional training. + +
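+ A structural sketch of the exploration loop described above follows. The toy
+knowledge graph, the scoring callable standing in for the LLM, and the beam
+width/depth values are all illustrative assumptions rather than the ToG
+implementation.

```python
def tog_beam_search(kg, question, start_entity, score, width=2, depth=2):
    """kg: {entity: [(relation, entity), ...]}.  score(question, path) -> float
    stands in for the LLM that rates how promising a reasoning path is."""
    beams = [[(None, start_entity)]]
    for _ in range(depth):
        candidates = [
            path + [(rel, nxt)]
            for path in beams
            for rel, nxt in kg.get(path[-1][1], [])
        ]
        if not candidates:
            break
        # Keep only the most promising partial reasoning paths.
        beams = sorted(candidates, key=lambda p: score(question, p), reverse=True)[:width]
    return beams

kg = {"Canberra": [("capital_of", "Australia")],
      "Australia": [("continent", "Oceania"), ("currency", "AUD")]}
paths = tog_beam_search(kg, "Which continent is Canberra in?", "Canberra",
                        score=lambda q, p: len({p[-1][0]} & set(q.lower().split())))
print(paths)
```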
+
+ comment: 30 pages, 13 figures, 20 tables +
+
+
+
+
+ + ♻ ☆ Input Reconstruction Attack against Vertical Federated Large Language + Models + + +
+ Recently, large language models (LLMs) have drawn extensive attention from
+academia and the public due to the advent of ChatGPT. While LLMs show
+astonishing ability in text generation for various tasks, privacy concerns
+limit their usage in real-life businesses. More specifically, either the user's
+inputs (the user sends the query to the model-hosting server) or the model (the
+user downloads the complete model) itself will be revealed during usage.
+Vertical federated learning (VFL) is a promising solution to this kind of
+problem. It protects both the user's input and the knowledge of the model by
+splitting the model into a bottom part and a top part, which are maintained by
+the user and the model provider, respectively. However, in this paper, we
+demonstrate that in LLMs, VFL fails to protect the user input, since it is
+simple and cheap to reconstruct the input from the intermediate embeddings.
+Experiments show that even with a commercial GPU, the input sentence can be
+reconstructed in only one second. We also discuss several possible solutions to
+enhance the privacy of vertical federated LLMs.
+
+
+
+
+
+ + ♻ ☆ Reward Dropout Improves Control: Bi-objective Perspective on Reinforced + LM + + +
+ We study the theoretical aspects of Reinforced Language Models (RLMs) from a +bi-objective optimization perspective. Specifically, we consider the RLMs as a +Pareto optimization problem that maximizes the two conflicting objectives, +i.e., reward objective and likelihood objectives, simultaneously. Our main +contribution consists of three parts. First, we establish the theoretical +foundations of RLM as a Pareto optimization problem by presenting Reward Upper +BOund (RUBO) and Pareto optimality. Our theoretical outcomes are supported by +not only deductive proofs but also empirical results. Second, we propose Reward +Dropout, a simple yet powerful method that guarantees to improve a bi-objective +optimization of RLM. Lastly, we demonstrate that the Reward Dropout is +consistently effective across five benchmark datasets and four benchmark LLMs, +meaning that the Reward Dropout significantly improves the optimization +performance of RLMs. + +
+
+ comment: 29 pages, 13 figures, conference +
+
+
+
+
+ + ♻ ☆ Cultural and Linguistic Diversity Improves Visual Representations + + +
+ Computer vision often treats perception as objective, and this assumption +gets reflected in the way that datasets are collected and models are trained. +For instance, image descriptions in different languages are typically assumed +to be translations of the same semantic content. However, work in +cross-cultural psychology and linguistics has shown that individuals differ in +their visual perception depending on their cultural background and the language +they speak. In this paper, we demonstrate significant differences in semantic +content across languages in both dataset and model-produced captions. When data +is multilingual as opposed to monolingual, captions have higher semantic +coverage on average, as measured by scene graph, embedding, and linguistic +complexity. For example, multilingual captions have on average 21.8% more +objects, 24.5% more relations, and 27.1% more attributes than a set of +monolingual captions. Moreover, models trained on content from different +languages perform best against test data from those languages, while those +trained on multilingual content perform consistently well across all evaluation +data compositions. Our research provides implications for how diverse modes of +perception can improve image understanding. + +
+
+
+
+
+ + ♻ ☆ Is Prompt All You Need? No. A Comprehensive and Broader View of + Instruction Learning + + +
+ Task semantics can be expressed by a set of input-to-output examples or a
+piece of textual instruction. Conventional machine learning approaches for
+natural language processing (NLP) mainly rely on the availability of
+large-scale sets of task-specific examples. Two issues arise: first, collecting
+task-specific labeled examples does not apply to scenarios where tasks may be
+too complicated or costly to annotate, or the system is required to handle a
+new task immediately; second, this is not user-friendly since end-users are
+probably more willing to provide a task description rather than a set of
+examples before using the system. Therefore, the community is showing
+increasing interest in a new supervision-seeking paradigm for NLP: learning
+from task instructions. Despite its impressive progress, there are some common
+issues that the community struggles with. This survey paper tries to summarize
+and provide insights into the current research on instruction learning,
+particularly by answering the following questions: (i) What is task
+instruction, and what instruction types exist? (ii) How to model instructions?
+(iii) What factors influence and explain the instructions' performance? (iv)
+What challenges remain in instruction learning? To our knowledge, this is the
+first comprehensive survey about textual instructions.
+
+
+ comment: Preprint. The paper list is available at + https://github.com/RenzeLou/awesome-instruction-learning +
+
+
+
+
+ + ♻ ☆ Soft Random Sampling: A Theoretical and Empirical Analysis + + +
+ Soft random sampling (SRS) is a simple yet effective approach for efficient +training of large-scale deep neural networks when dealing with massive data. +SRS selects a subset uniformly at random with replacement from the full data +set in each epoch. In this paper, we conduct a theoretical and empirical +analysis of SRS. First, we analyze its sampling dynamics including data +coverage and occupancy. Next, we investigate its convergence with non-convex +objective functions and give the convergence rate. Finally, we provide its +generalization performance. We empirically evaluate SRS for image recognition +on CIFAR10 and automatic speech recognition on Librispeech and an in-house +payload dataset to demonstrate its effectiveness. Compared to existing +coreset-based data selection methods, SRS offers a better accuracy-efficiency +trade-off. Especially on real-world industrial scale data sets, it is shown to +be a powerful training strategy with significant speedup and competitive +performance with almost no additional computing cost. + +
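+ Because SRS is simple to state, a minimal sketch is easy to give: each epoch,
+draw a fraction of the dataset uniformly at random with replacement and train
+only on that subset. This is not the authors' code, and the selection ratio
+below is an assumed hyperparameter name.

```python
import torch
from torch.utils.data import DataLoader, Subset, TensorDataset

dataset = TensorDataset(torch.randn(10_000, 32), torch.randint(0, 10, (10_000,)))
ratio = 0.3  # fraction of the full set drawn each epoch (assumed hyperparameter)

for epoch in range(3):
    # Soft random sampling: draw n*ratio indices uniformly WITH replacement.
    idx = torch.randint(0, len(dataset), (int(ratio * len(dataset)),))
    loader = DataLoader(Subset(dataset, idx.tolist()), batch_size=64, shuffle=True)
    for x, y in loader:
        pass  # one optimizer step per batch would go here
```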
+
+
+
+
+ + ♻ ☆ Proving Test Set Contamination in Black Box Language Models + + +
+ Large language models are trained on vast amounts of internet data, prompting +concerns and speculation that they have memorized public benchmarks. Going from +speculation to proof of contamination is challenging, as the pretraining data +used by proprietary models are often not publicly accessible. We show that it +is possible to provide provable guarantees of test set contamination in +language models without access to pretraining data or model weights. Our +approach leverages the fact that when there is no data contamination, all +orderings of an exchangeable benchmark should be equally likely. In contrast, +the tendency for language models to memorize example order means that a +contaminated language model will find certain canonical orderings to be much +more likely than others. Our test flags potential contamination whenever the +likelihood of a canonically ordered benchmark dataset is significantly higher +than the likelihood after shuffling the examples. We demonstrate that our +procedure is sensitive enough to reliably prove test set contamination in +challenging situations, including models as small as 1.4 billion parameters, on +small test sets of only 1000 examples, and datasets that appear only a few +times in the pretraining corpus. Using our test, we audit five popular publicly +accessible language models for test set contamination and find little evidence +for pervasive contamination. + +
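+ A heavily simplified sketch of the ordering-based idea follows: compare the
+likelihood of the benchmark in its canonical order against random permutations
+and report a permutation p-value. GPT-2 stands in for the audited model, and
+the paper's sharding and efficiency refinements are omitted, so treat this as a
+conceptual illustration only.

```python
import random
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tok = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def neg_log_lik(examples):
    # Approximate total NLL of the examples concatenated in the given order.
    ids = tok("\n".join(examples), return_tensors="pt").input_ids[:, :1024]
    with torch.no_grad():
        return model(ids, labels=ids).loss.item() * ids.size(1)

def contamination_p_value(examples, n_perm=20):
    canonical = neg_log_lik(examples)
    shuffled = []
    for _ in range(n_perm):
        perm = examples[:]
        random.shuffle(perm)
        shuffled.append(neg_log_lik(perm))
    # Fraction of permutations at least as likely as the canonical order;
    # a small value suggests the model prefers the canonical ordering.
    return (1 + sum(s <= canonical for s in shuffled)) / (n_perm + 1)

# usage: contamination_p_value(["Q: ... A: ...", "Q: ... A: ...", ...])
```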
+
+
+
+
+ + ♻ ☆ Analyzing Zero-Shot Abilities of Vision-Language Models on Video + Understanding Tasks + + +
+ Foundational multimodal models pre-trained on large-scale image-text pairs or
+video-text pairs or both have shown strong generalization abilities on
+downstream tasks. However, unlike image-text models, pretraining video-text
+models is not always feasible due to the difficulty in collecting large-scale
+clean and aligned data, and the exponential computational costs involved in the
+pretraining phase. Therefore, the pertinent question to ask is: Can image-text
+models be adapted to video tasks, and is there any benefit to using these
+models over pretraining directly on videos? In this work, we focus on this
+question by proposing a detailed study on the generalization abilities of
+image-text models when evaluated on video understanding tasks in a zero-shot
+setting. We investigate 9 foundational image-text models on a diverse set of
+video tasks that include video action recognition (video AR), video retrieval
+(video RT), video question answering (video QA), video multiple choice (video
+MC) and video captioning (video CP). Our experiments show that image-text
+models exhibit impressive performance on video AR, video RT and video MC.
+Furthermore, they perform moderately on video captioning and poorly on video
+QA. These findings shed light on the benefits of adapting foundational
+image-text models to an array of video tasks while avoiding the costly
+pretraining step.
+
+
+
+
+
+ + ♻ ☆ Unsupervised Graph Attention Autoencoder for Attributed Networks using + K-means Loss + + +
+ Several natural phenomena and complex systems are often represented as
+networks. Discovering their community structure is a fundamental task for
+understanding these networks. Many algorithms have been proposed, but recently,
+Graph Neural Networks (GNN) have emerged as a compelling approach for enhancing
+this task. In this paper, we introduce a simple, efficient, and
+clustering-oriented model based on an unsupervised \textbf{G}raph Attention
+\textbf{A}uto\textbf{E}ncoder for community detection in attributed networks
+(GAECO). The proposed model adeptly learns representations from both the
+network's topology and attribute information, simultaneously addressing dual
+objectives: reconstruction and community discovery. It places a particular
+emphasis on discovering compact communities by robustly minimizing clustering
+errors. The model employs k-means as an objective function and utilizes a
+multi-head Graph Attention Auto-Encoder for decoding the representations.
+Experiments conducted on three datasets of attributed networks show that our
+method surpasses state-of-the-art algorithms in terms of NMI and ARI.
+Additionally, our approach scales effectively with the size of the network,
+making it suitable for large-scale applications. The implications of our
+findings extend beyond biological network interpretation and social network
+analysis, where knowledge of the fundamental community structure is essential.
+
+
+ comment: 7 pages, 5 Figures +
+
+
+
+
+ + ♻ ☆ Do pretrained Transformers Really Learn In-context by Gradient Descent? + + +
+ Is In-Context Learning (ICL) implicitly equivalent to Gradient Descent (GD)? +Several recent works draw analogies between the dynamics of GD and the emergent +behavior of ICL in large language models. However, these works make assumptions +far from the realistic natural language setting in which language models are +trained. Therefore, such discrepancies between theory and practice necessitate +further investigation to validate their applicability. + We start by highlighting the assumptions in prior works that construct +Transformer weights to simulate gradient descent. Their experiments with +training Transformers on ICL objective, inconsistencies in the order +sensitivity of ICL and GD, sparsity of the constructed weights, and sensitivity +to parameter changes are some examples of mismatch from the real-world setting. + Furthermore, we probe and compare the ICL vs. GD hypothesis in a natural +setting. We conduct comprehensive empirical analyses on language models +pretrained on natural data (LLaMa-7B). Our comparisons on various performance +metrics highlight the inconsistent behavior of ICL and GD as a function of +various factors such as datasets, models, and the number of demonstrations. We +observe that ICL and GD modify the output distribution of language models +differently. These results indicate that the equivalence between ICL and GD is +an open hypothesis, requires nuanced considerations, and calls for further +studies. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 76 + +
+
+
+ + ☆ SEGIC: Unleashing the Emergent Correspondence for In-Context + Segmentation + + +
+ In-context segmentation aims at segmenting novel images using a few labeled +example images, termed as "in-context examples", exploring content similarities +between examples and the target. The resulting models can be generalized +seamlessly to novel segmentation tasks, significantly reducing the labeling and +training costs compared with conventional pipelines. However, in-context +segmentation is more challenging than classic ones due to its meta-learning +nature, requiring the model to learn segmentation rules conditioned on a few +samples, not just the segmentation. Unlike previous work with ad-hoc or +non-end-to-end designs, we propose SEGIC, an end-to-end segment-in-context +framework built upon a single vision foundation model (VFM). In particular, +SEGIC leverages the emergent correspondence within VFM to capture dense +relationships between target images and in-context samples. As such, +information from in-context samples is then extracted into three types of +instructions, i.e. geometric, visual, and meta instructions, serving as +explicit conditions for the final mask prediction. SEGIC is a straightforward +yet effective approach that yields state-of-the-art performance on one-shot +segmentation benchmarks. Notably, SEGIC can be easily generalized to diverse +tasks, including video object segmentation and open-vocabulary segmentation. +Code will be available at \url{https://github.com/MengLcool/SEGIC}. + +
+
+
+
+
+ + ☆ Understanding Self-Supervised Features for Learning Unsupervised + Instance Segmentation + + +
+ Self-supervised learning (SSL) can be used to solve complex visual tasks
+without human labels. Self-supervised representations encode useful semantic
+information about images, and as a result, they have already been used for
+tasks such as unsupervised semantic segmentation. In this paper, we investigate
+self-supervised representations for instance segmentation without any manual
+annotations. We find that the features of different SSL methods vary in their
+level of instance-awareness. In particular, DINO features, which are known to
+be excellent semantic descriptors, lag behind MAE features in their
+sensitivity for separating instances.
+
+
+
+
+
+ + ☆ Charting New Territories: Exploring the Geographic and Geospatial + Capabilities of Multimodal LLMs + + +
+ Multimodal large language models (MLLMs) have shown remarkable capabilities +across a broad range of tasks but their knowledge and abilities in the +geographic and geospatial domains are yet to be explored, despite potential +wide-ranging benefits to navigation, environmental research, urban development, +and disaster response. We conduct a series of experiments exploring various +vision capabilities of MLLMs within these domains, particularly focusing on the +frontier model GPT-4V, and benchmark its performance against open-source +counterparts. Our methodology involves challenging these models with a +small-scale geographic benchmark consisting of a suite of visual tasks, testing +their abilities across a spectrum of complexity. The analysis uncovers not only +where such models excel, including instances where they outperform humans, but +also where they falter, providing a balanced view of their capabilities in the +geographic domain. To enable the comparison and evaluation of future models, +our benchmark will be publicly released. + +
+
+
+
+
+ + ☆ Continuous football player tracking from discrete broadcast data + + +
+ Player tracking data remains out of reach for many professional football +teams as their video feeds are not sufficiently high quality for computer +vision technologies to be used. To help bridge this gap, we present a method +that can estimate continuous full-pitch tracking data from discrete data made +from broadcast footage. Such data could be collected by clubs or players at a +similar cost to event data, which is widely available down to semi-professional +level. We test our method using open-source tracking data, and include a +version that can be applied to a large set of over 200 games with such discrete +data. + +
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ☆ Unsupervised high-throughput segmentation of cells and cell nuclei in + quantitative phase images + + +
+ In the effort to aid cytologic diagnostics by establishing automatic single
+cell screening using high-throughput digital holographic microscopy for
+clinical studies, thousands of images and millions of cells are captured. The
+bottleneck lies in an automatic, fast, and unsupervised segmentation technique
+that does not limit the types of cells which might occur. We propose an
+unsupervised multistage method that segments correctly, without confusing noise
+or reflections with cells and without missing cells, and that also detects
+relevant inner structures, especially the cell nucleus in the unstained cell.
+In an effort to make the information reasonable and interpretable for
+cytopathologists, we also introduce new cytoplasmic and nuclear features of
+potential help for cytologic diagnoses which exploit the quantitative phase
+information inherent to the measurement scheme. We show that the segmentation
+provides consistently good results over many experiments on patient samples in
+a reasonable per-cell analysis time.
+
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Automated Detection and Counting of Windows using UAV Imagery based + Remote Sensing + + +
+ Despite the technological advancements in the construction and surveying +sector, the inspection of salient features like windows in an +under-construction or existing building is predominantly a manual process. +Moreover, the number of windows present in a building is directly related to +the magnitude of deformation it suffers under earthquakes. In this research, a +method to accurately detect and count the number of windows of a building by +deploying an Unmanned Aerial Vehicle (UAV) based remote sensing system is +proposed. The proposed two-stage method automates the identification and +counting of windows by developing computer vision pipelines that utilize data +from UAV's onboard camera and other sensors. Quantitative and Qualitative +results show the effectiveness of our proposed approach in accurately detecting +and counting the windows compared to the existing method. + +
+
+
+
+
+ + ☆ One Strike, You're Out: Detecting Markush Structures in Low + Signal-to-Noise Ratio Images + + +
+ Modern research increasingly relies on automated methods to assist
+researchers. An example of this is Optical Chemical Structure Recognition
+(OCSR), which aids chemists in retrieving information about chemicals from
+large amounts of documents. Markush structures are chemical structures that
+cannot be parsed correctly by OCSR and cause errors. The focus of this research
+was to propose and test a novel method for classifying Markush structures.
+Within this method, a comparison was made between fixed-feature extraction and
+end-to-end learning (CNN). The end-to-end method performed significantly better
+than the fixed-feature method, achieving 0.928 (0.035 SD) Macro F1 compared to
+the fixed-feature method's 0.701 (0.052 SD). Because of the nature of the
+experiment, these figures are a lower bound and can be improved further. These
+results suggest that Markush structures can be filtered out effectively and
+accurately using the proposed method. When implemented into OCSR pipelines,
+this method can improve their performance and be of use to other researchers.
+
+
+ comment: 15 pages, 9 tables, 16 figures +
+
+
+
+
+ + ☆ CatVersion: Concatenating Embeddings for Diffusion-Based Text-to-Image + Personalization + + +
+ We propose CatVersion, an inversion-based method that learns the personalized +concept through a handful of examples. Subsequently, users can utilize text +prompts to generate images that embody the personalized concept, thereby +achieving text-to-image personalization. In contrast to existing approaches +that emphasize word embedding learning or parameter fine-tuning for the +diffusion model, which potentially causes concept dilution or overfitting, our +method concatenates embeddings on the feature-dense space of the text encoder +in the diffusion model to learn the gap between the personalized concept and +its base class, aiming to maximize the preservation of prior knowledge in +diffusion models while restoring the personalized concepts. To this end, we +first dissect the text encoder's integration in the image generation process to +identify the feature-dense space of the encoder. Afterward, we concatenate +embeddings on the Keys and Values in this space to learn the gap between the +personalized concept and its base class. In this way, the concatenated +embeddings ultimately manifest as a residual on the original attention output. +To more accurately and unbiasedly quantify the results of personalized image +generation, we improve the CLIP image alignment score based on masks. +Qualitatively and quantitatively, CatVersion helps to restore personalization +concepts more faithfully and enables more robust editing. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ ARIA: On the interaction between Architectures, Aggregation methods and + Initializations in federated visual classification + + +
+ Federated Learning (FL) is a collaborative training paradigm that allows for +privacy-preserving learning of cross-institutional models by eliminating the +exchange of sensitive data and instead relying on the exchange of model +parameters between the clients and a server. Despite individual studies on how +client models are aggregated, and, more recently, on the benefits of ImageNet +pre-training, there is a lack of understanding of the effect the architecture +chosen for the federation has, and of how the aforementioned elements +interconnect. To this end, we conduct the first joint +ARchitecture-Initialization-Aggregation study and benchmark ARIAs across a +range of medical image classification tasks. We find that, contrary to current +practices, ARIA elements have to be chosen together to achieve the best +possible performance. Our results also shed light on good choices for each +element depending on the task, the effect of normalisation layers, and the +utility of SSL pre-training, pointing to potential directions for designing +FL-specific architectures and training pipelines. + +
+
+ comment: Under review at the 21st IEEE International Symposium on Biomedical + Imaging +
+
+
+
+
+ + ☆ Neural Style Transfer for Computer Games + + +
+ Neural Style Transfer (NST) research has been applied to images, videos, 3D +meshes and radiance fields, but its application to 3D computer games remains +relatively unexplored. Whilst image and video NST systems can be used as a +post-processing effect for a computer game, this results in undesired artefacts +and diminished post-processing effects. Here, we present an approach for +injecting depth-aware NST as part of the 3D rendering pipeline. Qualitative and +quantitative experiments are used to validate our in-game stylisation +framework. We demonstrate temporally consistent results of artistically +stylised game scenes, outperforming state-of-the-art image and video NST +methods. + +
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ Animate124: Animating One Image to 4D Dynamic Scene + + +
+ We introduce Animate124 (Animate-one-image-to-4D), the first work to animate +a single in-the-wild image into 3D video through textual motion descriptions, +an underexplored problem with significant applications. Our 4D generation +leverages an advanced 4D grid dynamic Neural Radiance Field (NeRF) model, +optimized in three distinct stages using multiple diffusion priors. Initially, +a static model is optimized using the reference image, guided by 2D and 3D +diffusion priors, which serves as the initialization for the dynamic NeRF. +Subsequently, a video diffusion model is employed to learn the motion specific +to the subject. However, the object in the 3D videos tends to drift away from +the reference image over time. This drift is mainly due to the misalignment +between the text prompt and the reference image in the video diffusion model. +In the final stage, a personalized diffusion prior is therefore utilized to +address the semantic drift. As the pioneering image-text-to-4D generation +framework, our method demonstrates significant advancements over existing +baselines, evidenced by comprehensive quantitative and qualitative assessments. + +
+
+ comment: Project Page: https://animate124.github.io +
+
+
+
+
+ + ☆ Large Language Models as Automated Aligners for benchmarking + Vision-Language Models + + +
+ With the advancements in Large Language Models (LLMs), Vision-Language Models
+(VLMs) have reached a new level of sophistication, showing notable competence
+in executing intricate cognition and reasoning tasks. However, existing
+evaluation benchmarks, primarily relying on rigid, hand-crafted datasets to
+measure task-specific performance, face significant limitations in assessing
+the alignment of these increasingly anthropomorphic models with human
+intelligence. In this work, we address the limitations via Auto-Bench, which
+delves into exploring LLMs as proficient aligners, measuring the alignment
+between VLMs and human intelligence and values through automatic data curation
+and assessment. Specifically, for data curation, Auto-Bench utilizes LLMs
+(e.g., GPT-4) to automatically generate a vast set of question-answer-reasoning
+triplets via prompting on visual symbolic representations (e.g., captions,
+object locations, instance relationships, etc.). The curated data closely
+matches human intent, owing to the extensive world knowledge embedded in LLMs.
+Through this pipeline, a total of 28.5K human-verified and 3,504K unfiltered
+question-answer-reasoning triplets have been curated, covering 4 primary
+abilities and 16 sub-abilities. We subsequently engage LLMs like GPT-3.5 to
+serve as judges, implementing quantitative and qualitative automated
+assessments to facilitate a comprehensive evaluation of VLMs. Our validation
+results reveal that LLMs are proficient in both evaluation data curation and
+model assessment, achieving an average agreement rate of 85%. We envision
+Auto-Bench as a flexible, scalable, and comprehensive benchmark for evaluating
+the evolving sophisticated VLMs.
+
+
+
+
+
+ + ☆ Griffon: Spelling out All Object Locations at Any Granularity with Large + Language Models + + +
+ Replicating the innate human ability to detect all objects based on free-form
+texts at any granularity remains a formidable challenge for Vision-Language
+models. Current Large Vision Language Models (LVLMs) are predominantly
+constrained to grounding a single, pre-existing object, relying solely on data
+from Referring Expression Comprehension tasks. This limitation leads to a
+compromise in model design, necessitating the introduction of visual expert
+models or the integration of customized head structures. Beyond these
+constraints, our research delves into the untapped potential of LVLMs and
+uncovers their inherent capability for basic object perception, allowing them
+to accurately identify and locate objects of interest. Building on this
+insight, we introduce a novel language-prompted localization dataset designed
+to fully unleash the capabilities of LVLMs in integrating fine-grained object
+perception with precise location awareness. More importantly, we present
+$\textbf{Griffon}$, a purely LVLM-based baseline, which does not require the
+introduction of any special tokens, expert models, or additional detection
+modules. It simply maintains a consistent structure with popular LVLMs by
+unifying data formats across various localization-related scenarios and is
+trained end-to-end through a well-designed pipeline. Comprehensive experiments
+demonstrate that $\textbf{Griffon}$ not only achieves state-of-the-art
+performance on the fine-grained RefCOCO series but also approaches the
+capabilities of the expert model Faster RCNN on the detection benchmark MSCOCO.
+
+
+ comment: Technical report. The codes and dataset will be released soon +
+
+
+
+
+ + ☆ Inferring Latent Class Statistics from Text for Robust Visual Few-Shot + Learning NeurIPS 2023 + + +
+ In the realm of few-shot learning, foundation models like CLIP have proven +effective but exhibit limitations in cross-domain robustness especially in +few-shot settings. Recent works add text as an extra modality to enhance the +performance of these models. Most of these approaches treat text as an +auxiliary modality without fully exploring its potential to elucidate the +underlying class visual features distribution. In this paper, we present a +novel approach that leverages text-derived statistics to predict the mean and +covariance of the visual feature distribution for each class. This predictive +framework enriches the latent space, yielding more robust and generalizable +few-shot learning models. We demonstrate the efficacy of incorporating both +mean and covariance statistics in improving few-shot classification performance +across various datasets. Our method shows that we can use text to predict the +mean and covariance of the distribution offering promising improvements in +few-shot learning scenarios. + +
+
+ comment: R0-FoMo: Workshop on Robustness of Few-shot and Zero-shot Learning in + Foundation Models at NeurIPS 2023 +
+
+
+
+
+ + ☆ ToddlerDiffusion: Flash Interpretable Controllable Diffusion Model + + +
+ Diffusion-based generative models excel in perceptually impressive synthesis +but face challenges in interpretability. This paper introduces +ToddlerDiffusion, an interpretable 2D diffusion image-synthesis framework +inspired by the human generation system. Unlike traditional diffusion models +with opaque denoising steps, our approach decomposes the generation process +into simpler, interpretable stages; generating contours, a palette, and a +detailed colored image. This not only enhances overall performance but also +enables robust editing and interaction capabilities. Each stage is meticulously +formulated for efficiency and accuracy, surpassing Stable-Diffusion (LDM). +Extensive experiments on datasets like LSUN-Churches and COCO validate our +approach, consistently outperforming existing methods. ToddlerDiffusion +achieves notable efficiency, matching LDM performance on LSUN-Churches while +operating three times faster with a 3.76 times smaller architecture. Our source +code is provided in the supplementary material and will be publicly accessible. + +
+
+
+
+
+ + ☆ GaussianEditor: Swift and Controllable 3D Editing with Gaussian + Splatting + + +
+ 3D editing plays a crucial role in many areas such as gaming and virtual +reality. Traditional 3D editing methods, which rely on representations like +meshes and point clouds, often fall short in realistically depicting complex +scenes. On the other hand, methods based on implicit 3D representations, like +Neural Radiance Field (NeRF), render complex scenes effectively but suffer from +slow processing speeds and limited control over specific scene areas. In +response to these challenges, our paper presents GaussianEditor, an innovative +and efficient 3D editing algorithm based on Gaussian Splatting (GS), a novel 3D +representation. GaussianEditor enhances precision and control in editing +through our proposed Gaussian semantic tracing, which traces the editing target +throughout the training process. Additionally, we propose Hierarchical Gaussian +splatting (HGS) to achieve stabilized and fine results under stochastic +generative guidance from 2D diffusion models. We also develop editing +strategies for efficient object removal and integration, a challenging task for +existing methods. Our comprehensive experiments demonstrate GaussianEditor's +superior control, efficacy, and rapid performance, marking a significant +advancement in 3D editing. Project Page: +https://buaacyw.github.io/gaussian-editor/ + +
+
+ comment: Project Page: https://buaacyw.github.io/gaussian-editor/ +
+
+
+
+
+ + ☆ Multi-Class Anomaly Detection based on Regularized Discriminative + Coupled hypersphere-based Feature Adaptation + + +
+ In anomaly detection, identification of anomalies across diverse product +categories is a complex task. This paper introduces a new model by including +class discriminative properties obtained by a modified Regularized +Discriminative Variational Auto-Encoder (RD-VAE) in the feature extraction +process of Coupled-hypersphere-based Feature Adaptation (CFA). By doing so, the +proposed Regularized Discriminative Coupled-hypersphere-based Feature +Adaptation (RD-CFA), forms a solution for multi-class anomaly detection. By +using the discriminative power of RD-VAE to capture intricate class +distributions, combined with CFA's robust anomaly detection capability, the +proposed method excels in discerning anomalies across various classes. +Extensive evaluations on multi-class anomaly detection and localization using +the MVTec AD and BeanTech AD datasets showcase the effectiveness of RD-CFA +compared to eight leading contemporary methods. + +
+
+ comment: 14 pages, 6 figures, 6 tables +
+
+
+
+
+ + ☆ MVControl: Adding Conditional Control to Multi-view Diffusion for + Controllable Text-to-3D Generation + + +
+ We introduce MVControl, a novel neural network architecture that enhances +existing pre-trained multi-view 2D diffusion models by incorporating additional +input conditions, e.g. edge maps. Our approach enables the generation of +controllable multi-view images and view-consistent 3D content. To achieve +controllable multi-view image generation, we leverage MVDream as our base +model, and train a new neural network module as additional plugin for +end-to-end task-specific condition learning. To precisely control the shapes +and views of generated images, we innovatively propose a new conditioning +mechanism that predicts an embedding encapsulating the input spatial and view +conditions, which is then injected to the network globally. Once MVControl is +trained, score-distillation (SDS) loss based optimization can be performed to +generate 3D content, in which process we propose to use a hybrid diffusion +prior. The hybrid prior relies on a pre-trained Stable-Diffusion network and +our trained MVControl for additional guidance. Extensive experiments +demonstrate that our method achieves robust generalization and enables the +controllable generation of high-quality 3D content. + +
+
+
+
+
+ + ☆ Towards Interpretable Classification of Leukocytes based on Deep + Learning ICML 2023 + + +
+ Label-free approaches are attractive in cytological imaging due to their +flexibility and cost efficiency. They are supported by machine learning +methods, which, despite the lack of labeling and the associated lower contrast, +can classify cells with high accuracy where the human observer has little +chance to discriminate cells. In order to better integrate these workflows into +the clinical decision making process, this work investigates the calibration of +confidence estimation for the automated classification of leukocytes. In +addition, different visual explanation approaches are compared, which should +bring machine decision making closer to professional healthcare applications. +Furthermore, we were able to identify general detection patterns in neural +networks and demonstrate the utility of the presented approaches in different +scenarios of blood cell analysis. + +
+
+ comment: Presented at the 3rd Workshop on Interpretable Machine Learning in + Healthcare (IMLH) @ ICML 2023 +
+
+
+
+
+ + ☆ Sliding Window FastEdit: A Framework for Lesion Annotation in Whole-body + PET Images + + +
+ Deep learning has revolutionized the accurate segmentation of diseases in +medical imaging. However, achieving such results requires training with +numerous manual voxel annotations. This requirement presents a challenge for +whole-body Positron Emission Tomography (PET) imaging, where lesions are +scattered throughout the body. To tackle this problem, we introduce SW-FastEdit +- an interactive segmentation framework that accelerates the labeling by +utilizing only a few user clicks instead of voxelwise annotations. While prior +interactive models crop or resize PET volumes due to memory constraints, we use +the complete volume with our sliding window-based interactive scheme. Our model +outperforms existing non-sliding window interactive models on the AutoPET +dataset and generalizes to the previously unseen HECKTOR dataset. A user study +revealed that annotators achieve high-quality predictions with only 10 click +iterations and a low perceived NASA-TLX workload. Our framework is implemented +using MONAI Label and is available: +https://github.com/matt3o/AutoPET2-Submission/ + +
+
+ comment: 5 pages, 2 figures, 4 tables +
+
+
+
+
+ + ☆ Joint Diffusion: Mutual Consistency-Driven Diffusion Model for PET-MRI + Co-Reconstruction + + +
+ Positron Emission Tomography and Magnetic Resonance Imaging (PET-MRI) systems
+can obtain functional and anatomical scans. PET suffers from a low
+signal-to-noise ratio. Meanwhile, the k-space data acquisition process in MRI
+is time-consuming. The study aims to accelerate MRI and enhance PET image
+quality. Conventional approaches involve the separate reconstruction of each
+modality within PET-MRI systems. However, there exists complementary
+information among multi-modal images, and this complementary information can
+contribute to image reconstruction. In this study, we propose a novel PET-MRI
+joint reconstruction model employing a mutual consistency-driven diffusion
+model, namely MC-Diffusion. MC-Diffusion learns the joint probability
+distribution of PET and MRI to utilize complementary information. We
+conducted a series of comparison experiments with LPLS, Joint ISAT-net and
+MC-Diffusion on the ADNI dataset. The results underscore the qualitative and
+quantitative improvements achieved by MC-Diffusion, surpassing the
+state-of-the-art method.
+
+
+
+
+ + ☆ MRxaI: Black-Box Explainability for Image Classifiers in a Medical + Setting + + +
+ Existing tools for explaining the output of image classifiers can be divided
+into white-box, which rely on access to the model internals, and black-box,
+which are agnostic to the model. As the usage of AI in the medical domain grows, so
+too does the usage of explainability tools. Existing work on medical image
+explanations focuses on white-box tools, such as gradcam. However, there are
+clear advantages to switching to a black-box tool, including the ability to use
+it with any classifier and the wide selection of black-box tools available. On
+standard images, black-box tools are as precise as white-box ones. In this paper we
+compare the performance of several black-box methods against gradcam on a brain
+cancer MRI dataset. We demonstrate that most black-box tools are not suitable
+for explaining medical image classifications and present a detailed analysis of
+the reasons for their shortcomings. We also show that one black-box tool, the
+causal explainability-based rex, performs as well as gradcam.
+
+
+
+
+ + ☆ CT-xCOV: a CT-scan based Explainable Framework for COVid-19 diagnosis + + +
+ In this work, CT-xCOV, an explainable framework for COVID-19 diagnosis using +Deep Learning (DL) on CT-scans is developed. CT-xCOV adopts an end-to-end +approach from lung segmentation to COVID-19 detection and explanations of the +detection model's prediction. For lung segmentation, we used the well-known +U-Net model. For COVID-19 detection, we compared three different CNN +architectures: a standard CNN, ResNet50, and DenseNet121. After the detection, +visual and textual explanations are provided. For visual explanations, we +applied three different XAI techniques, namely, Grad-Cam, Integrated Gradient +(IG), and LIME. Textual explanations are added by computing the percentage of +infection by lungs. To assess the performance of the used XAI techniques, we +propose a ground-truth-based evaluation method, measuring the similarity +between the visualization outputs and the ground-truth infections. The +performed experiments show that the applied DL models achieved good results. +The U-Net segmentation model achieved a high Dice coefficient (98%). The +performance of our proposed classification model (standard CNN) was validated +using 5-fold cross-validation (acc of 98.40% and f1-score 98.23%). Lastly, the +results of the comparison of XAI techniques show that Grad-Cam gives the best +explanations compared to LIME and IG, by achieving a Dice coefficient of 55%, +on COVID-19 positive scans, compared to 29% and 24% obtained by IG and LIME +respectively. The code and the dataset used in this paper are available in the +GitHub repository [1]. + +
+
+
+
+
+ + ☆ IDD-AW: A Benchmark for Safe and Robust Segmentation of Drive Scenes in + Unstructured Traffic and Adverse Weather WACV 2024 + + +
+ Large-scale deployment of fully autonomous vehicles requires a very high
+degree of robustness to unstructured traffic and adverse weather conditions, and
+should prevent unsafe mispredictions. While there are several datasets and
+benchmarks focusing on segmentation for drive scenes, they are not specifically
+focused on safety and robustness issues. We introduce the IDD-AW dataset, which
+provides 5000 pairs of high-quality images with pixel-level annotations,
+captured under rain, fog, low light, and snow in unstructured driving
+conditions. Compared to other adverse weather datasets, we provide i) more
+annotated images, ii) a paired Near-Infrared (NIR) image for each frame, and iii) a
+larger label set with a 4-level label hierarchy to capture unstructured traffic
+conditions. We benchmark state-of-the-art models for semantic segmentation in
+IDD-AW. We also propose a new metric called "Safe mean Intersection over Union
+(Safe mIoU)" for hierarchical datasets, which penalizes dangerous
+mispredictions that are not captured in the traditional definition of mean
+Intersection over Union (mIoU). The results show that IDD-AW is one of the most
+challenging datasets to date for these tasks. The dataset and code will be
+available here: http://iddaw.github.io.
+
+ comment: 8 pages excluding references. Accepted in WACV 2024 +
+
+
+
+
+ + ☆ Segment (Almost) Nothing: Prompt-Agnostic Adversarial Attacks on + Segmentation Models + + +
+ General purpose segmentation models are able to generate (semantic)
+segmentation masks from a variety of prompts, including visual (points, boxes,
+etc.) and textual (object names) ones. In particular, input images are
+pre-processed by an image encoder to obtain embedding vectors which are later
+used for mask predictions. Existing adversarial attacks target the end-to-end
+tasks, i.e., aim at altering the segmentation mask predicted for a specific
+image-prompt pair. However, this requires running an individual attack for each
+new prompt for the same image. We propose instead to generate prompt-agnostic
+adversarial attacks by maximizing the $\ell_2$-distance, in the latent space,
+between the embedding of the original and perturbed images. Since the encoding
+process only depends on the image, distorted image representations will cause
+perturbations in the segmentation masks for a variety of prompts. We show that
+even imperceptible $\ell_\infty$-bounded perturbations of radius
+$\epsilon=1/255$ are often sufficient to drastically modify the masks predicted
+with point, box and text prompts by recently proposed foundation models for
+segmentation. Moreover, we explore the possibility of creating universal, i.e.,
+non image-specific, attacks which can be readily applied to any input without
+further computational cost.
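The attack described above reduces to projected gradient ascent on the encoder's embedding distance. The sketch below illustrates the idea under stated assumptions: `encoder` is a placeholder for any frozen image encoder, pixel values lie in [0, 1], and the step size and iteration count are illustrative rather than the paper's settings.

```python
# Minimal sketch (not the authors' code) of a prompt-agnostic attack that
# pushes an image's encoder embedding away from the clean embedding under
# an L-infinity budget. `encoder` stands in for any frozen image encoder.
import torch

def embedding_attack(encoder, image, eps=1/255, step=0.25/255, iters=50):
    """PGD that maximizes ||f(x + delta) - f(x)||_2 with ||delta||_inf <= eps."""
    encoder.eval()
    with torch.no_grad():
        clean_emb = encoder(image)                          # reference embedding
    delta = torch.zeros_like(image, requires_grad=True)
    for _ in range(iters):
        loss = (encoder(image + delta) - clean_emb).flatten(1).norm(dim=1).sum()
        loss.backward()
        with torch.no_grad():
            delta += step * delta.grad.sign()               # ascend on the distance
            delta.clamp_(-eps, eps)                         # stay in the L_inf ball
            delta.copy_((image + delta).clamp(0, 1) - image)  # keep valid pixels
        delta.grad.zero_()
    return (image + delta).detach()
```

A universal, image-agnostic variant would accumulate this gradient over a batch of images and share a single perturbation, which matches the "no further computational cost per input" property mentioned above.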
+
+
+
+
+ + ☆ GCPV: Guided Concept Projection Vectors for the Explainable Inspection + of CNN Feature Spaces + + +
+ For debugging and verification of computer vision convolutional deep neural
+networks (CNNs), human inspection of the learned latent representations is
+imperative. Therefore, state-of-the-art eXplainable Artificial Intelligence
+(XAI) methods globally associate given natural language semantic concepts with
+representing vectors or regions in the CNN latent space supporting manual
+inspection. Yet, this approach comes with two major disadvantages: it is
+locally inaccurate when reconstructing a concept label and discards information
+about the distribution of concept instance representations. The latter, though,
+is of particular interest for debugging, like finding and understanding
+outliers, learned notions of sub-concepts, and concept confusion. Furthermore,
+current single-layer approaches neglect that information about a concept may be
+spread over the CNN depth. To overcome these shortcomings, we introduce the
+local-to-global Guided Concept Projection Vectors (GCPV) approach: It (1)
+generates local concept vectors that each precisely reconstruct a concept
+segmentation label, and then (2) generalizes these to global concept and even
+sub-concept vectors by means of hierarchical clustering. Our experiments on
+object detectors demonstrate improved performance compared to the
+state-of-the-art, the benefit of multi-layer concept vectors, and robustness
+against low-quality concept segmentation labels. Finally, we demonstrate that
+GCPVs can be applied to find root causes for confusion of concepts like bus and
+truck, and reveal interesting concept-level outliers. Thus, GCPVs constitute a
+promising step towards interpretable model debugging and informed data
+improvement.
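Step (2) above amounts to grouping a set of per-sample concept vectors with hierarchical clustering and summarizing each cluster, for example by its mean. A minimal sketch with scikit-learn follows; the cluster count, linkage, and the choice of the cluster mean as the global vector are illustrative assumptions, not the paper's exact procedure.

```python
# Sketch of turning local concept vectors into global (or sub-)concept vectors
# via hierarchical clustering; random vectors stand in for real local vectors.
import numpy as np
from sklearn.cluster import AgglomerativeClustering

def global_concept_vectors(local_vectors, n_clusters=3):
    """local_vectors: (num_samples, dim) array; returns (n_clusters, dim)."""
    labels = AgglomerativeClustering(n_clusters=n_clusters).fit_predict(local_vectors)
    return np.stack([local_vectors[labels == c].mean(axis=0)
                     for c in range(n_clusters)])

vectors = np.random.randn(100, 64)              # stand-in for local concept vectors
print(global_concept_vectors(vectors).shape)    # (3, 64)
```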
+
+
+
+
+ + ☆ Deformable multi-modal image registration for the correlation between + optical measurements and histology images + + +
+ The correlation of optical measurements with a correct pathology label is
+often hampered by imprecise registration caused by deformations in histology
+images. This study explores an automated multi-modal image registration
+technique utilizing deep learning principles to align snapshot breast specimen
+images with corresponding histology images. The input images, acquired through
+different modalities, present challenges due to variations in intensities and
+structural visibility, making linear assumptions inappropriate. Unsupervised
+and supervised learning approaches, based on the VoxelMorph model, were explored,
+making use of a dataset with manually registered images as ground truth.
+Evaluation metrics, including Dice scores and mutual information, reveal that
+the unsupervised model significantly outperforms both the supervised model and the
+manual approach, achieving superior image alignment. This automated registration
+approach holds promise for improving the validation of optical technologies by
+minimizing human errors and inconsistencies associated with manual
+registration.
+
+
+
+
+ + ☆ OneFormer3D: One Transformer for Unified Point Cloud Segmentation + + +
+ Semantic, instance, and panoptic segmentation of 3D point clouds have been +addressed using task-specific models of distinct design. Thereby, the +similarity of all segmentation tasks and the implicit relationship between them +have not been utilized effectively. This paper presents a unified, simple, and +effective model addressing all these tasks jointly. The model, named +OneFormer3D, performs instance and semantic segmentation consistently, using a +group of learnable kernels, where each kernel is responsible for generating a +mask for either an instance or a semantic category. These kernels are trained +with a transformer-based decoder with unified instance and semantic queries +passed as an input. Such a design enables training a model end-to-end in a +single run, so that it achieves top performance on all three segmentation tasks +simultaneously. Specifically, our OneFormer3D ranks 1st and sets a new +state-of-the-art (+2.1 mAP50) in the ScanNet test leaderboard. We also +demonstrate the state-of-the-art results in semantic, instance, and panoptic +segmentation of ScanNet (+21 PQ), ScanNet200 (+3.8 mAP50), and S3DIS (+0.8 +mIoU) datasets. + +
+
+
+
+
+ + ☆ Multi-scale Semantic Correlation Mining for Visible-Infrared Person + Re-Identification + + +
+ The main challenge in the Visible-Infrared Person Re-Identification (VI-ReID)
+task lies in how to extract discriminative features from different modalities
+for matching purposes. While existing works primarily focus on
+minimizing modal discrepancies, the modality information is not thoroughly
+leveraged. To solve this problem, a Multi-scale Semantic Correlation Mining
+network (MSCMNet) is proposed to comprehensively exploit semantic features at
+multiple scales and simultaneously keep modality information loss as small as
+possible during feature extraction. The proposed network contains three novel
+components. Firstly, taking into account the effective utilization of
+modality information, the Multi-scale Information Correlation Mining Block
+(MIMB) is designed to explore semantic correlations across multiple scales.
+Secondly, in order to enrich the semantic information that MIMB can utilize, a
+quadruple-stream feature extractor (QFE) with non-shared parameters is
+specifically designed to extract information from different dimensions of the
+dataset. Finally, the Quadruple Center Triplet Loss (QCT) is further proposed
+to address the information discrepancy in the comprehensive features. Extensive
+experiments on the SYSU-MM01, RegDB, and LLCM datasets demonstrate that the
+proposed MSCMNet achieves the highest accuracy.
+
+
+
+
+ + ☆ A Parameterized Generative Adversarial Network Using Cyclic Projection + for Explainable Medical Image Classification + + +
+ Although current data augmentation methods succeed in alleviating data
+insufficiency, conventional augmentation is primarily intra-domain, while
+advanced generative adversarial networks (GANs) generate images that remain
+uncertain, particularly on small-scale datasets. In this paper, we propose a
+parameterized GAN (ParaGAN) that effectively controls the changes of synthetic
+samples among domains and highlights the attention regions for downstream
+classification. Specifically, ParaGAN incorporates projection distance
+parameters in cyclic projection and projects the source images to the decision
+boundary to obtain the class-difference maps. Our experiments show that ParaGAN
+can consistently outperform the existing augmentation methods with explainable
+classification on two small-scale medical datasets.
+
+ comment: 5 pages, 4 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ Highly Detailed and Temporal Consistent Video Stylization via + Synchronized Multi-Frame Diffusion + + +
+ Text-guided video-to-video stylization transforms the visual appearance of a +source video to a different appearance guided on textual prompts. Existing +text-guided image diffusion models can be extended for stylized video +synthesis. However, they struggle to generate videos with both highly detailed +appearance and temporal consistency. In this paper, we propose a synchronized +multi-frame diffusion framework to maintain both the visual details and the +temporal consistency. Frames are denoised in a synchronous fashion, and more +importantly, information of different frames is shared since the beginning of +the denoising process. Such information sharing ensures that a consensus, in +terms of the overall structure and color distribution, among frames can be +reached in the early stage of the denoising process before it is too late. The +optical flow from the original video serves as the connection, and hence the +venue for information sharing, among frames. We demonstrate the effectiveness +of our method in generating high-quality and diverse results in extensive +experiments. Our method shows superior qualitative and quantitative results +compared to state-of-the-art video editing methods. + +
+
+ comment: 11 pages, 11 figures +
+
+
+
+
+ + ☆ Towards Concept-based Interpretability of Skin Lesion Diagnosis using + Vision-Language Models + + +
+ Concept-based models naturally lend themselves to the development of +inherently interpretable skin lesion diagnosis, as medical experts make +decisions based on a set of visual patterns of the lesion. Nevertheless, the +development of these models depends on the existence of concept-annotated +datasets, whose availability is scarce due to the specialized knowledge and +expertise required in the annotation process. In this work, we show that +vision-language models can be used to alleviate the dependence on a large +number of concept-annotated samples. In particular, we propose an embedding +learning strategy to adapt CLIP to the downstream task of skin lesion +classification using concept-based descriptions as textual embeddings. Our +experiments reveal that vision-language models not only attain better accuracy +when using concepts as textual embeddings, but also require a smaller number of +concept-annotated samples to attain comparable performance to approaches +specifically devised for automatic concept generation. + +
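As a rough illustration of using concept descriptions as textual embeddings, the sketch below scores a lesion image against concept-based class sentences with a generic CLIP model (here OpenAI's `clip` package). The class names, concept phrases, and file path are illustrative placeholders, and this is not the paper's adaptation strategy, which additionally learns an embedding mapping for the downstream task.

```python
# Zero-shot scoring of a lesion image against concept-based class descriptions.
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

# Each class is described by a set of visual concepts instead of a bare label.
class_concepts = {
    "melanoma": "a skin lesion with asymmetry, irregular borders and multiple colors",
    "nevus": "a skin lesion that is symmetric, with regular borders and uniform color",
}

image = preprocess(Image.open("lesion.jpg")).unsqueeze(0).to(device)
text = clip.tokenize(list(class_concepts.values())).to(device)

with torch.no_grad():
    img_emb = model.encode_image(image)
    txt_emb = model.encode_text(text)
    img_emb = img_emb / img_emb.norm(dim=-1, keepdim=True)
    txt_emb = txt_emb / txt_emb.norm(dim=-1, keepdim=True)
    probs = (100.0 * img_emb @ txt_emb.T).softmax(dim=-1)

for cls, p in zip(class_concepts, probs[0].tolist()):
    print(f"{cls}: {p:.3f}")
```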
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ TVT: Training-Free Vision Transformer Search on Tiny Datasets + + +
+ Training-free Vision Transformer (ViT) architecture search is presented to +search for a better ViT with zero-cost proxies. While ViTs achieve significant +distillation gains from CNN teacher models on small datasets, the current +zero-cost proxies in ViTs do not generalize well to the distillation training +paradigm according to our experimental observations. In this paper, for the +first time, we investigate how to search in a training-free manner with the +help of teacher models and devise an effective Training-free ViT (TVT) search +framework. Firstly, we observe that the similarity of attention maps between +ViT and ConvNet teachers affects distill accuracy notably. Thus, we present a +teacher-aware metric conditioned on the feature attention relations between +teacher and student. Additionally, TVT employs the L2-Norm of the student's +weights as the student-capability metric to improve ranking consistency. +Finally, TVT searches for the best ViT for distilling with ConvNet teachers via +our teacher-aware metric and student-capability metric, resulting in impressive +gains in efficiency and effectiveness. Extensive experiments on various tiny +datasets and search spaces show that our TVT outperforms state-of-the-art +training-free search methods. The code will be released. + +
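A hedged sketch of the two training-free signals mentioned above follows: a teacher-aware score comparing spatial attention maps derived from student and teacher feature maps, and a student-capability score based on the L2-norm of the student's weights. The exact attention construction and the way the two scores are combined into a ranking are assumptions, not the paper's definitions.

```python
# Illustrative zero-cost proxies for ranking ViT candidates with a ConvNet teacher.
import torch
import torch.nn.functional as F

def spatial_attention(feat):
    """Collapse a (B, C, H, W) feature map into a normalized (B, H*W) attention map."""
    attn = feat.pow(2).mean(dim=1).flatten(1)
    return F.normalize(attn, dim=1)

def teacher_aware_score(student_feats, teacher_feats):
    """Average cosine similarity between matched student/teacher attention maps."""
    scores = []
    for s, t in zip(student_feats, teacher_feats):
        t = F.interpolate(t, size=s.shape[-2:], mode="bilinear", align_corners=False)
        scores.append((spatial_attention(s) * spatial_attention(t)).sum(dim=1).mean())
    return torch.stack(scores).mean()

def capability_score(student):
    """Sum of L2 norms of the student's weight matrices (no training required)."""
    return sum(p.norm() for p in student.parameters() if p.dim() > 1)
```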
+
+
+
+
+ + ☆ Maximizing Discrimination Capability of Knowledge Distillation with + Energy-based Score + + +
+ To apply the latest computer vision techniques that require a large
+computational cost in real industrial applications, knowledge distillation
+methods (KDs) are essential. Existing logit-based KDs apply constant
+temperature scaling to all samples in the dataset, limiting the utilization of
+knowledge inherent in each sample individually. In our approach, we classify
+the dataset into two categories (i.e., low energy and high energy samples)
+based on their energy score. Through experiments, we have confirmed that low
+energy samples exhibit high confidence scores, indicating certain predictions,
+while high energy samples yield low confidence scores, meaning uncertain
+predictions. To distill optimal knowledge by adjusting non-target class
+predictions, we apply a higher temperature to low energy samples to create
+smoother distributions and a lower temperature to high energy samples to
+achieve sharper distributions. When compared to previous logit-based and
+feature-based methods, our energy-based KD (Energy KD) achieves better
+performance on various datasets. Especially, Energy KD shows significant
+improvements on the CIFAR-100-LT and ImageNet datasets, which contain many
+challenging samples. Furthermore, we propose high energy-based data
+augmentation (HE-DA) to further improve the performance. We demonstrate that
+meaningful performance improvement can be achieved by augmenting only 20-50%
+of the dataset, suggesting that it can be employed on resource-limited devices. To
+the best of our knowledge, this paper represents the first attempt to make use
+of energy scores in KD and DA, and we believe it will greatly contribute to
+future research.
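The temperature assignment described above can be sketched as follows; the median split, the two temperature values, and the use of the teacher's logits for the energy score are assumptions for illustration, not the paper's exact recipe.

```python
# Sketch of energy-based per-sample temperatures for logit distillation.
import torch
import torch.nn.functional as F

def energy_score(logits, t=1.0):
    """E(x) = -t * logsumexp(logits / t); lower energy ~ more confident sample."""
    return -t * torch.logsumexp(logits / t, dim=1)

def energy_kd_loss(student_logits, teacher_logits, t_low=6.0, t_high=2.0):
    energy = energy_score(teacher_logits)
    low_energy = energy < energy.median()              # confident samples
    temp = torch.where(low_energy,
                       torch.full_like(energy, t_low),     # smooth these further
                       torch.full_like(energy, t_high)     # sharpen uncertain ones
                       ).unsqueeze(1)
    p_t = F.softmax(teacher_logits / temp, dim=1)
    log_p_s = F.log_softmax(student_logits / temp, dim=1)
    kl = F.kl_div(log_p_s, p_t, reduction="none").sum(dim=1)
    return (kl * temp.squeeze(1) ** 2).mean()          # usual T^2 scaling
```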
+
+ comment: 22 pages, 4 figures. This work has been submitted to the Elsevier for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ Binarized 3D Whole-body Human Mesh Recovery + + +
+ 3D whole-body human mesh recovery aims to reconstruct the 3D human body,
+face, and hands from a single image. Although powerful deep learning models
+have achieved accurate estimation in this task, they require enormous memory
+and computational resources. Consequently, these methods can hardly be deployed
+on resource-limited edge devices. In this work, we propose a Binarized Dual
+Residual Network (BiDRN), a novel quantization method to estimate the 3D human
+body, face, and hands parameters efficiently. Specifically, we design a basic
+unit Binarized Dual Residual Block (BiDRB) composed of Local Convolution
+Residual (LCR) and Block Residual (BR), which can preserve full-precision
+information as much as possible. For LCR, we generalize it to four kinds of
+convolutional modules so that full-precision information can be propagated even
+between mismatched dimensions. We also binarize the face and hands
+box-prediction network as Binarized BoxNet, which can further reduce the model
+redundancy. Comprehensive quantitative and qualitative experiments demonstrate
+the effectiveness of BiDRN, which has a significant improvement over
+state-of-the-art binarization algorithms. Moreover, our proposed BiDRN achieves
+comparable performance with the full-precision method Hand4Whole while using just
+22.1% of the parameters and 14.8% of the operations. We will release all the code and
+pretrained models.
+
+ comment: The code will be available at https://github.com/ZHITENGLI/BiDRN +
+
+
+
+
+ + ☆ Stable Cluster Discrimination for Deep Clustering ICCV'23 + + +
+ Deep clustering can optimize representations of instances (i.e.,
+representation learning) and explore the inherent data distribution (i.e.,
+clustering) simultaneously, which demonstrates a superior performance over
+conventional clustering methods with given features. However, the coupled
+objective implies a trivial solution that all instances collapse to the uniform
+features. To tackle the challenge, a two-stage training strategy is developed
+for decoupling, where it introduces an additional pre-training stage for
+representation learning and then fine-tunes the obtained model for clustering.
+Meanwhile, one-stage methods are developed mainly for representation learning
+rather than clustering, where various constraints for cluster assignments are
+designed to avoid collapsing explicitly. Despite the success of these methods,
+an appropriate learning objective tailored for deep clustering has not been
+investigated sufficiently. In this work, we first show that the prevalent
+discrimination task in supervised learning is unstable for one-stage clustering
+due to the lack of ground-truth labels and positive instances for certain
+clusters in each mini-batch. To mitigate the issue, a novel stable cluster
+discrimination (SeCu) task is proposed and a new hardness-aware clustering
+criterion can be obtained accordingly. Moreover, a global entropy constraint
+for cluster assignments is studied with efficient optimization. Extensive
+experiments are conducted on benchmark data sets and ImageNet. SeCu achieves
+state-of-the-art performance on all of them, which demonstrates the
+effectiveness of one-stage deep clustering. Code is available at
+https://github.com/idstcv/SeCu.
+
+ comment: accepted by ICCV'23 +
+
+
+
+
+ + ☆ Cosine Similarity Knowledge Distillation for Individual Class + Information Transfer + + +
+ Previous logits-based Knowledge Distillation (KD) methods have utilized predictions
+about multiple categories within each sample (i.e., class predictions) and have
+employed Kullback-Leibler (KL) divergence to reduce the discrepancy between the
+student and teacher predictions. Despite the proliferation of KD techniques,
+the student model continues to fall short of the performance level of its
+teacher. In response, we introduce a novel and effective KD method capable of
+achieving results on par with or superior to the teacher model's performance. We
+utilize teacher and student predictions about multiple samples for each
+category (i.e., batch predictions) and apply cosine similarity, a commonly used
+technique in Natural Language Processing (NLP) for measuring the resemblance
+between text embeddings. This metric's inherent scale-invariance property,
+which relies solely on vector direction and not magnitude, allows the student
+to dynamically learn from the teacher's knowledge, rather than being bound by a
+fixed distribution of the teacher's knowledge. Furthermore, we propose a method
+called cosine similarity weighted temperature (CSWT) to improve the
+performance. CSWT reduces the temperature scaling in KD when the cosine
+similarity between the student and teacher models is high, and conversely, it
+increases the temperature scaling when the cosine similarity is low. This
+adjustment optimizes the transfer of information from the teacher to the
+student model. Extensive experimental results show that our proposed method
+serves as a viable alternative to existing methods. We anticipate that this
+approach will offer valuable insights for future research on model compression.
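A minimal sketch of the batch-direction cosine objective and of CSWT is given below. The per-class temperature range, the linear mapping from similarity to temperature, and the way the two terms are combined are illustrative assumptions rather than the paper's exact formulation.

```python
# Sketch: cosine similarity over batch predictions per class, plus a
# similarity-weighted temperature for the distillation term.
import torch
import torch.nn.functional as F

def cosine_kd_loss(student_logits, teacher_logits, t_min=2.0, t_max=6.0):
    # Per-class cosine similarity across the batch dimension (columns).
    s_cols = F.normalize(student_logits.t(), dim=1)       # (num_classes, batch)
    t_cols = F.normalize(teacher_logits.t(), dim=1)
    sim = (s_cols * t_cols).sum(dim=1)                    # one score per class

    # CSWT idea: high agreement -> lower temperature, low agreement -> higher.
    temp = t_max - (t_max - t_min) * sim.clamp(0, 1)      # (num_classes,)

    p_t = F.softmax(teacher_logits / temp, dim=1)
    log_p_s = F.log_softmax(student_logits / temp, dim=1)
    kd = F.kl_div(log_p_s, p_t, reduction="batchmean")

    # Batch-direction cosine term: pull per-class prediction vectors together.
    cos = 1.0 - sim.mean()
    return kd + cos
```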
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ GeoViT: A Versatile Vision Transformer Architecture for Geospatial Image + Analysis + + +
+ Greenhouse gases are pivotal drivers of climate change, necessitating precise +quantification and source identification to foster mitigation strategies. We +introduce GeoViT, a compact vision transformer model adept in processing +satellite imagery for multimodal segmentation, classification, and regression +tasks targeting CO2 and NO2 emissions. Leveraging GeoViT, we attain superior +accuracy in estimating power generation rates, fuel type, plume coverage for +CO2, and high-resolution NO2 concentration mapping, surpassing previous +state-of-the-art models while significantly reducing model size. GeoViT +demonstrates the efficacy of vision transformer architectures in harnessing +satellite-derived data for enhanced GHG emission insights, proving instrumental +in advancing climate change monitoring and emission regulation efforts +globally. + +
+
+ comment: Extended Abstract, Preprint +
+
+
+
+
+ + ☆ Decouple Content and Motion for Conditional Image-to-Video Generation + + +
+ The goal of conditional image-to-video (cI2V) generation is to create a
+believable new video by beginning with the condition, i.e., one image and
+text. The previous cI2V generation methods conventionally perform in RGB pixel
+space, with limitations in modeling motion consistency and visual continuity.
+Additionally, the efficiency of generating videos in pixel space is quite
+low. In this paper, we propose a novel approach to address these challenges by
+disentangling the target RGB pixels into two distinct components: spatial
+content and temporal motions. Specifically, we predict temporal motions which
+include motion vector and residual based on a 3D-UNet diffusion model. By
+explicitly modeling temporal motions and warping them to the starting image, we
+improve the temporal consistency of generated videos. This results in a
+reduction of spatial redundancy, emphasizing temporal details. Our proposed
+method achieves performance improvements by disentangling content and motion,
+all without introducing new structural complexities to the model. Extensive
+experiments on various datasets confirm our approach's superior performance
+over the majority of state-of-the-art methods in both effectiveness and
+efficiency.
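The "warp the starting image with the predicted motion, then add a residual" step can be sketched as below. The motion-field convention (per-pixel offsets) and the use of bilinear `grid_sample` are assumptions; the paper's actual decoding may differ.

```python
# Sketch of reconstructing a later frame from the conditioning image,
# a predicted motion field, and a predicted residual.
import torch
import torch.nn.functional as F

def reconstruct_frame(first_frame, motion, residual):
    """first_frame: (B,3,H,W); motion: (B,2,H,W) pixel offsets; residual: (B,3,H,W)."""
    b, _, h, w = first_frame.shape
    ys, xs = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
    base = torch.stack((xs, ys), dim=0).float().to(first_frame)   # (2,H,W), x then y
    coords = base.unsqueeze(0) + motion                           # sampling locations
    # Normalize to [-1, 1]; grid_sample expects a (B,H,W,2) grid ordered (x, y).
    coords_x = 2 * coords[:, 0] / (w - 1) - 1
    coords_y = 2 * coords[:, 1] / (h - 1) - 1
    grid = torch.stack((coords_x, coords_y), dim=-1)
    warped = F.grid_sample(first_frame, grid, align_corners=True)
    return warped + residual
```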
+
+
+
+
+ + ☆ Paragraph-to-Image Generation with Information-Enriched Diffusion Model + + +
+ Text-to-image (T2I) models have recently experienced rapid development,
+achieving astonishing performance in terms of fidelity and textual alignment
+capabilities. However, given a long paragraph (up to 512 words), these
+generation models still struggle to achieve strong alignment and are unable to
+generate images depicting complex scenes. In this paper, we introduce an
+information-enriched diffusion model for the paragraph-to-image generation task,
+termed ParaDiffusion, which delves into the transference of the extensive
+semantic comprehension capabilities of large language models to the task of
+image generation. At its core is using a large language model (e.g., Llama V2)
+to encode long-form text, followed by fine-tuning with LORA to align the
+text-image feature spaces in the generation task. To facilitate the training of
+long-text semantic alignment, we also curated a high-quality paragraph-image
+pair dataset, namely ParaImage. This dataset contains a small amount of
+high-quality, meticulously annotated data, and a large-scale synthetic dataset
+with long text descriptions generated using a vision-language model.
+Experiments demonstrate that ParaDiffusion outperforms state-of-the-art models
+(SD XL, DeepFloyd IF) on ViLG-300 and ParaPrompts, achieving up to 15% and 45%
+human voting rate improvements for visual appeal and text faithfulness,
+respectively. The code and dataset will be released to foster community
+research on long-text alignment.
+
+ comment: The project website is at: + https://weijiawu.github.io/ParaDiffusionPage/. Code: + https://github.com/weijiawu/ParaDiffusion +
+
+
+
+
+ + ☆ Image Super-Resolution with Text Prompt Diffusion + + +
+ Image super-resolution (SR) methods typically model degradation to improve
+reconstruction accuracy in complex and unknown degradation scenarios. However,
+extracting degradation information from low-resolution images is challenging,
+which limits the model performance. To boost image SR performance, one feasible
+approach is to introduce additional priors. Inspired by advancements in
+multi-modal methods and text prompt image processing, we introduce text prompts
+to image SR to provide degradation priors. Specifically, we first design a
+text-image generation pipeline to integrate text into the SR dataset through a
+text degradation representation and a degradation model. The text representation
+applies a discretization scheme based on binning to describe the
+degradation abstractly. This representation method can also maintain the
+flexibility of language. Meanwhile, we propose PromptSR to realize text
+prompt SR. PromptSR employs a diffusion model and a pre-trained
+language model (e.g., T5 and CLIP). We train the model on the generated
+text-image dataset. Extensive experiments indicate that introducing text
+prompts into image SR yields excellent results on both synthetic and
+real-world images. Code: https://github.com/zhengchen1999/PromptSR.
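The binning idea above can be illustrated with a toy mapping from continuous degradation parameters to a short text prompt; the bin edges and wording are made-up examples, not the paper's vocabulary.

```python
# Toy illustration of binning degradation parameters into a text prompt.
def degradation_prompt(blur_sigma, noise_level, scale):
    blur = "light" if blur_sigma < 1.0 else "medium" if blur_sigma < 2.5 else "heavy"
    noise = "light" if noise_level < 10 else "medium" if noise_level < 30 else "heavy"
    return f"{blur} blur, {noise} noise, downsampled x{scale}"

print(degradation_prompt(blur_sigma=1.8, noise_level=25, scale=4))
# -> "medium blur, medium noise, downsampled x4"
```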
+
+ comment: Code is available at https://github.com/zhengchen1999/PromptSR +
+
+
+
+
+ + ☆ Multi-modal Instance Refinement for Cross-domain Action Recognition + + +
+ Unsupervised cross-domain action recognition aims at adapting the model
+trained on an existing labeled source domain to a new unlabeled target domain.
+Most existing methods solve the task by directly aligning the feature
+distributions of source and target domains. However, this would cause negative
+transfer during domain adaptation due to some negative training samples in both
+domains. In the source domain, some training samples are of low relevance to the
+target domain due to the difference in viewpoints, action styles, etc. In the
+target domain, there are some ambiguous training samples that can be easily
+classified as another type of action from the perspective of the source domain. The
+problem of negative transfer has been explored in cross-domain object
+detection, while it remains under-explored in cross-domain action recognition.
+Therefore, we propose a Multi-modal Instance Refinement (MMIR) method to
+alleviate the negative transfer based on reinforcement learning. Specifically,
+a reinforcement learning agent is trained in both domains for every modality to
+refine the training data by selecting out negative samples from each domain.
+Our method finally outperforms several other state-of-the-art baselines in
+cross-domain action recognition on the benchmark EPIC-Kitchens dataset, which
+demonstrates the advantage of MMIR in reducing negative transfer.
+
+ comment: Accepted by PRCV 2023 +
+
+
+
+
+ + ☆ Latent Diffusion Prior Enhanced Deep Unfolding for Spectral Image + Reconstruction + + +
+ Snapshot compressive spectral imaging reconstruction aims to reconstruct
+three-dimensional spatial-spectral images from a single-shot two-dimensional
+compressed measurement. Existing state-of-the-art methods are mostly based on
+deep unfolding structures but have intrinsic performance bottlenecks: i) the
+ill-posed problem of dealing with heavily degraded measurements, and ii) the
+regression loss-based reconstruction models being prone to recover images with
+few details. In this paper, we introduce a generative model, namely the latent
+diffusion model (LDM), to generate a degradation-free prior to enhance the
+regression-based deep unfolding method. Furthermore, to overcome the large
+computational cost challenge in LDM, we propose a lightweight model to generate
+knowledge priors in the deep unfolding denoiser, and integrate these priors to
+guide the reconstruction process for compensating high-quality spectral signal
+details. Numeric and visual comparisons on synthetic and real-world datasets
+illustrate the superiority of our proposed method in both reconstruction
+quality and computational efficiency. Code will be released.
+
+
+
+
+ + ☆ Racing With ROS 2 A Navigation System for an Autonomous Formula Student + Race Car + + +
+ The advent of autonomous vehicle technologies has significantly impacted
+various sectors, including motorsport, where Formula Student and Formula:
+Society of Automotive Engineers introduced autonomous racing classes. These
+offer new challenges to aspiring engineers, including the team at QUT
+Motorsport, but also raise the entry barrier due to the complexity of
+high-speed navigation and control. This paper presents an open-source solution
+using the Robot Operating System 2, specifically its open-source navigation
+stack, to address these challenges in autonomous Formula Student race cars. We
+compare the off-the-shelf navigation libraries that this stack comprises against
+traditional custom-made programs developed by QUT Motorsport to evaluate their
+applicability in autonomous racing scenarios and integrate them onto an
+autonomous race car. Our contributions include quantitative and qualitative
+comparisons of these packages against traditional navigation solutions, aiming
+to lower the entry barrier for autonomous racing. This paper also serves as a
+comprehensive tutorial for teams participating in similar racing disciplines
+and other autonomous mobile robot applications.
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Cooperative Dual Attention for Audio-Visual Speech Enhancement with + Facial Cues BMVC 2023 + + +
+ In this work, we focus on leveraging facial cues beyond the lip region for +robust Audio-Visual Speech Enhancement (AVSE). The facial region, encompassing +the lip region, reflects additional speech-related attributes such as gender, +skin color, nationality, etc., which contribute to the effectiveness of AVSE. +However, static and dynamic speech-unrelated attributes also exist, causing +appearance changes during speech. To address these challenges, we propose a +Dual Attention Cooperative Framework, DualAVSE, to ignore speech-unrelated +information, capture speech-related information with facial cues, and +dynamically integrate it with the audio signal for AVSE. Specifically, we +introduce a spatial attention-based visual encoder to capture and enhance +visual speech information beyond the lip region, incorporating global facial +context and automatically ignoring speech-unrelated information for robust +visual feature extraction. Additionally, a dynamic visual feature fusion +strategy is introduced by integrating a temporal-dimensional self-attention +module, enabling the model to robustly handle facial variations. The acoustic +noise in the speaking process is variable, impacting audio quality. Therefore, +a dynamic fusion strategy for both audio and visual features is introduced to +address this issue. By integrating cooperative dual attention in the visual +encoder and audio-visual fusion strategy, our model effectively extracts +beneficial speech information from both audio and visual cues for AVSE. +Thorough analysis and comparison on different datasets, including normal and +challenging cases with unreliable or absent visual information, consistently +show our model outperforming existing methods across multiple metrics. + +
+
+ comment: Accepted to BMVC 2023 15 pages, 2 figures +
+
+
+
+
+ + ☆ CRISP: Hybrid Structured Sparsity for Class-aware Model Pruning DATE + + +
+ Machine learning pipelines for classification tasks often train a universal +model to achieve accuracy across a broad range of classes. However, a typical +user encounters only a limited selection of classes regularly. This disparity +provides an opportunity to enhance computational efficiency by tailoring models +to focus on user-specific classes. Existing works rely on unstructured pruning, +which introduces randomly distributed non-zero values in the model, making it +unsuitable for hardware acceleration. Alternatively, some approaches employ +structured pruning, such as channel pruning, but these tend to provide only +minimal compression and may lead to reduced model accuracy. In this work, we +propose CRISP, a novel pruning framework leveraging a hybrid structured +sparsity pattern that combines both fine-grained N:M structured sparsity and +coarse-grained block sparsity. Our pruning strategy is guided by a +gradient-based class-aware saliency score, allowing us to retain weights +crucial for user-specific classes. CRISP achieves high accuracy with minimal +memory consumption for popular models like ResNet-50, VGG-16, and MobileNetV2 +on ImageNet and CIFAR-100 datasets. Moreover, CRISP delivers up to 14$\times$ +reduction in latency and energy consumption compared to existing pruning +methods while maintaining comparable accuracy. Our code is available at +https://github.com/shivmgg/CRISP/. + +
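For illustration, the hybrid pattern can be sketched as an element-wise product of a fine-grained N:M mask and a coarse block mask, here scored by weight magnitude as a stand-in for the gradient-based class-aware saliency used in CRISP.

```python
# Sketch (a minimal illustration, not the CRISP implementation) of combining
# fine-grained N:M sparsity with coarse block sparsity on a weight matrix.
import torch

def n_m_mask(weight, n=2, m=4):
    """Keep the n largest-magnitude entries in every group of m along each row."""
    rows, cols = weight.shape
    groups = weight.abs().reshape(rows, cols // m, m)
    idx = groups.topk(n, dim=-1).indices
    mask = torch.zeros_like(groups).scatter_(-1, idx, 1.0)
    return mask.reshape(rows, cols)

def block_mask(saliency, block=32, keep_ratio=0.5):
    """Keep the highest-saliency (block x block) tiles of the matrix."""
    rows, cols = saliency.shape
    tiles = saliency.reshape(rows // block, block, cols // block, block)
    tile_scores = tiles.abs().mean(dim=(1, 3))
    k = max(1, int(keep_ratio * tile_scores.numel()))
    thresh = tile_scores.flatten().topk(k).values.min()
    coarse = (tile_scores >= thresh).float()
    return coarse.repeat_interleave(block, 0).repeat_interleave(block, 1)

w = torch.randn(128, 256)
mask = n_m_mask(w) * block_mask(w)   # hybrid fine- plus coarse-grained pattern
pruned = w * mask
```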
+
+ comment: 6 pages, accepted in Design, Automation & Test in Europe Conference & + Exhibition (DATE) 2024 +
+
+
+
+
+ + ☆ Segmentation-Based Parametric Painting + + +
+ We introduce a novel image-to-painting method that facilitates the creation
+of large-scale, high-fidelity paintings with human-like quality and stylistic
+variation. To process large images and gain control over the painting process,
+we introduce a segmentation-based painting process and a dynamic attention map
+approach inspired by human painting strategies, allowing optimization of brush
+strokes to proceed in batches over different image regions, thereby capturing
+both large-scale structure and fine details, while also allowing stylistic
+control over detail. Our optimized batch processing and patch-based loss
+framework enable efficient handling of large canvases, ensuring our painted
+outputs are both aesthetically compelling and functionally superior as compared
+to previous methods, as confirmed by rigorous evaluations. Code available at:
+https://github.com/manuelladron/semantic_based_painting.git
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Bursting Spikes: Efficient and High-performance SNNs for Event-based + Vision + + +
+ Advancing event-driven vision through spiking neural networks (SNNs) is +crucial to empowering high-speed and efficient perception. While directly +converting the pre-trained artificial neural networks (ANNs) - by replacing the +non-linear activation with spiking neurons - can provide SNNs with good +performance, the resultant SNNs typically demand long timesteps and high energy +consumption to achieve their optimal performance. To address this challenge, we +introduce the burst-spike mechanism inspired by the biological nervous system, +allowing multiple spikes per timestep to reduce conversion errors and produce +low-latency SNNs. To further bolster this enhancement, we leverage the Pareto +Frontier-driven algorithm to reallocate burst-firing patterns. Moreover, to +reduce energy consumption during the conversion process, we propose a +sensitivity-driven spike compression technique, which automatically locates the +optimal threshold ratio according to layer-specific sensitivity. Extensive +experiments demonstrate our approach outperforms state-of-the-art SNN methods, +showcasing superior performance and reduced energy usage across classification +and object detection. Our code will be available at +https://github.com/bic-L/burst-ann2snn. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ ZeroPS: High-quality Cross-modal Knowledge Transfer for Zero-Shot 3D + Part Segmentation + + +
+ Recently, many 2D pretrained foundational models have demonstrated impressive
+zero-shot prediction capabilities. In this work, we design a novel pipeline for
+zero-shot 3D part segmentation, called ZeroPS. It transfers high-quality
+knowledge from 2D pretrained foundational models to 3D point clouds. The main
+idea of our approach is to explore the natural relationship between multi-view
+correspondences and the prompt mechanism of foundational models and build
+bridges on it. Our pipeline consists of two components: 1) a self-extension
+component that extends 2D groups from a single viewpoint to spatial
+global-level 3D groups; 2) a multi-modal labeling component that introduces a
+two-dimensional checking mechanism to vote each 2D predicted bounding box to
+the best matching 3D part, and a Class Non-highest Vote Penalty function to
+refine the Vote Matrix. Additionally, a merging algorithm is included to merge
+part-level 3D groups. Extensive evaluation on three zero-shot segmentation
+tasks on the PartnetE dataset achieves state-of-the-art results with significant
+improvements (+19.6%, +5.2% and +4.9%, respectively) over existing methods. Our
+proposed approach does not need any training, fine-tuning or learnable
+parameters. It is hardly affected by domain shift. The code will be released.
+
+ comment: 11 pages, 6 figures; references added +
+
+
+
+
+ + ☆ RSB-Pose: Robust Short-Baseline Binocular 3D Human Pose Estimation with + Occlusion Handling + + +
+ In the domain of 3D Human Pose Estimation, which finds widespread daily +applications, the requirement for convenient acquisition equipment continues to +grow. To satisfy this demand, we set our sights on a short-baseline binocular +setting that offers both portability and a geometric measurement property that +radically mitigates depth ambiguity. However, as the binocular baseline +shortens, two serious challenges emerge: first, the robustness of 3D +reconstruction against 2D errors deteriorates; and second, occlusion reoccurs +due to the limited visual differences between two views. To address the first +challenge, we propose the Stereo Co-Keypoints Estimation module to improve the +view consistency of 2D keypoints and enhance the 3D robustness. In this module, +the disparity is utilized to represent the correspondence of binocular 2D +points and the Stereo Volume Feature is introduced to contain binocular +features across different disparities. Through the regression of SVF, two-view +2D keypoints are simultaneously estimated in a collaborative way which +restricts their view consistency. Furthermore, to deal with occlusions, a +Pre-trained Pose Transformer module is introduced. Through this module, 3D +poses are refined by perceiving pose coherence, a representation of joint +correlations. This perception is injected by the Pose Transformer network and +learned through a pre-training task that recovers iterative masked joints. +Comprehensive experiments carried out on H36M and MHAD datasets, complemented +by visualizations, validate the effectiveness of our approach in the +short-baseline binocular 3D Human Pose Estimation and occlusion handling. + +
+
+ comment: 13 pages, 8 figures, currently under review at IEEE Transactions on + Image Processing journal +
+
+
+
+
+ + ☆ Pseudo-label Correction for Instance-dependent Noise Using + Teacher-student Framework + + +
+ The high capacity of deep learning models to learn complex patterns poses a +significant challenge when confronted with label noise. The inability to +differentiate clean and noisy labels ultimately results in poor generalization. +We approach this problem by reassigning the label for each image using a new +teacher-student based framework termed P-LC (pseudo-label correction). +Traditional teacher-student networks are composed of teacher and student +classifiers for knowledge distillation. In our novel approach, we reconfigure +the teacher network into a triple encoder, leveraging the triplet loss to +establish a pseudo-label correction system. As the student generates pseudo +labels for a set of given images, the teacher learns to choose between the +initially assigned labels and the pseudo labels. Experiments on MNIST, +Fashion-MNIST, and SVHN demonstrate P-LC's superior performance over existing +state-of-the-art methods across all noise levels, most notably in high noise. +In addition, we introduce a noise level estimation to help assess model +performance and inform the need for additional data cleaning procedures. + +
+
+
+
+
+ + ♻ ☆ Visual Dexterity: In-Hand Reorientation of Novel and Complex Object + Shapes + + +
+ In-hand object reorientation is necessary for performing many dexterous +manipulation tasks, such as tool use in less structured environments that +remain beyond the reach of current robots. Prior works built reorientation +systems assuming one or many of the following: reorienting only specific +objects with simple shapes, limited range of reorientation, slow or quasistatic +manipulation, simulation-only results, the need for specialized and costly +sensor suites, and other constraints which make the system infeasible for +real-world deployment. We present a general object reorientation controller +that does not make these assumptions. It uses readings from a single commodity +depth camera to dynamically reorient complex and new object shapes by any +rotation in real-time, with the median reorientation time being close to seven +seconds. The controller is trained using reinforcement learning in simulation +and evaluated in the real world on new object shapes not used for training, +including the most challenging scenario of reorienting objects held in the air +by a downward-facing hand that must counteract gravity during reorientation. +Our hardware platform only uses open-source components that cost less than five +thousand dollars. Although we demonstrate the ability to overcome assumptions +in prior work, there is ample scope for improving absolute performance. For +instance, the challenging duck-shaped object not used for training was dropped +in 56 percent of the trials. When it was not dropped, our controller reoriented +the object within 0.4 radians (23 degrees) 75 percent of the time. Videos are +available at: https://taochenshh.github.io/projects/visual-dexterity. + +
+
+ comment: Published in Science Robotics: + https://www.science.org/doi/10.1126/scirobotics.adc9244 +
+
+
+
+
+ + ♻ ☆ Multi-Visual-Inertial System: Analysis, Calibration and Estimation + + +
+ In this paper, we study state estimation of multi-visual-inertial systems
+(MVIS) and develop sensor fusion algorithms to optimally fuse an arbitrary
+number of asynchronous inertial measurement units (IMUs) or gyroscopes and
+global and/or rolling shutter cameras. We are especially interested in the
+full calibration of the associated visual-inertial sensors, including the IMU
+or camera intrinsics and the IMU-IMU (or camera) spatiotemporal extrinsics as
+well as the image readout time of rolling-shutter cameras (if used). To this
+end, we develop a new analytic combined IMU integration with intrinsics, termed
+ACI3, to preintegrate IMU measurements, which is leveraged to fuse auxiliary
+IMUs and/or gyroscopes alongside a base IMU. We model the multi-inertial
+measurements to include all the necessary inertial intrinsic and IMU-IMU
+spatiotemporal extrinsic parameters, while leveraging IMU-IMU rigid-body
+constraints to eliminate the necessity of auxiliary inertial poses and thus
+reducing computational complexity. By performing observability analysis of
+MVIS, we prove that the standard four unobservable directions remain, no
+matter how many inertial sensors are used, and also identify, for the first
+time, degenerate motions for IMU-IMU spatiotemporal extrinsics and auxiliary
+inertial intrinsics. In addition to the extensive simulations that validate our
+analysis and algorithms, we have built our own MVIS sensor rig and collected
+over 25 real-world datasets to experimentally verify the proposed calibration
+against state-of-the-art calibration methods such as Kalibr. We show that
+the proposed MVIS calibration is able to achieve competing accuracy with
+improved convergence and repeatability, which is open sourced to better benefit
+the community.
+
+
+
+
+ + ♻ ☆ CrossGET: Cross-Guided Ensemble of Tokens for Accelerating + Vision-Language Transformers + + +
+ Recent vision-language models have achieved tremendous progress far beyond
+what we ever expected. However, their computational costs are also dramatically
+growing with rapid development, especially for the large models. It makes model
+acceleration exceedingly critical in a scenario of limited resources. Although
+extensively studied for unimodal models, the acceleration for multimodal
+models, especially the vision-language Transformers, is relatively
+under-explored. To pursue more efficient and accessible vision-language
+Transformers, this paper introduces Cross-Guided Ensemble of Tokens (CrossGET),
+a universal acceleration framework for vision-language Transformers. This framework
+adaptively combines tokens through real-time, cross-modal guidance, thereby
+achieving substantial acceleration while keeping high performance.
+CrossGET has two key innovations: 1) Cross-Guided Matching and Ensemble.
+CrossGET incorporates cross-modal guided token matching and
+ensemble to exploit cross-modal information effectively, only introducing
+cross-modal tokens with negligible extra parameters. 2) Complete-Graph
+Soft Matching. In contrast to the existing bipartite soft matching approach,
+CrossGET introduces a complete-graph soft matching policy to achieve
+more reliable token-matching results while maintaining parallelizability and
+high efficiency. Extensive experiments are conducted on various vision-language
+tasks, including image-text retrieval, visual reasoning, image captioning, and
+visual question answering. Performance on both classic multimodal architectures
+and emerging multimodal LLMs demonstrates the effectiveness and versatility of
+the proposed CrossGET framework. The code will be at
+https://github.com/sdc17/CrossGET.
+
+ comment: Technical Report +
+
+
+
+
+ + ♻ ☆ CMX: Cross-Modal Fusion for RGB-X Semantic Segmentation with + Transformers + + +
+ Scene understanding based on image segmentation is a crucial component of +autonomous vehicles. Pixel-wise semantic segmentation of RGB images can be +advanced by exploiting complementary features from the supplementary modality +(X-modality). However, covering a wide variety of sensors with a +modality-agnostic model remains an unresolved problem due to variations in +sensor characteristics among different modalities. Unlike previous +modality-specific methods, in this work, we propose a unified fusion framework, +CMX, for RGB-X semantic segmentation. To generalize well across different +modalities, that often include supplements as well as uncertainties, a unified +cross-modal interaction is crucial for modality fusion. Specifically, we design +a Cross-Modal Feature Rectification Module (CM-FRM) to calibrate bi-modal +features by leveraging the features from one modality to rectify the features +of the other modality. With rectified feature pairs, we deploy a Feature Fusion +Module (FFM) to perform sufficient exchange of long-range contexts before +mixing. To verify CMX, for the first time, we unify five modalities +complementary to RGB, i.e., depth, thermal, polarization, event, and LiDAR. +Extensive experiments show that CMX generalizes well to diverse multi-modal +fusion, achieving state-of-the-art performances on five RGB-Depth benchmarks, +as well as RGB-Thermal, RGB-Polarization, and RGB-LiDAR datasets. Besides, to +investigate the generalizability to dense-sparse data fusion, we establish an +RGB-Event semantic segmentation benchmark based on the EventScape dataset, on +which CMX sets the new state-of-the-art. The source code of CMX is publicly +available at https://github.com/huaaaliu/RGBX_Semantic_Segmentation. + +
+
+ comment: Accepted to IEEE Transactions on Intelligent Transportation Systems + (T-ITS). The source code of CMX is publicly available at + https://github.com/huaaaliu/RGBX_Semantic_Segmentation +
+
+
+
+
+ + ♻ ☆ Knowledge Accumulation in Continually Learned Representations and the + Issue of Feature Forgetting + + +
+ While it is established that neural networks suffer from catastrophic
+forgetting "at the output level", it is debated whether this is also the case
+at the level of representations. Some studies ascribe a certain level of innate
+robustness to representations, in that they only forget minimally and lose no critical
+information, while others claim that representations are also severely affected
+by forgetting. To settle this debate, we first discuss how this apparent
+disagreement might stem from the coexistence of two phenomena that affect the
+quality of continually learned representations: knowledge accumulation and
+feature forgetting. We then show that, even though it is true that feature
+forgetting can be small in absolute terms, newly learned information is
+forgotten just as catastrophically at the level of representations as it is at
+the output level. Next we show that this feature forgetting is problematic as
+it substantially slows down knowledge accumulation. We further show that
+representations that are continually learned through both supervised and
+self-supervised learning suffer from feature forgetting. Finally, we study how
+feature forgetting and knowledge accumulation are affected by different types
+of continual learning methods.
+
+
+
+
+ + ♻ ☆ Monkey: Image Resolution and Text Label Are Important Things for Large + Multi-modal Models + + +
+ Large Multimodal Models (LMMs) have shown promise in vision-language tasks
+but struggle with high-resolution input and detailed scene understanding.
+Addressing these challenges, we introduce Monkey to enhance LMM capabilities.
+Firstly, Monkey processes input images by dividing them into uniform patches,
+each matching the size (e.g., 448x448) used in the original training of the
+well-trained vision encoder. Equipped with an individual adapter for each patch,
+Monkey can handle higher resolutions up to 1344x896 pixels, enabling the
+detailed capture of complex visual information. Secondly, it employs a
+multi-level description generation method, enriching the context for
+scene-object associations. This two-part strategy ensures more effective
+learning from generated data: the higher resolution allows for a more detailed
+capture of visuals, which in turn enhances the effectiveness of comprehensive
+descriptions. Extensive ablative results validate the effectiveness of our
+designs. Additionally, experiments on 18 datasets further demonstrate that
+Monkey surpasses existing LMMs in many tasks like Image Captioning and various
+Visual Question Answering formats. Especially in qualitative tests focused on
+dense text question answering, Monkey has exhibited encouraging results
+compared with GPT4V. Code is available at
+https://github.com/Yuliang-Liu/Monkey.
+
+
+
+
+ + ♻ ☆ Diagonal Hierarchical Consistency Learning for Semi-supervised Medical + Image Segmentation + + +
+ Medical image segmentation, which is essential for many clinical +applications, has achieved almost human-level performance via data-driven deep +learning technologies. Nevertheless, its performance is predicated upon the +costly process of manually annotating a vast amount of medical images. To this +end, we propose a novel framework for robust semi-supervised medical image +segmentation using diagonal hierarchical consistency learning (DiHC-Net). +First, it is composed of multiple sub-models with identical multi-scale +architecture but with distinct sub-layers, such as up-sampling and +normalisation layers. Second, with mutual consistency, a novel consistency +regularisation is enforced between one model's intermediate and final +prediction and soft pseudo labels from other models in a diagonal hierarchical +fashion. A series of experiments verifies the efficacy of our simple framework, +outperforming all previous approaches on public Left Atrium (LA) dataset. + +
+
+ comment: 5 pages, 2 figures, and 2 tables +
+
+
+
+
+ + ♻ ☆ Hawkeye: A PyTorch-based Library for Fine-Grained Image Recognition with + Deep Learning + + +
+ Fine-Grained Image Recognition (FGIR) is a fundamental and challenging task +in computer vision and multimedia that plays a crucial role in Intellectual +Economy and Industrial Internet applications. However, the absence of a unified +open-source software library covering various paradigms in FGIR poses a +significant challenge for researchers and practitioners in the field. To +address this gap, we present Hawkeye, a PyTorch-based library for FGIR with +deep learning. Hawkeye is designed with a modular architecture, emphasizing +high-quality code and human-readable configuration, providing a comprehensive +solution for FGIR tasks. In Hawkeye, we have implemented 16 state-of-the-art +fine-grained methods, covering 6 different paradigms, enabling users to explore +various approaches for FGIR. To the best of our knowledge, Hawkeye represents +the first open-source PyTorch-based library dedicated to FGIR. It is publicly +available at https://github.com/Hawkeye-FineGrained/Hawkeye/, providing +researchers and practitioners with a powerful tool to advance their research +and development in the field of FGIR. + +
+
+ comment: ACM Multimedia 2023 Open Source Software Competition Winner Entry. + X.-S. Wei is the corresponding author +
+
+
+
+
+ + ♻ ☆ Interpretable and intervenable ultrasonography-based machine learning + models for pediatric appendicitis + + +
+ Appendicitis is among the most frequent reasons for pediatric abdominal +surgeries. Previous decision support systems for appendicitis have focused on +clinical, laboratory, scoring, and computed tomography data and have ignored +abdominal ultrasound, despite its noninvasive nature and widespread +availability. In this work, we present interpretable machine learning models +for predicting the diagnosis, management and severity of suspected appendicitis +using ultrasound images. Our approach utilizes concept bottleneck models (CBM) +that facilitate interpretation and interaction with high-level concepts +understandable to clinicians. Furthermore, we extend CBMs to prediction +problems with multiple views and incomplete concept sets. Our models were +trained on a dataset comprising 579 pediatric patients with 1709 ultrasound +images accompanied by clinical and laboratory data. Results show that our +proposed method enables clinicians to utilize a human-understandable and +intervenable predictive model without compromising performance or requiring +time-consuming image annotation when deployed. For predicting the diagnosis, +the extended multiview CBM attained an AUROC of 0.80 and an AUPR of 0.92, +performing comparably to similar black-box neural networks trained and tested +on the same dataset. + +
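Concept bottleneck models have a simple generic structure; the sketch below is a minimal, hypothetical PyTorch illustration (made-up layer sizes and concept count, not the authors' multiview extension): an encoder predicts clinician-interpretable concepts, a small head maps concepts to the prediction, and an intervention overwrites selected predicted concepts with expert-provided values.

```python
# Minimal concept bottleneck model (CBM) sketch; hypothetical dimensions.
import torch
import torch.nn as nn

class ConceptBottleneckModel(nn.Module):
    def __init__(self, feat_dim=512, n_concepts=10, n_classes=2):
        super().__init__()
        self.encoder = nn.Sequential(nn.Flatten(), nn.LazyLinear(feat_dim), nn.ReLU())
        self.concept_head = nn.Linear(feat_dim, n_concepts)  # interpretable concepts
        self.label_head = nn.Linear(n_concepts, n_classes)   # prediction from concepts only

    def forward(self, x, intervention=None):
        concepts = torch.sigmoid(self.concept_head(self.encoder(x)))
        if intervention is not None:
            mask, values = intervention            # clinician overwrites some concepts
            concepts = torch.where(mask, values, concepts)
        return concepts, self.label_head(concepts)

model = ConceptBottleneckModel()
x = torch.randn(4, 1, 64, 64)                      # dummy ultrasound-like images
concepts, logits = model(x)                        # plain prediction
mask = torch.zeros_like(concepts, dtype=torch.bool)
mask[:, 0] = True                                  # intervene on the first concept
_, logits_intervened = model(x, (mask, torch.ones_like(concepts)))
```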
+
+ comment: Published in Medical Image Analysis (Elsevier) +
+
+
+
+
+ + ♻ ☆ Upgrading VAE Training With Unlimited Data Plans Provided by Diffusion + Models + + +
+ Variational autoencoders (VAEs) are popular models for representation +learning but their encoders are susceptible to overfitting (Cremer et al., +2018) because they are trained on a finite training set instead of the true +(continuous) data distribution $p_{\mathrm{data}}(\mathbf{x})$. Diffusion +models, on the other hand, avoid this issue by keeping the encoder fixed. This +makes their representations less interpretable, but it simplifies training, +enabling accurate and continuous approximations of +$p_{\mathrm{data}}(\mathbf{x})$. In this paper, we show that overfitting +encoders in VAEs can be effectively mitigated by training on samples from a +pre-trained diffusion model. These results are somewhat unexpected as recent +findings (Alemohammad et al., 2023; Shumailov et al., 2023) observe a decay in +generative performance when models are trained on data generated by another +generative model. We analyze generalization performance, amortization gap, and +robustness of VAEs trained with our proposed method on three different data +sets. We find improvements in all metrics compared to both normal training and +conventional data augmentation methods, and we show that a modest amount of +samples from the diffusion model suffices to obtain these gains. + +
+
+ comment: 9 pages + appendix +
+
+
+
+
+ + ♻ ☆ DeepDC: Deep Distance Correlation as a Perceptual Image Quality + Evaluator + + +
+ ImageNet pre-trained deep neural networks (DNNs) show notable transferability +for building effective image quality assessment (IQA) models. Such a remarkable +byproduct has often been identified as an emergent property in previous +studies. In this work, we attribute such capability to the intrinsic +texture-sensitive characteristic that classifies images using texture features. +We fully exploit this characteristic to develop a novel full-reference IQA +(FR-IQA) model based exclusively on pre-trained DNN features. Specifically, we +compute the distance correlation, a highly promising yet relatively +under-investigated statistic, between reference and distorted images in the +deep feature domain. In addition, the distance correlation quantifies both +linear and nonlinear feature relationships, which is far beyond the widely used +first-order and second-order statistics in the feature space. We conduct +comprehensive experiments to demonstrate the superiority of the proposed +quality model on five standard IQA datasets, one perceptual similarity dataset, +two texture similarity datasets, and one geometric transformation dataset. +Moreover, we optimize the proposed model to generate a broad spectrum of +texture patterns, by treating the model as the style loss function for neural +style transfer (NST). Extensive experiments demonstrate that the proposed +texture synthesis and NST methods achieve the best quantitative and qualitative +results. We release our code at https://github.com/h4nwei/DeepDC. + +
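Distance correlation itself is a standard statistic; a minimal NumPy version computed between two sets of deep features (toy shapes, not the authors' exact feature extraction) could look like this:

```python
import numpy as np

def distance_correlation(X, Y):
    """Empirical distance correlation between paired samples X (n, p) and Y (n, q)."""
    def centered_distances(Z):
        D = np.linalg.norm(Z[:, None, :] - Z[None, :, :], axis=-1)  # pairwise distances
        return D - D.mean(0, keepdims=True) - D.mean(1, keepdims=True) + D.mean()
    A, B = centered_distances(X), centered_distances(Y)
    dcov2 = max((A * B).mean(), 0.0)
    dvar = np.sqrt((A * A).mean() * (B * B).mean()) + 1e-12
    return np.sqrt(dcov2 / dvar)

# Toy usage: compare deep features of a reference and a distorted image
# (rows could be spatial positions, columns channels).
ref = np.random.randn(32, 256)
distorted = ref + 0.1 * np.random.randn(32, 256)
print(distance_correlation(ref, distorted))   # close to 1 for mild distortions
```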
+
+
+
+
+ + ♻ ☆ RBPGAN: Recurrent Back-Projection GAN for Video Super Resolution + + +
+ Recently, video super resolution (VSR) has become a very impactful task in +the area of Computer Vision due to its various applications. In this paper, we +propose Recurrent Back-Projection Generative Adversarial Network (RBPGAN) for +VSR in an attempt to generate temporally coherent solutions while preserving +spatial details. RBPGAN integrates two state-of-the-art models to get the best +in both worlds without compromising the accuracy of produced video. The +generator of the model is inspired by RBPN system, while the discriminator is +inspired by TecoGAN. We also utilize Ping-Pong loss to increase temporal +consistency over time. Our contribution together results in a model that +outperforms earlier work in terms of temporally consistent details, as we will +demonstrate qualitatively and quantitatively using different datasets. + +
+
+
+
+
+ + ♻ ☆ Improved Breast Cancer Diagnosis through Transfer Learning on + Hematoxylin and Eosin Stained Histology Images + + +
+ Breast cancer is one of the leading causes of death for women worldwide.
+Early screening is essential for early identification, but the chance of
+survival declines as the cancer progresses into advanced stages. For this
+study, the most recent BRACS dataset of histological (H\&E) stained images was
+used to classify breast cancer tumours; it contains both whole-slide images
+(WSI) and region-of-interest (ROI) images, and we have considered only the ROI
+images. We experimented with different deep learning models, such as Xception,
+EfficientNet, ResNet50, and InceptionResNet, pre-trained on ImageNet weights.
+We pre-processed the BRACS ROI along with image augmentation, upsampling, and
+dataset split strategies. For the default dataset split, the best results were
+obtained by ResNet50, achieving a 66% F1-score. For the custom dataset split,
+the best results were obtained by performing upsampling and image augmentation,
+which resulted in a 96.2% F1-score. Our second approach also reduced the number
+of false positive and false negative classifications to less than 3% for each
+class. We believe that our study significantly impacts the early diagnosis and
+identification of breast cancer tumors and their subtypes, especially atypical
+and malignant tumors, thus improving patient outcomes and reducing patient
+mortality rates. Overall, this study has primarily focused on identifying seven
+(7) breast cancer tumor subtypes, and we believe that the experimental models
+can be fine-tuned further to generalize over previous breast cancer histology
+datasets as well.
+
+
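As a rough illustration of this kind of transfer learning (hypothetical hyperparameters, not the authors' exact training setup), one can replace the ImageNet head of a pre-trained ResNet50 with a 7-class layer and fine-tune:

```python
import torch
import torch.nn as nn
from torchvision import models

# Load an ImageNet-pretrained ResNet50 and adapt it to 7 tumour subtypes.
# The `weights` enum is the recent torchvision API; older versions use pretrained=True.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
model.fc = nn.Linear(model.fc.in_features, 7)     # new classification head

# Optionally freeze the backbone and train only the new head at first.
for name, param in model.named_parameters():
    param.requires_grad = name.startswith("fc")

optimizer = torch.optim.Adam((p for p in model.parameters() if p.requires_grad), lr=1e-4)
criterion = nn.CrossEntropyLoss()

# One illustrative step on a dummy ROI-sized batch.
images, labels = torch.randn(8, 3, 224, 224), torch.randint(0, 7, (8,))
loss = criterion(model(images), labels)
loss.backward()
optimizer.step()
```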
+
+ comment: 12 pages, 4 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Scale-Adaptive Feature Aggregation for Efficient Space-Time Video + Super-Resolution WACV2024 + + +
+ The Space-Time Video Super-Resolution (STVSR) task aims to enhance the visual +quality of videos, by simultaneously performing video frame interpolation (VFI) +and video super-resolution (VSR). However, facing the challenge of the +additional temporal dimension and scale inconsistency, most existing STVSR +methods are complex and inflexible in dynamically modeling different motion +amplitudes. In this work, we find that choosing an appropriate processing scale +achieves remarkable benefits in flow-based feature propagation. We propose a +novel Scale-Adaptive Feature Aggregation (SAFA) network that adaptively selects +sub-networks with different processing scales for individual samples. +Experiments on four public STVSR benchmarks demonstrate that SAFA achieves +state-of-the-art performance. Our SAFA network outperforms recent +state-of-the-art methods such as TMNet and VideoINR by an average improvement +of over 0.5dB on PSNR, while requiring less than half the number of parameters +and only 1/3 computational costs. + +
+
+ comment: WACV2024, 16 pages +
+
+
+
+
+ + ♻ ☆ Dynamic Sub-Cluster-Aware Network for Few-Shot Skin Disease + Classification + + +
+ This paper addresses the problem of few-shot skin disease classification by +introducing a novel approach called the Sub-Cluster-Aware Network (SCAN) that +enhances accuracy in diagnosing rare skin diseases. The key insight motivating +the design of SCAN is the observation that skin disease images within a class +often exhibit multiple sub-clusters, characterized by distinct variations in +appearance. To improve the performance of few-shot learning, we focus on +learning a high-quality feature encoder that captures the unique sub-clustered +representations within each disease class, enabling better characterization of +feature distributions. Specifically, SCAN follows a dual-branch framework, +where the first branch learns class-wise features to distinguish different skin +diseases, and the second branch aims to learn features which can effectively +partition each class into several groups so as to preserve the sub-clustered +structure within each class. To achieve the objective of the second branch, we +present a cluster loss to learn image similarities via unsupervised clustering. +To ensure that the samples in each sub-cluster are from the same class, we +further design a purity loss to refine the unsupervised clustering results. We +evaluate the proposed approach on two public datasets for few-shot skin disease +classification. The experimental results validate that our framework +outperforms the state-of-the-art methods by around 2% to 5% in terms of +sensitivity, specificity, accuracy, and F1-score on the SD-198 and Derm7pt +datasets. + +
+
+ comment: Accepted by TNNLS 2023 +
+
+
+
+
+ + ♻ ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
+
+ comment: This paper integrates the works arXiv:2306.01129 and arXiv:2308.16271 + into a complete story. In this paper, we improve the writing and + organization, and also add conceptual, empirical, and theoretical + improvements over the previous work. V2: small typo fixes and formatting + improvements +
+
+
+
+
+
+ ♻ ☆ CASR: Refining Action Segmentation via Marginalizing Frame-level Causal
+ Relationships
+
+
+ Integrating deep learning and causal discovery has increased the
+interpretability of Temporal Action Segmentation (TAS) tasks. However,
+frame-level causal relationships contain many complicated noises beyond the
+segment level, making it infeasible to directly express macro action semantics.
+Thus, we propose the Causal Abstraction Segmentation Refiner (CASR), which can
+refine TAS results from various models by enhancing video causality through
+marginalizing frame-level causal relationships. Specifically, we define an
+equivalent frame-level causal model and a segment-level causal model, so that
+the causal adjacency matrix constructed from marginalized frame-level causal
+relationships is able to represent the segment-level causal relationships. CASR
+works by reducing the difference between the causal adjacency matrix we
+construct and the pre-segmentation results of backbone models. In addition, we
+propose a novel evaluation metric, Causal Edit Distance (CED), to evaluate
+causal interpretability. Extensive experimental results on mainstream datasets
+indicate that CASR significantly surpasses various existing methods in action
+segmentation performance, as well as in causal explainability and
+generalization.
+
+
+
+
+
+
+ + ♻ ☆ OVO: One-shot Vision Transformer Search with Online distillation + + +
+ Pure transformers have shown great potential for vision tasks recently.
+However, their accuracy on small or medium-sized datasets is not satisfactory.
+Although some existing methods introduce a CNN as a teacher to guide the
+training process by distillation, the gap between teacher and student networks
+would lead to sub-optimal performance. In this work, we propose a new One-shot
+Vision transformer search framework with Online distillation, namely OVO. OVO
+samples sub-nets for both teacher and student networks for better distillation
+results. Benefiting from the online distillation, thousands of subnets in the
+supernet are well-trained without extra finetuning or retraining. In
+experiments, OVO-Ti achieves 73.32% top-1 accuracy on ImageNet and 75.2% on
+CIFAR-100.
+
+
+
+ comment: The work is not implemented +
+
+
+
+
+ + ♻ ☆ CIEM: Contrastive Instruction Evaluation Method for Better Instruction + Tuning + + +
+ Nowadays, the research on Large Vision-Language Models (LVLMs) has been
+significantly promoted thanks to the success of Large Language Models (LLMs).
+Nevertheless, these Vision-Language Models (VLMs) suffer from the drawback of
+hallucination -- due to insufficient understanding of vision and language
+modalities, VLMs may generate incorrect perception information in downstream
+applications, for example, captioning a non-existent entity. To address the
+hallucination phenomenon, on the one hand, we introduce a Contrastive
+Instruction Evaluation Method (CIEM), which is an automatic pipeline that
+leverages an annotated image-text dataset coupled with an LLM to generate
+factual/contrastive question-answer pairs for the evaluation of the
+hallucination of VLMs. On the other hand, based on CIEM, we further propose a
+new instruction tuning method called CIT (the abbreviation of Contrastive
+Instruction Tuning) to alleviate the hallucination of VLMs by automatically
+producing high-quality factual/contrastive question-answer pairs and
+corresponding justifications for model tuning. Through extensive experiments on
+CIEM and CIT, we pinpoint the hallucination issues commonly present in existing
+VLMs, the inability of current instruction-tuning datasets to handle the
+hallucination phenomenon, and the superiority of CIT-tuned VLMs over both CIEM
+and public datasets.
+
+
+
+
+
+
+ + ♻ ☆ Task-Robust Pre-Training for Worst-Case Downstream Adaptation + + +
+ Pre-training has achieved remarkable success when transferred to downstream
+tasks. In machine learning, we care about not only the good performance of a
+model but also its behavior under reasonable shifts of conditions. The same
+philosophy holds when pre-training a foundation model. However, the foundation
+model may not behave uniformly well across a series of related downstream
+tasks. This happens, for example, in mask-recovery regression when the recovery
+ability or the training instances diverge between pre-training and a downstream
+task: pattern features are extracted dominantly during pre-training, but
+semantic features are also required downstream. This paper considers
+pre-training a model that guarantees uniformly good performance over the
+downstream tasks. We call this goal $\textit{downstream-task robustness}$. Our
+method first separates the upstream task into several representative ones and
+applies a simple minimax loss for pre-training. We then design an efficient
+algorithm to solve the minimax loss and prove its convergence in the convex
+setting. In the experiments, we show on both large-scale natural language
+processing and computer vision datasets that our method improves the metrics on
+worst-case downstream tasks. Additionally, some theoretical explanations for
+why our loss is beneficial are provided. Specifically, we show that in some
+cases fewer samples are inherently required for the most challenging downstream
+task.
+
+
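To make the minimax objective concrete, here is a deliberately simplified sketch (my own toy version, not the authors' algorithm or its convergence-guaranteed solver): the shared model is updated against the worst of several representative upstream task losses.

```python
import torch

def worst_case_loss(task_losses):
    """Minimax pre-training surrogate: minimize the maximum per-task loss."""
    return torch.stack(task_losses).max()

# Toy usage: one shared model, three representative upstream "tasks".
model = torch.nn.Linear(16, 1)
tasks = [(torch.randn(32, 16), torch.randn(32, 1)) for _ in range(3)]
losses = [torch.nn.functional.mse_loss(model(x), y) for x, y in tasks]
worst_case_loss(losses).backward()   # gradient flows only through the worst task here
```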
+
+
+
+
+ + ♻ ☆ Cultural and Linguistic Diversity Improves Visual Representations + + +
+ Computer vision often treats perception as objective, and this assumption +gets reflected in the way that datasets are collected and models are trained. +For instance, image descriptions in different languages are typically assumed +to be translations of the same semantic content. However, work in +cross-cultural psychology and linguistics has shown that individuals differ in +their visual perception depending on their cultural background and the language +they speak. In this paper, we demonstrate significant differences in semantic +content across languages in both dataset and model-produced captions. When data +is multilingual as opposed to monolingual, captions have higher semantic +coverage on average, as measured by scene graph, embedding, and linguistic +complexity. For example, multilingual captions have on average 21.8% more +objects, 24.5% more relations, and 27.1% more attributes than a set of +monolingual captions. Moreover, models trained on content from different +languages perform best against test data from those languages, while those +trained on multilingual content perform consistently well across all evaluation +data compositions. Our research provides implications for how diverse modes of +perception can improve image understanding. + +
+
+
+
+
+ + ♻ ☆ Unsupervised Disentangling of Facial Representations with 3D-aware + Latent Diffusion Models + + +
+ Unsupervised learning of facial representations has gained increasing +attention for face understanding ability without heavily relying on large-scale +annotated datasets. However, it remains unsolved due to the coupling of facial +identities, expressions, and external factors like pose and light. Prior +methods primarily focus on 2D factors and pixel-level consistency, leading to +incomplete disentangling and suboptimal performance in downstream tasks. In +this paper, we propose LatentFace, a novel unsupervised disentangling framework +for facial expression and identity representation. We suggest the disentangling +problem should be performed in latent space and propose the solution using a +3D-aware latent diffusion model. First, we introduce a 3D-aware autoencoder to +encode face images into 3D latent embeddings. Second, we propose a novel +representation diffusion model (RDM) to disentangle 3D latent into facial +identity and expression. Consequently, our method achieves state-of-the-art +performance in facial expression recognition and face verification among +unsupervised facial representation learning models. Codes are available at +\url{https://github.com/ryanhe312/LatentFace}. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Visual Acoustic Matching NeurIPS 2023 + + +
+ Acoustic matching aims to re-synthesize an audio clip to sound as if it were +recorded in a target acoustic environment. Existing methods assume access to +paired training data, where the audio is observed in both source and target +environments, but this limits the diversity of training data or requires the +use of simulated data or heuristics to create paired samples. We propose a +self-supervised approach to visual acoustic matching where training samples +include only the target scene image and audio -- without acoustically +mismatched source audio for reference. Our approach jointly learns to +disentangle room acoustics and re-synthesize audio into the target environment, +via a conditional GAN framework and a novel metric that quantifies the level of +residual acoustic information in the de-biased audio. Training with either +in-the-wild web data or simulated data, we demonstrate it outperforms the +state-of-the-art on multiple challenging datasets and a wide variety of +real-world audio and environments. + +
+
+ comment: Project page: https://vision.cs.utexas.edu/projects/ss_vam/ . + Accepted at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ MovieChat: From Dense Token to Sparse Memory for Long Video + Understanding + + +
+ Recently, integrating video foundation models and large language models to +build a video understanding system can overcome the limitations of specific +pre-defined vision tasks. Yet, existing systems can only handle videos with +very few frames. For long videos, the computation complexity, memory cost, and +long-term temporal connection impose additional challenges. Taking advantage of +the Atkinson-Shiffrin memory model, with tokens in Transformers being employed +as the carriers of memory in combination with our specially designed memory +mechanism, we propose the MovieChat to overcome these challenges. MovieChat +achieves state-of-the-art performance in long video understanding, along with +the released MovieChat-1K benchmark with 1K long video and 14K manual +annotations for validation of the effectiveness of our method. + +
+
+ comment: Preprint. Project Website https://rese1f.github.io/MovieChat/ +
+
+
+
+
+ + ♻ ☆ Learning Unsupervised World Models for Autonomous Driving via Discrete + Diffusion + + +
+ Learning world models can teach an agent how the world works in an +unsupervised manner. Even though it can be viewed as a special case of sequence +modeling, progress for scaling world models on robotic applications such as +autonomous driving has been somewhat less rapid than scaling language models +with Generative Pre-trained Transformers (GPT). We identify two reasons as +major bottlenecks: dealing with complex and unstructured observation space, and +having a scalable generative model. Consequently, we propose a novel world +modeling approach that first tokenizes sensor observations with VQVAE, then +predicts the future via discrete diffusion. To efficiently decode and denoise +tokens in parallel, we recast Masked Generative Image Transformer into the +discrete diffusion framework with a few simple changes, resulting in notable +improvement. When applied to learning world models on point cloud observations, +our model reduces prior SOTA Chamfer distance by more than 65% for 1s +prediction, and more than 50% for 3s prediction, across NuScenes, KITTI +Odometry, and Argoverse2 datasets. Our results demonstrate that discrete +diffusion on tokenized agent experience can unlock the power of GPT-like +unsupervised learning for robotic agents. + +
+
+
+
+
+
+
+
+ + Information Retrieval 4 + +
+
+
+ + ☆ GPT Struct Me: Probing GPT Models on Narrative Entity Extraction + + +
+ The importance of systems that can extract structured information from +textual data becomes increasingly pronounced given the ever-increasing volume +of text produced on a daily basis. Having a system that can effectively extract +such information in an interoperable manner would be an asset for several +domains, be it finance, health, or legal. Recent developments in natural +language processing led to the production of powerful language models that can, +to some degree, mimic human intelligence. Such effectiveness raises a pertinent +question: Can these models be leveraged for the extraction of structured +information? In this work, we address this question by evaluating the +capabilities of two state-of-the-art language models -- GPT-3 and GPT-3.5, +commonly known as ChatGPT -- in the extraction of narrative entities, namely +events, participants, and temporal expressions. This study is conducted on the +Text2Story Lusa dataset, a collection of 119 Portuguese news articles whose +annotation framework includes a set of entity structures along with several +tags and attribute values. We first select the best prompt template through an +ablation study over prompt components that provide varying degrees of +information on a subset of documents of the dataset. Subsequently, we use the +best templates to evaluate the effectiveness of the models on the remaining +documents. The results obtained indicate that GPT models are competitive with +out-of-the-box baseline systems, presenting an all-in-one alternative for +practitioners with limited resources. By studying the strengths and limitations +of these models in the context of information extraction, we offer insights +that can guide future improvements and avenues to explore in this field. + +
+
+
+
+
+ + ☆ Benchmarking Robustness of Text-Image Composed Retrieval NeurIPS 2023 + + +
+ Text-image composed retrieval aims to retrieve the target image through the +composed query, which is specified in the form of an image plus some text that +describes desired modifications to the input image. It has recently attracted +attention due to its ability to leverage both information-rich images and +concise language to precisely express the requirements for target images. +However, the robustness of these approaches against real-world corruptions or +further text understanding has never been studied. In this paper, we perform +the first robustness study and establish three new diversified benchmarks for +systematic analysis of text-image composed retrieval against natural +corruptions in both vision and text and further probe textural understanding. +For natural corruption analysis, we introduce two new large-scale benchmark +datasets, CIRR-C and FashionIQ-C for testing in open domain and fashion domain +respectively, both of which apply 15 visual corruptions and 7 textural +corruptions. For textural understanding analysis, we introduce a new diagnostic +dataset CIRR-D by expanding the original raw data with synthetic data, which +contains modified text to better probe textual understanding ability including +numerical variation, attribute variation, object removal, background variation, +and fine-grained evaluation. The code and benchmark datasets are available at +https://github.com/SunTongtongtong/Benchmark-Robustness-Text-Image-Compose-Retrieval. + +
+
+ comment: Accepted by R0-FoMo: Workshop on Robustness of Few-shot and Zero-shot + Learning in Foundation Models at NeurIPS 2023 +
+
+
+
+
+ + ☆ Anomaly detection in cross-country money transfer temporal networks + + +
+ During the last decades, Anti-Financial Crime (AFC) entities and Financial +Institutions have put a constantly increasing effort to reduce financial crime +and detect fraudulent activities, that are changing and developing in extremely +complex ways. We propose an anomaly detection approach based on network +analysis to help AFC officers navigating through the high load of information +that is typical of AFC data-driven scenarios. By experimenting on a large +financial dataset of more than 80M cross-country wire transfers, we leverage on +the properties of complex networks to develop a tool for explainable anomaly +detection, that can help in identifying outliers that could be engaged in +potentially malicious activities according to financial regulations. We +identify a set of network centrality measures that provide useful insights on +individual nodes; by keeping track of the evolution over time of the +centrality-based node rankings, we are able to highlight sudden and unexpected +changes in the roles of individual nodes that deserve further attention by AFC +officers. Such changes can hardly be noticed by means of current AFC practices, +that sometimes can lack a higher-level, global vision of the system. This +approach represents a preliminary step in the automation of AFC and AML +processes, serving the purpose of facilitating the work of AFC officers by +providing them with a top-down view of the picture emerging from financial +data. + +
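A toy sketch of the centrality-tracking idea with NetworkX (fabricated transfers and an arbitrary threshold, purely to illustrate the mechanics, not the paper's pipeline):

```python
import networkx as nx

def centrality_ranks(edges):
    """Rank nodes of one transfer-network snapshot by weighted PageRank centrality."""
    G = nx.DiGraph()
    G.add_weighted_edges_from(edges)                 # (sender, receiver, amount)
    scores = nx.pagerank(G, weight="weight")
    ordered = sorted(scores, key=scores.get, reverse=True)
    return {node: rank for rank, node in enumerate(ordered)}

# Two consecutive snapshots of wire transfers.
snapshot_t0 = [("A", "B", 10.0), ("B", "C", 5.0), ("C", "A", 2.0), ("D", "A", 1.0)]
snapshot_t1 = [("A", "B", 10.0), ("D", "B", 50.0), ("D", "C", 40.0), ("C", "A", 2.0)]

r0, r1 = centrality_ranks(snapshot_t0), centrality_ranks(snapshot_t1)
for node in sorted(set(r0) & set(r1)):
    jump = r0[node] - r1[node]                       # positive = node climbed in the ranking
    if abs(jump) >= 2:                               # made-up alert threshold
        print(f"node {node}: centrality rank changed by {jump}")
```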
+
+
+
+
+ + ♻ ☆ A Systematical Evaluation for Next-Basket Recommendation Algorithms + + +
+ Next basket recommender systems (NBRs) aim to recommend a user's next +(shopping) basket of items via modeling the user's preferences towards items +based on the user's purchase history, usually a sequence of historical baskets. +Due to its wide applicability in the real-world E-commerce industry, the +studies NBR have attracted increasing attention in recent years. NBRs have been +widely studied and much progress has been achieved in this area with a variety +of NBR approaches having been proposed. However, an important issue is that +there is a lack of a systematic and unified evaluation over the various NBR +approaches. Different studies often evaluate NBR approaches on different +datasets, under different experimental settings, making it hard to fairly and +effectively compare the performance of different NBR approaches. To bridge this +gap, in this work, we conduct a systematical empirical study in NBR area. +Specifically, we review the representative work in NBR and analyze their cons +and pros. Then, we run the selected NBR algorithms on the same datasets, under +the same experimental setting and evaluate their performances using the same +measurements. This provides a unified framework to fairly compare different NBR +approaches. We hope this study can provide a valuable reference for the future +research in this vibrant area. + +
+
+
+
+
+
+
+
+ + Machine Learning 107 + +
+
+
+ + ☆ Differentiable and accelerated spherical harmonic and Wigner transforms + + +
+ Many areas of science and engineering encounter data defined on spherical +manifolds. Modelling and analysis of spherical data often necessitates +spherical harmonic transforms, at high degrees, and increasingly requires +efficient computation of gradients for machine learning or other differentiable +programming tasks. We develop novel algorithmic structures for accelerated and +differentiable computation of generalised Fourier transforms on the sphere +$\mathbb{S}^2$ and rotation group $\text{SO}(3)$, i.e. spherical harmonic and +Wigner transforms, respectively. We present a recursive algorithm for the +calculation of Wigner $d$-functions that is both stable to high harmonic +degrees and extremely parallelisable. By tightly coupling this with separable +spherical transforms, we obtain algorithms that exhibit an extremely +parallelisable structure that is well-suited for the high throughput computing +of modern hardware accelerators (e.g. GPUs). We also develop a hybrid automatic +and manual differentiation approach so that gradients can be computed +efficiently. Our algorithms are implemented within the JAX differentiable +programming framework in the S2FFT software code. Numerous samplings of the +sphere are supported, including equiangular and HEALPix sampling. Computational +errors are at the order of machine precision for spherical samplings that admit +a sampling theorem. When benchmarked against alternative C codes we observe up +to a 400-fold acceleration. Furthermore, when distributing over multiple GPUs +we achieve very close to optimal linear scaling with increasing number of GPUs +due to the highly parallelised and balanced nature of our algorithms. Provided +access to sufficiently many GPUs our transforms thus exhibit an unprecedented +effective linear time complexity. + +
+
+ comment: 30 pages, 7 figures, code available at + https://github.com/astro-informatics/s2fft +
+
+
+
+
+ + ☆ Convergence Analysis for Learning Orthonormal Deep Linear Neural + Networks + + +
+ Enforcing orthonormal or isometric property for the weight matrices has been +shown to enhance the training of deep neural networks by mitigating gradient +exploding/vanishing and increasing the robustness of the learned networks. +However, despite its practical performance, the theoretical analysis of +orthonormality in neural networks is still lacking; for example, how +orthonormality affects the convergence of the training process. In this letter, +we aim to bridge this gap by providing convergence analysis for training +orthonormal deep linear neural networks. Specifically, we show that Riemannian +gradient descent with an appropriate initialization converges at a linear rate +for training orthonormal deep linear neural networks with a class of loss +functions. Unlike existing works that enforce orthonormal weight matrices for +all the layers, our approach excludes this requirement for one layer, which is +crucial to establish the convergence guarantee. Our results shed light on how +increasing the number of hidden layers can impact the convergence speed. +Experimental results validate our theoretical analysis. + +
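For intuition, a single textbook Riemannian gradient-descent step on the Stiefel manifold (the generic projection-plus-QR-retraction recipe, not necessarily the exact update analyzed in the paper) looks like this:

```python
import numpy as np

def stiefel_step(W, euclid_grad, lr=0.1):
    """One Riemannian gradient step keeping W (n x p, W^T W = I) orthonormal."""
    sym = 0.5 * (W.T @ euclid_grad + euclid_grad.T @ W)
    riem_grad = euclid_grad - W @ sym            # project onto the tangent space at W
    Q, R = np.linalg.qr(W - lr * riem_grad)      # retract back to the manifold via QR
    return Q * np.sign(np.diag(R))               # fix column signs for a canonical Q

# Toy check: the iterate stays (numerically) orthonormal.
rng = np.random.default_rng(0)
W, _ = np.linalg.qr(rng.standard_normal((8, 3)))
W_next = stiefel_step(W, rng.standard_normal((8, 3)))
print(np.allclose(W_next.T @ W_next, np.eye(3), atol=1e-8))   # True
```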
+
+
+
+
+ + ☆ JetLOV: Enhancing Jet Tree Tagging through Neural Network Learning of + Optimal LundNet Variables NeurIPS 2023 + + +
+ Machine learning has played a pivotal role in advancing physics, with deep +learning notably contributing to solving complex classification problems such +as jet tagging in the field of jet physics. In this experiment, we aim to +harness the full potential of neural networks while acknowledging that, at +times, we may lose sight of the underlying physics governing these models. +Nevertheless, we demonstrate that we can achieve remarkable results obscuring +physics knowledge and relying completely on the model's outcome. We introduce +JetLOV, a composite comprising two models: a straightforward multilayer +perceptron (MLP) and the well-established LundNet. Our study reveals that we +can attain comparable jet tagging performance without relying on the +pre-computed LundNet variables. Instead, we allow the network to autonomously +learn an entirely new set of variables, devoid of a priori knowledge of the +underlying physics. These findings hold promise, particularly in addressing the +issue of model dependence, which can be mitigated through generalization and +training on diverse data sets. + +
+
+ comment: Accepted at the NeurIPS 2023 workshop: Machine Learning and the + Physical Sciences +
+
+
+
+
+ + ☆ Data-driven Prior Learning for Bayesian Optimisation NeurIPS 2023 + + +
+ Transfer learning for Bayesian optimisation has generally assumed a strong +similarity between optimisation tasks, with at least a subset having similar +optimal inputs. This assumption can reduce computational costs, but it is +violated in a wide range of optimisation problems where transfer learning may +nonetheless be useful. We replace this assumption with a weaker one only +requiring the shape of the optimisation landscape to be similar, and analyse +the recent method Prior Learning for Bayesian Optimisation - PLeBO - in this +setting. By learning priors for the hyperparameters of the Gaussian process +surrogate model we can better approximate the underlying function, especially +for few function evaluations. We validate the learned priors and compare to a +breadth of transfer learning approaches, using synthetic data and a recent air +pollution optimisation problem as benchmarks. We show that PLeBO and prior +transfer find good inputs in fewer evaluations. + +
+
+ comment: To be presented at the NeurIPS 2023 Workshop on Adaptive Experimental + Design and Active Learning in the Real World +
+
+
+
+
+ + ☆ One Pass Streaming Algorithm for Super Long Token Attention + Approximation in Sublinear Space + + +
+ Deploying Large Language Models (LLMs) in streaming applications that involve
+long contexts, particularly for extended dialogues and text analysis, is of
+paramount importance but presents two significant challenges. Firstly, memory
+consumption is substantial during the decoding phase due to the caching of Key
+and Value states (KV) of previous tokens. Secondly, attention computation is
+time-consuming, with a time complexity of $O(n^2)$ for the generation of each
+token. At the recent OpenAI DevDay (Nov 6, 2023), OpenAI released a new model
+that is able to support a 128K-long document; in our paper, we focus on the
+memory-efficiency issue when the context length $n$ is much greater than 128K
+($n \gg 2^d$). Considering a single-layer self-attention with Query, Key, and
+Value matrices $Q, K, V \in \mathbb{R}^{n \times d}$, the polynomial method
+approximates the attention output $T \in \mathbb{R}^{n \times d}$. It
+accomplishes this by constructing $U_1, U_2 \in \mathbb{R}^{n \times t}$ to
+expedite the computation of the attention ${\sf Attn}(Q, K, V)$ in $n^{1+o(1)}$
+time. Despite this, storing the Key and Value matrices $K, V \in \mathbb{R}^{n
+\times d}$ still necessitates $O(nd)$ space, leading to significant memory
+usage. In response to these challenges, we introduce a new algorithm that reads
+the data in a single streaming pass. This method employs sublinear space $o(n)$
+to store three sketch matrices, alleviating the need for exact $K, V$ storage.
+Notably, our algorithm exhibits exceptional memory efficiency with super-long
+token sequences. As the token length $n$ increases, our error guarantee
+diminishes while the memory usage remains nearly constant. This unique
+attribute underscores the potential of our technique in efficiently handling
+LLMs in streaming applications.
+
+
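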
+
+
+
+
+ + ☆ Learning in Deep Factor Graphs with Gaussian Belief Propagation + + +
+ We propose an approach to do learning in Gaussian factor graphs. We treat all +relevant quantities (inputs, outputs, parameters, latents) as random variables +in a graphical model, and view both training and prediction as inference +problems with different observed nodes. Our experiments show that these +problems can be efficiently solved with belief propagation (BP), whose updates +are inherently local, presenting exciting opportunities for distributed and +asynchronous training. Our approach can be scaled to deep networks and provides +a natural means to do continual learning: use the BP-estimated parameter +marginals of the current task as parameter priors for the next. On a video +denoising task we demonstrate the benefit of learnable parameters over a +classical factor graph approach and we show encouraging performance of deep +factor graphs for continual image classification on MNIST. + +
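A tiny worked example of the "training as inference" view, specialized to a linear-Gaussian weight so everything is exact (a hand-rolled illustration, not the paper's general GBP implementation): each observation factor sends a Gaussian message in information form to the weight node, and messages combine by simple addition.

```python
import numpy as np

# Model: y = w * x + noise, noise ~ N(0, sigma2); prior w ~ N(0, 1).
# In information (canonical) form a Gaussian is (precision, information),
# and messages arriving at a variable node are combined by summing both.
sigma2 = 0.25
precision, information = 1.0, 0.0                 # prior N(0, 1)

rng = np.random.default_rng(0)
x = rng.standard_normal(20)
y = 2.0 * x + np.sqrt(sigma2) * rng.standard_normal(20)

for xi, yi in zip(x, y):
    precision += xi * xi / sigma2                 # precision of this factor's message
    information += xi * yi / sigma2               # information of this factor's message

posterior_mean = information / precision
posterior_var = 1.0 / precision
print(posterior_mean, posterior_var)              # mean is close to the true slope 2.0
```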
+
+
+
+
+ + ☆ More is Better in Modern Machine Learning: when Infinite + Overparameterization is Optimal and Overfitting is Obligatory + + +
+ In our era of enormous neural networks, empirical progress has been driven by +the philosophy that more is better. Recent deep learning practice has found +repeatedly that larger model size, more data, and more computation (resulting +in lower training loss) improves performance. In this paper, we give +theoretical backing to these empirical observations by showing that these three +properties hold in random feature (RF) regression, a class of models equivalent +to shallow networks with only the last layer trained. + Concretely, we first show that the test risk of RF regression decreases +monotonically with both the number of features and the number of samples, +provided the ridge penalty is tuned optimally. In particular, this implies that +infinite width RF architectures are preferable to those of any finite width. We +then proceed to demonstrate that, for a large class of tasks characterized by +powerlaw eigenstructure, training to near-zero training loss is obligatory: +near-optimal performance can only be achieved when the training error is much +smaller than the test error. Grounding our theory in real-world data, we find +empirically that standard computer vision tasks with convolutional neural +tangent kernels clearly fall into this class. Taken together, our results tell +a simple, testable story of the benefits of overparameterization, overfitting, +and more data in random feature models. + +
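Random-feature ridge regression is easy to state in code; a minimal NumPy version with arbitrary toy dimensions (only the last linear layer is fit, the random first layer stays fixed) is:

```python
import numpy as np

def rf_ridge_fit_predict(X_tr, y_tr, X_te, n_features=2048, ridge=1e-3, seed=0):
    """ReLU random-feature regression with a tunable ridge penalty."""
    rng = np.random.default_rng(seed)
    W = rng.standard_normal((X_tr.shape[1], n_features)) / np.sqrt(X_tr.shape[1])
    phi = lambda X: np.maximum(X @ W, 0.0)        # fixed random features
    F = phi(X_tr)
    # Closed-form ridge solution for the last layer only.
    a = np.linalg.solve(F.T @ F + ridge * np.eye(n_features), F.T @ y_tr)
    return phi(X_te) @ a

rng = np.random.default_rng(1)
X, X_test = rng.standard_normal((200, 10)), rng.standard_normal((50, 10))
target = lambda Z: np.sin(Z[:, 0]) + 0.5 * Z[:, 1]
preds = rf_ridge_fit_predict(X, target(X), X_test)
print(np.mean((preds - target(X_test)) ** 2))     # test risk for this (width, ridge) pair
```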
+
+
+
+
+ + ☆ A General Framework for User-Guided Bayesian Optimization + + +
+ The optimization of expensive-to-evaluate black-box functions is prevalent in +various scientific disciplines. Bayesian optimization is an automatic, general +and sample-efficient method to solve these problems with minimal knowledge of +the underlying function dynamics. However, the ability of Bayesian optimization +to incorporate prior knowledge or beliefs about the function at hand in order +to accelerate the optimization is limited, which reduces its appeal for +knowledgeable practitioners with tight budgets. To allow domain experts to +customize the optimization routine, we propose ColaBO, the first +Bayesian-principled framework for incorporating prior beliefs beyond the +typical kernel structure, such as the likely location of the optimizer or the +optimal value. The generality of ColaBO makes it applicable across different +Monte Carlo acquisition functions and types of user beliefs. We empirically +demonstrate ColaBO's ability to substantially accelerate optimization when the +prior information is accurate, and to retain approximately default performance +when it is misleading. + +
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ☆ Differentially Private SGD Without Clipping Bias: An Error-Feedback + Approach + + +
+ Differentially Private Stochastic Gradient Descent with gradient clipping +(DPSGD-GC) is a powerful tool for training deep learning models using sensitive +data, providing both a solid theoretical privacy guarantee and high efficiency. +However, using DPSGD-GC to ensure Differential Privacy (DP) comes at the cost +of model performance degradation due to DP noise injection and gradient +clipping. Existing research has extensively analyzed the theoretical +convergence of DPSGD-GC, and has shown that it only converges when using large +clipping thresholds that are dependent on problem-specific parameters. +Unfortunately, these parameters are often unknown in practice, making it hard +to choose the optimal clipping threshold. Therefore, in practice, DPSGD-GC +suffers from degraded performance due to the {\it constant} bias introduced by +the clipping. + In our work, we propose a new error-feedback (EF) DP algorithm as an +alternative to DPSGD-GC, which not only offers a diminishing utility bound +without inducing a constant clipping bias, but more importantly, it allows for +an arbitrary choice of clipping threshold that is independent of the problem. +We establish an algorithm-specific DP analysis for our proposed algorithm, +providing privacy guarantees based on R{\'e}nyi DP. Additionally, we +demonstrate that under mild conditions, our algorithm can achieve nearly the +same utility bound as DPSGD without gradient clipping. Our empirical results on +Cifar-10/100 and E2E datasets, show that the proposed algorithm achieves higher +accuracies than DPSGD while maintaining the same level of DP guarantee. + +
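The sketch below shows the general error-feedback pattern applied to a clipped, noised gradient step: the residual removed by clipping is stored and re-injected at the next step. It is a schematic with made-up hyperparameters and no privacy accounting, and it is not claimed to be the paper's exact algorithm or its Renyi-DP analysis.

```python
import torch

def ef_dp_step(params, per_sample_grads, error_buf, clip=1.0, noise_mult=1.0, lr=0.1):
    """One schematic error-feedback DP step on a flat parameter vector."""
    corrected = per_sample_grads + error_buf              # re-inject previous clipping residual
    norms = corrected.norm(dim=1, keepdim=True).clamp(min=1e-12)
    clipped = corrected * torch.clamp(clip / norms, max=1.0)
    new_error_buf = corrected - clipped                   # what clipping removed this round
    noise = noise_mult * clip * torch.randn_like(params)
    update = (clipped.sum(dim=0) + noise) / per_sample_grads.shape[0]
    return params - lr * update, new_error_buf

# Toy usage with random per-sample gradients.
batch, dim = 8, 5
params = torch.zeros(dim)
error_buf = torch.zeros(batch, dim)
per_sample_grads = torch.randn(batch, dim)
params, error_buf = ef_dp_step(params, per_sample_grads, error_buf)
```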
+
+
+
+
+ + ☆ Analysis of the expected $L_2$ error of an over-parametrized deep neural + network estimate learned by gradient descent without regularization + + +
+ Recent results show that estimates defined by over-parametrized deep neural +networks learned by applying gradient descent to a regularized empirical $L_2$ +risk are universally consistent and achieve good rates of convergence. In this +paper, we show that the regularization term is not necessary to obtain similar +results. In the case of a suitably chosen initialization of the network, a +suitable number of gradient descent steps, and a suitable step size we show +that an estimate without a regularization term is universally consistent for +bounded predictor variables. Additionally, we show that if the regression +function is H\"older smooth with H\"older exponent $1/2 \leq p \leq 1$, the +$L_2$ error converges to zero with a convergence rate of approximately +$n^{-1/(1+d)}$. Furthermore, in case of an interaction model, where the +regression function consists of a sum of H\"older smooth functions with $d^*$ +components, a rate of convergence is derived which does not depend on the input +dimension $d$. + +
+
+
+
+
+ + ☆ A Metalearned Neural Circuit for Nonparametric Bayesian Inference + + +
+ Most applications of machine learning to classification assume a closed set +of balanced classes. This is at odds with the real world, where class +occurrence statistics often follow a long-tailed power-law distribution and it +is unlikely that all classes are seen in a single sample. Nonparametric +Bayesian models naturally capture this phenomenon, but have significant +practical barriers to widespread adoption, namely implementation complexity and +computational inefficiency. To address this, we present a method for extracting +the inductive bias from a nonparametric Bayesian model and transferring it to +an artificial neural network. By simulating data with a nonparametric Bayesian +prior, we can metalearn a sequence model that performs inference over an +unlimited set of classes. After training, this "neural circuit" has distilled +the corresponding inductive bias and can successfully perform sequential +inference over an open set of classes. Our experimental results show that the +metalearned neural circuit achieves comparable or better performance than +particle filter-based methods for inference in these models while being faster +and simpler to use than methods that explicitly incorporate Bayesian +nonparametric inference. + +
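For instance, label sequences with an unbounded number of classes can be simulated from a Chinese restaurant process, the canonical nonparametric Bayesian prior over partitions; pairing such labels with class-conditional features yields training data for the sequence model. This is the standard CRP construction, not necessarily the exact generative model used in the paper.

```python
import numpy as np

def sample_crp_labels(n, alpha=1.0, seed=0):
    """Sample a label sequence from a Chinese restaurant process with concentration alpha."""
    rng = np.random.default_rng(seed)
    labels, counts = [], []
    for _ in range(n):
        probs = np.array(counts + [alpha], dtype=float)
        probs /= probs.sum()                 # existing class prop. to its size, new class prop. to alpha
        k = rng.choice(len(probs), p=probs)
        if k == len(counts):
            counts.append(1)                 # a brand-new class appears
        else:
            counts[k] += 1
        labels.append(int(k))
    return labels

print(sample_crp_labels(15))                 # a long-tailed, open-ended label sequence
```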
+
+ comment: 13 pages, 3 figures. Code available at + https://github.com/jakesnell/neural-circuits +
+
+
+
+
+ + ☆ Example-Based Explanations of Random Forest Predictions + + +
+ A random forest prediction can be computed as the scalar product of the
+labels of the training examples and a set of weights that are determined by the
+leaves of the forest into which the test object falls; each prediction can
+hence be explained exactly by the set of training examples for which the
+weights are non-zero. The number of examples used in such explanations is shown
+to vary with the dimensionality of the training set and the hyperparameters of
+the random forest algorithm. This means that the number of examples involved in
+each prediction can to some extent be controlled by varying these parameters.
+However, for settings that achieve the required predictive performance, the
+number of examples involved in each prediction may be unreasonably large,
+preventing the user from grasping the explanations. In order to provide more
+useful explanations, a modified prediction procedure is proposed, which
+includes only the top-weighted examples. An investigation on regression and
+classification tasks shows that the number of examples used in each explanation
+can be substantially reduced while maintaining, or even improving, predictive
+performance compared to the standard prediction procedure.
+
+
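This weight construction can be reproduced directly with scikit-learn. The small demonstration below uses bootstrap=False so that every leaf value is exactly the mean of its training labels; with bootstrapping the reconstruction would only be approximate.

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
X = rng.standard_normal((200, 5))
y = X[:, 0] ** 2 + 0.1 * rng.standard_normal(200)

rf = RandomForestRegressor(n_estimators=50, bootstrap=False, random_state=0).fit(X, y)

x_test = rng.standard_normal((1, 5))
train_leaves = rf.apply(X)                    # (n_train, n_trees) leaf indices
test_leaves = rf.apply(x_test)                # (1, n_trees)

# Weight of training example i: average over trees of 1/|leaf| when i shares the test leaf.
same_leaf = train_leaves == test_leaves       # broadcasts to (n_train, n_trees)
leaf_sizes = same_leaf.sum(axis=0, keepdims=True)
weights = (same_leaf / leaf_sizes).mean(axis=1)

print(weights @ y, rf.predict(x_test)[0])     # the two values coincide (up to float error)
print(np.count_nonzero(weights), "training examples explain this prediction")
```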
+
+ comment: Submitted to 22nd International Symposium on Intelligent Data + Analysis, IDA 2024 +
+
+
+
+
+ + ☆ Predicting Failure of P2P Lending Platforms through Machine Learning: + The Case in China + + +
+ This study employs machine learning models to predict the failure of +Peer-to-Peer (P2P) lending platforms, specifically in China. By employing the +filter method and wrapper method with forward selection and backward +elimination, we establish a rigorous and practical procedure that ensures the +robustness and importance of variables in predicting platform failures. The +research identifies a set of robust variables that consistently appear in the +feature subsets across different selection methods and models, suggesting their +reliability and relevance in predicting platform failures. The study highlights +that reducing the number of variables in the feature subset leads to an +increase in the false acceptance rate while the performance metrics remain +stable, with an AUC value of approximately 0.96 and an F1 score of around 0.88. +The findings of this research provide significant practical implications for +regulatory authorities and investors operating in the Chinese P2P lending +industry. + +
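The wrapper-style forward selection can be mimicked with scikit-learn's SequentialFeatureSelector (toy synthetic data and an arbitrary classifier below, not the paper's actual P2P features or models):

```python
import numpy as np
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X = rng.standard_normal((300, 12))                        # 12 candidate platform features
y = (X[:, 0] + 0.5 * X[:, 3] - X[:, 7] + 0.3 * rng.standard_normal(300) > 0).astype(int)

clf = LogisticRegression(max_iter=1000)
forward = SequentialFeatureSelector(clf, n_features_to_select=3, direction="forward", cv=5)
forward.fit(X, y)

selected = np.flatnonzero(forward.get_support())
print("selected feature indices:", selected)
print("CV accuracy on the subset:", cross_val_score(clf, X[:, selected], y, cv=5).mean())
```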
+
+
+
+
+ + ☆ FRUITS: Feature Extraction Using Iterated Sums for Time Series + Classification + + +
+ We introduce a pipeline for time series classification that extracts features +based on the iterated-sums signature (ISS) and then applies a linear +classifier. These features are intrinsically nonlinear, capture chronological +information, and, under certain settings, are invariant to time-warping. We are +competitive with state-of-the-art methods on the UCR archive, both in terms of +accuracy and speed. We make our code available at +\url{https://github.com/irkri/fruits}. + +
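As a rough illustration of what a single iterated-sums feature is (a toy scalar-series implementation, not the FRUITS package API): for a word $(\alpha_1, \dots, \alpha_k)$ the feature sums products of powered increments over strictly increasing time indices, which can be computed with nested cumulative sums.

```python
import numpy as np

def iterated_sum(x, word):
    """<ISS(x), word>: sum over i_1 < ... < i_k of prod_j dx[i_j] ** word[j]."""
    dx = np.diff(np.asarray(x, dtype=float))
    acc = np.ones_like(dx)                            # contribution of the empty word
    for j, alpha in enumerate(word):
        if j > 0:
            acc = np.concatenate(([0.0], acc[:-1]))   # previous index must be strictly smaller
        acc = np.cumsum(dx ** alpha * acc)
    return acc[-1]

# Brute-force check for the word (1, 2): sum over i < j of dx[i] * dx[j] ** 2.
x = np.array([0.0, 1.0, 0.5, 2.0, 1.5])
dx = np.diff(x)
brute = sum(dx[i] * dx[j] ** 2 for i in range(len(dx)) for j in range(i + 1, len(dx)))
print(iterated_sum(x, (1, 2)), brute)                 # the two values agree
```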
+
+
+
+
+ + ☆ Finding Foundation Models for Time Series Classification with a PreText + Task + + +
+ Over the past decade, Time Series Classification (TSC) has gained increasing
+attention. While various methods have been explored, deep learning,
+particularly through Convolutional Neural Networks (CNNs), stands out as an
+effective approach. However, due to the limited availability of training data,
+defining a foundation model for TSC that overcomes the overfitting problem is
+still a challenging task. The UCR archive, encompassing a wide spectrum of
+datasets ranging from motion recognition to ECG-based heart disease detection,
+serves as a prime example for exploring this issue in diverse TSC scenarios. In
+this paper, we address the overfitting challenge by introducing pre-trained
+domain foundation models. A key aspect of our methodology is a novel pretext
+task that spans multiple datasets. This task is designed to identify the
+originating dataset of each time series sample, with the goal of creating
+flexible convolution filters that can be applied across different datasets. The
+research process consists of two phases: a pre-training phase where the model
+acquires general features through the pretext task, and a subsequent
+fine-tuning phase for specific dataset classifications. Our extensive
+experiments on the UCR archive demonstrate that this pre-training strategy
+significantly outperforms the conventional training approach without
+pre-training. This strategy effectively reduces overfitting in small datasets
+and provides an efficient route for adapting these models to new datasets, thus
+advancing the capabilities of deep learning in TSC.
+
+
+
+
+
+
+ + ☆ Comparing Feature Engineering and End-to-End Deep Learning for Autism + Spectrum Disorder Assessment based on Fullbody-Tracking + + +
+ Autism Spectrum Disorder (ASD) is characterized by challenges in social +communication and restricted patterns, with motor abnormalities gaining +traction for early detection. However, kinematic analysis in ASD is limited, +often lacking robust validation and relying on hand-crafted features for single +tasks, leading to inconsistencies across studies. Thus, end-to-end models have +become promising methods to overcome the need for feature engineering. Our aim +is to assess both approaches across various kinematic tasks to measure the +efficacy of commonly used features in ASD assessment, while comparing them to +end-to-end models. Specifically, we developed a virtual reality environment +with multiple motor tasks and trained models using both classification +approaches. We prioritized a reliable validation framework with repeated +cross-validation. Our comparative analysis revealed that hand-crafted features +outperformed our deep learning approach in specific tasks, achieving a +state-of-the-art area under the curve (AUC) of 0.90$\pm$0.06. Conversely, +end-to-end models provided more consistent results with less variability across +all VR tasks, demonstrating domain generalization and reliability, with a +maximum task AUC of 0.89$\pm$0.06. These findings show that end-to-end models +enable less variable and context-independent ASD assessments without requiring +domain knowledge or task specificity. However, they also recognize the +effectiveness of hand-crafted features in specific task scenarios. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+
+ ☆ tinyCLAP: Distilling Contrastive Language-Audio Pretrained Models
+
+
+ Contrastive Language-Audio Pretraining (CLAP) has become of crucial
+importance in the field of audio and speech processing. Its employment ranges
+from sound event detection to text-to-audio generation. However, one of the
+main limitations is the considerable amount of data required in the training
+process and the overall computational complexity during inference. This paper
+investigates how we can reduce the complexity of contrastive language-audio
+pre-trained models, yielding an efficient model that we call tinyCLAP. We
+derive an unimodal distillation loss from first principles and explore how the
+dimensionality of the shared, multimodal latent space can be reduced via
+pruning. TinyCLAP uses only 6% of the original Microsoft CLAP parameters with a
+minimal reduction (less than 5%) in zero-shot classification performance across
+the three sound event detection datasets on which it was tested.
+
+
+
+
+
+
+ + ☆ StableSSM: Alleviating the Curse of Memory in State-space Models through + Stable Reparameterization + + +
+ In this paper, we investigate the long-term memory learning capabilities of +state-space models (SSMs) from the perspective of parameterization. We prove +that state-space models without any reparameterization exhibit a memory +limitation similar to that of traditional RNNs: the target relationships that +can be stably approximated by state-space models must have an exponentially +decaying memory. Our analysis identifies this "curse of memory" as a result of +the recurrent weights converging to a stability boundary, suggesting that a +reparameterization technique can be effective. To this end, we introduce a +class of reparameterization techniques for SSMs that effectively lift their +memory limitations. Besides improving approximation capabilities, we further +illustrate that a principled choice of reparameterization scheme can also +enhance optimization stability. We validate our findings using synthetic +datasets and language models. + +
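One simple instance of a stable reparameterization, shown only to make the idea concrete: learn an unconstrained parameter and map it into the open interval (0, 1), so every configuration of a diagonal linear SSM is stable. The paper studies a broader class of schemes; this specific mapping is an assumption.

```python
# Sketch of a stable reparameterization for a diagonal linear SSM: instead of
# learning the recurrent weights directly, learn an unconstrained parameter w and
# map it to lambda = exp(-softplus(w)) in (0, 1), so stability holds for any w.
# This is one simple choice, not necessarily the scheme analyzed in the paper.
import torch
import torch.nn as nn
import torch.nn.functional as F

class DiagonalSSM(nn.Module):
    def __init__(self, dim: int):
        super().__init__()
        self.w = nn.Parameter(torch.randn(dim))   # unconstrained
        self.b = nn.Parameter(torch.randn(dim))   # input weights

    def forward(self, u):                         # u: (batch, time, dim)
        lam = torch.exp(-F.softplus(self.w))      # recurrent weights in (0, 1)
        h = torch.zeros(u.shape[0], u.shape[2])
        outputs = []
        for t in range(u.shape[1]):
            h = lam * h + self.b * u[:, t]        # h_t = lambda * h_{t-1} + b * u_t
            outputs.append(h)
        return torch.stack(outputs, dim=1)

y = DiagonalSSM(dim=8)(torch.randn(4, 20, 8))     # (4, 20, 8)
```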
+
+
+
+
+ + ☆ Towards Interpretable Classification of Leukocytes based on Deep + Learning ICML 2023 + + +
+ Label-free approaches are attractive in cytological imaging due to their +flexibility and cost efficiency. They are supported by machine learning +methods, which, despite the lack of labeling and the associated lower contrast, +can classify cells with high accuracy where the human observer has little +chance to discriminate cells. In order to better integrate these workflows into +the clinical decision making process, this work investigates the calibration of +confidence estimation for the automated classification of leukocytes. In +addition, different visual explanation approaches are compared, which should +bring machine decision making closer to professional healthcare applications. +Furthermore, we were able to identify general detection patterns in neural +networks and demonstrate the utility of the presented approaches in different +scenarios of blood cell analysis. + +
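As a concrete (if generic) illustration of confidence calibration, the sketch below applies standard temperature scaling on held-out logits and reports the expected calibration error; this is a common recipe, not necessarily the exact procedure used in the paper.

```python
# Generic confidence-calibration sketch: fit a single temperature on validation
# logits and measure the expected calibration error (ECE). Shown only to
# illustrate the kind of calibration analysis described above.
import numpy as np
import torch

def fit_temperature(logits, labels, iters=200, lr=0.01):
    T = torch.ones(1, requires_grad=True)
    opt = torch.optim.LBFGS([T], lr=lr, max_iter=iters)
    def closure():
        opt.zero_grad()
        loss = torch.nn.functional.cross_entropy(logits / T, labels)
        loss.backward()
        return loss
    opt.step(closure)
    return T.detach().item()

def expected_calibration_error(probs, labels, n_bins=10):
    conf = probs.max(axis=1)
    pred = probs.argmax(axis=1)
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        mask = (conf > lo) & (conf <= hi)
        if mask.any():
            acc = (pred[mask] == labels[mask]).mean()
            ece += mask.mean() * abs(acc - conf[mask].mean())
    return ece

logits = torch.randn(500, 5)                      # toy leukocyte-class logits
labels = torch.randint(0, 5, (500,))
T = fit_temperature(logits, labels)
probs = torch.softmax(logits / T, dim=1).numpy()
print("temperature:", T, "ECE:", expected_calibration_error(probs, labels.numpy()))
```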
+
+ comment: Presented at the 3rd Workshop on Interpretable Machine Learning in + Healthcare (IMLH) @ ICML 2023 +
+
+
+
+
+ + ☆ Fault Detection in Telecom Networks using Bi-level Federated Graph + Neural Networks ICDM 2023 + + +
+ 5G and Beyond networks are becoming increasingly complex and heterogeneous, with +diverse and demanding requirements from a wide variety of emerging applications. +The complexity and diversity of Telecom networks place an increasing strain on +maintenance and operation efforts. Moreover, the strict security and privacy +requirements present a challenge for mobile operators seeking to leverage network data. +To detect network faults and mitigate future failures, prior work has focused on +leveraging traditional ML/DL methods to locate anomalies in networks. The +current approaches, although powerful, do not consider the intertwined nature +of embedded and software-intensive Radio Access Network systems. In this paper, +we propose a Bi-level Federated Graph Neural Network anomaly detection and +diagnosis model that is able to detect anomalies in Telecom networks in a +privacy-preserving manner, while minimizing communication costs. Our method +revolves around conceptualizing Telecom data as a bi-level temporal graph processed by Graph +Neural Networks. The first graph captures the interactions between different +RAN nodes that are exposed to different deployment scenarios in the network, +while each individual Radio Access Network node is further elaborated into its +software (SW) execution graph. Additionally, we use Federated Learning to +address privacy and security limitations. Furthermore, we study the performance +of the anomaly detection model under three settings: (1) Centralized, (2) Federated +Learning, and (3) Personalized Federated Learning, using real-world data from an +operational network. Our comprehensive experiments show that the Personalized +Federated Temporal Graph Neural Network method outperforms the most commonly +used techniques for anomaly detection. + +
+
+ comment: This paper has been accepted as part of the 2nd International + Workshop on Federated Learning with Graph Data, co-located with IEEE ICDM 2023
+
+
+
+
+ + ☆ Efficient Gradient Estimation via Adaptive Sampling and Importance + Sampling + + +
+ Machine learning problems rely heavily on stochastic gradient descent (SGD) +for optimization. The effectiveness of SGD is contingent upon accurately +estimating gradients from a mini-batch of data samples. Instead of the commonly +used uniform sampling, adaptive or importance sampling reduces noise in +gradient estimation by forming mini-batches that prioritize crucial data +points. Previous research has suggested that data points should be selected +with probabilities proportional to their gradient norm. Nevertheless, existing +algorithms have struggled to efficiently integrate importance sampling into +machine learning frameworks. In this work, we make two contributions. First, we +present an algorithm that can incorporate existing importance functions into +our framework. Second, we propose a simplified importance function that relies +solely on the loss gradient of the output layer. By leveraging our proposed +gradient estimation techniques, we observe improved convergence in +classification and regression tasks with minimal computational overhead. We +validate the effectiveness of our adaptive and importance-sampling approach on +image and point-cloud datasets. + +
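A rough sketch of the described importance function: for softmax plus cross-entropy, the loss gradient with respect to the output layer's pre-activations is p - y, so its norm is a cheap per-sample score; samples are drawn proportionally to it and reweighted to keep the gradient estimate unbiased. The candidate-pool setup is illustrative.

```python
# Sketch of importance sampling for SGD: score each candidate sample by the norm
# of the loss gradient w.r.t. the output layer (for softmax + cross-entropy this
# is ||p - y||), sample a mini-batch proportionally to these scores, and reweight
# by 1/(N * p_i) to keep the gradient estimate unbiased. Illustrative only.
import torch
import torch.nn.functional as F

def output_layer_grad_norm(logits, labels):
    p = torch.softmax(logits, dim=1)
    y = F.one_hot(labels, num_classes=logits.shape[1]).float()
    return (p - y).norm(dim=1)                      # per-sample importance score

def sample_importance_batch(model, x_pool, y_pool, batch_size):
    with torch.no_grad():
        scores = output_layer_grad_norm(model(x_pool), y_pool) + 1e-8
    probs = scores / scores.sum()
    idx = torch.multinomial(probs, batch_size, replacement=True)
    weights = 1.0 / (len(x_pool) * probs[idx])      # unbiasedness correction
    return x_pool[idx], y_pool[idx], weights

model = torch.nn.Linear(20, 3)
x_pool, y_pool = torch.randn(1024, 20), torch.randint(0, 3, (1024,))
xb, yb, wb = sample_importance_batch(model, x_pool, y_pool, batch_size=32)
loss = (wb * F.cross_entropy(model(xb), yb, reduction="none")).mean()
loss.backward()
```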
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ☆ Finite Volume Features, Global Geometry Representations, and Residual + Training for Deep Learning-based CFD Simulation + + +
+ Computational fluid dynamics (CFD) simulation is an irreplaceable modelling +step in many engineering designs, but it is often computationally expensive. +Some graph neural network (GNN)-based CFD methods have been proposed. However, +the current methods inherit the weaknesses of traditional numerical simulators +and ignore the cell characteristics of the mesh used in the finite +volume method, a common method in practical CFD applications. Specifically, the +input nodes in these GNN methods have very limited information about any object +immersed in the simulation domain and its surrounding environment. Also, the +cell characteristics of the mesh such as cell volume, face surface area, and +face centroid are not included in the message-passing operations in the GNN +methods. To address these weaknesses, this work proposes two novel geometric +representations: Shortest Vector (SV) and Directional Integrated Distance +(DID). Extracted from the mesh, the SV and DID provide a global geometry +perspective to each input node, thus removing the need to collect this +information through message-passing. This work also introduces the use of +Finite Volume Features (FVF) in the graph convolutions as node and edge +attributes, enabling the message-passing operations to adjust to different +nodes. Finally, this work is the first to demonstrate how residual training, +with the availability of low-resolution data, can be adopted to improve the +flow field prediction accuracy. Experimental results on two datasets with five +different state-of-the-art GNN methods for CFD indicate that SV, DID, FVF and +residual training can effectively reduce the predictive error of current +GNN-based methods by as much as 41%. + +
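A minimal sketch of the residual-training idea mentioned above: when a low-resolution solution is available, the network predicts only a correction to it rather than the full flow field. The MLP stand-in and tensor shapes are placeholders for the paper's GNN setting.

```python
# Minimal sketch of residual training: when a cheap low-resolution CFD solution
# is available, let the network predict only the correction (residual) to that
# coarse field instead of the full flow field. Names and shapes are illustrative,
# not the paper's GNN setup.
import torch
import torch.nn as nn

class ResidualPredictor(nn.Module):
    def __init__(self, n_features: int):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(n_features, 128), nn.ReLU(),
                                 nn.Linear(128, 1))

    def forward(self, node_features, coarse_field):
        # prediction = coarse solution + learned correction
        return coarse_field + self.net(node_features).squeeze(-1)

model = ResidualPredictor(n_features=16)
node_features = torch.randn(2048, 16)       # per-node inputs (geometry, FVF, ...)
coarse_field = torch.randn(2048)            # low-resolution solution on the fine mesh
target_field = coarse_field + 0.1 * torch.randn(2048)
loss = nn.functional.mse_loss(model(node_features, coarse_field), target_field)
loss.backward()
```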
+
+
+
+
+ + ☆ Universal Jailbreak Backdoors from Poisoned Human Feedback + + +
+ Reinforcement Learning from Human Feedback (RLHF) is used to align large +language models to produce helpful and harmless responses. Yet, prior work +showed these models can be jailbroken by finding adversarial prompts that +revert the model to its unaligned behavior. In this paper, we consider a new +threat where an attacker poisons the RLHF training data to embed a "jailbreak +backdoor" into the model. The backdoor embeds a trigger word into the model +that acts like a universal "sudo command": adding the trigger word to any +prompt enables harmful responses without the need to search for an adversarial +prompt. Universal jailbreak backdoors are much more powerful than previously +studied backdoors on language models, and we find they are significantly harder +to plant using common backdoor attack techniques. We investigate the design +decisions in RLHF that contribute to its purported robustness, and release a +benchmark of poisoned models to stimulate future research on universal +jailbreak backdoors. + +
+
+
+
+
+ + ☆ Segment (Almost) Nothing: Prompt-Agnostic Adversarial Attacks on + Segmentation Models + + +
+ General purpose segmentation models are able to generate (semantic) +segmentation masks from a variety of prompts, including visual (points, boxes, +etc.) and textual (object names) ones. In particular, input images are +pre-processed by an image encoder to obtain embedding vectors which are later +used for mask predictions. Existing adversarial attacks target the end-to-end +tasks, i.e., they aim at altering the segmentation mask predicted for a specific +image-prompt pair. However, this requires running an individual attack for each +new prompt for the same image. We propose instead to generate prompt-agnostic +adversarial attacks by maximizing the $\ell_2$-distance, in the latent space, +between the embedding of the original and perturbed images. Since the encoding +process only depends on the image, distorted image representations will cause +perturbations in the segmentation masks for a variety of prompts. We show that +even imperceptible $\ell_\infty$-bounded perturbations of radius +$\epsilon=1/255$ are often sufficient to drastically modify the masks predicted +with point, box and text prompts by recently proposed foundation models for +segmentation. Moreover, we explore the possibility of creating universal, i.e., +non-image-specific, attacks which can be readily applied to any input without +further computational cost. + +
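A generic sketch of such a prompt-agnostic attack: projected gradient ascent on the l2 distance between the image encoder's embeddings of the clean and perturbed image, under an l_inf budget of 1/255. The toy encoder is a placeholder for a segmentation foundation model's image branch.

```python
# Sketch of a prompt-agnostic attack: maximize the l2 distance between the image
# encoder's embeddings of the clean and perturbed image under an l_inf budget of
# 1/255, so the attack never touches the prompt. `image_encoder` is a placeholder.
import torch

def latent_space_attack(image_encoder, image, eps=1/255, steps=20, step_size=0.25/255):
    with torch.no_grad():
        clean_emb = image_encoder(image)
    delta = torch.zeros_like(image, requires_grad=True)
    for _ in range(steps):
        emb = image_encoder(image + delta)
        loss = (emb - clean_emb).flatten(1).norm(dim=1).mean()  # push embedding away
        loss.backward()
        with torch.no_grad():
            delta += step_size * delta.grad.sign()              # l_inf PGD step
            delta.clamp_(-eps, eps)
            delta.add_(image).clamp_(0.0, 1.0).sub_(image)      # keep image in [0, 1]
        delta.grad = None
    return (image + delta).detach()

# Toy stand-in encoder; in practice this is the frozen image encoder of the model.
image_encoder = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.Linear(3 * 64 * 64, 256))
adv = latent_space_attack(image_encoder, torch.rand(1, 3, 64, 64))
```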
+
+
+
+
+ + ☆ Disentangling the Spectral Properties of the Hodge Laplacian: Not All + Small Eigenvalues Are Equal + + +
+ The rich spectral information of the graph Laplacian has been instrumental in +graph theory, machine learning, and graph signal processing for applications +such as graph classification, clustering, or eigenmode analysis. Recently, the +Hodge Laplacian has come into focus as a generalisation of the ordinary +Laplacian for higher-order graph models such as simplicial and cellular +complexes. Akin to the traditional analysis of graph Laplacians, many authors +analyse the smallest eigenvalues of the Hodge Laplacian, which are connected to +important topological properties such as homology. However, small eigenvalues +of the Hodge Laplacian can carry different information depending on whether +they are related to curl or gradient eigenmodes, and thus may not be +comparable. We therefore introduce the notion of persistent eigenvector +similarity and provide a method to track individual harmonic, curl, and +gradient eigenvectors/-values through the so-called persistence filtration, +leveraging the full information contained in the Hodge-Laplacian spectrum +across all possible scales of a point cloud. Finally, we use our insights (a) +to introduce a novel form of topological spectral clustering and (b) to +classify edges and higher-order simplices based on their relationship to the +smallest harmonic, curl, and gradient eigenvectors. + +
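To make the gradient/curl/harmonic distinction concrete, the toy sketch below builds the 1-Hodge Laplacian L1 = B1^T B1 + B2 B2^T of a small complex (one filled triangle plus one hollow square) and labels each eigenvector by projecting it onto im(B1^T), im(B2), and the kernel; the persistence-based tracking proposed in the paper is not shown.

```python
# Toy sketch: build the 1-Hodge Laplacian L1 = B1^T B1 + B2 B2^T of a small
# simplicial complex and label eigenvectors as gradient, curl, or harmonic by
# projecting them onto im(B1^T), im(B2), and the kernel. Illustrative only.
import numpy as np

# Complex: a filled triangle (0,1,2) plus a hollow square (2,3,4,5).
# Edge order: (0,1),(1,2),(0,2),(2,3),(3,4),(4,5),(2,5)
B1 = np.array([  # node-to-edge incidence (nodes x edges)
    [-1,  0, -1,  0,  0,  0,  0],
    [ 1, -1,  0,  0,  0,  0,  0],
    [ 0,  1,  1, -1,  0,  0, -1],
    [ 0,  0,  0,  1, -1,  0,  0],
    [ 0,  0,  0,  0,  1, -1,  0],
    [ 0,  0,  0,  0,  0,  1,  1],
])
B2 = np.zeros((7, 1))           # only the triangle (0,1,2) is a 2-simplex
B2[[0, 1, 2], 0] = [1, 1, -1]   # boundary of the filled triangle

L1 = B1.T @ B1 + B2 @ B2.T
eigvals, eigvecs = np.linalg.eigh(L1)

for lam, v in zip(eigvals, eigvecs.T):
    grad_part = np.linalg.norm(B1 @ v)       # nonzero -> gradient component
    curl_part = np.linalg.norm(B2.T @ v)     # nonzero -> curl component
    kind = "harmonic" if grad_part < 1e-8 and curl_part < 1e-8 else (
        "gradient" if grad_part >= curl_part else "curl")
    print(f"eigenvalue {lam:6.3f}  ->  {kind}")
```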
+
+ comment: 5 pages, 4 figures, comments welcome +
+
+
+
+
+ + ☆ Approximation of Convex Envelope Using Reinforcement Learning + + +
+ Oberman gave a stochastic control formulation of the problem of estimating +the convex envelope of a non-convex function. Based on this, we develop a +reinforcement learning scheme to approximate the convex envelope, using a +variant of Q-learning for controlled optimal stopping. It shows very promising +results on a standard library of test problems. + +
+
+
+
+
+ + ☆ A Comparison of PDF Projection with Normalizing Flows and SurVAE + + +
+ Normalizing flows (NF) recently gained attention as a way to construct +generative networks with exact likelihood calculation out of composable layers. +However, NF is restricted to dimension-preserving transformations. Surjection +VAE (SurVAE) has been proposed to extend NF to dimension-altering +transformations. Such networks are desirable because they are expressive and +can be precisely trained. We show that the approaches are a re-invention of PDF +projection, which appeared over twenty years earlier and is much further +developed. + +
+
+
+
+
+ + ☆ Unveiling The Factors of Aesthetic Preferences with Explainable AI + + +
+ The allure of aesthetic appeal in images captivates our senses, yet the +underlying intricacies of aesthetic preferences remain elusive. In this study, +we pioneer a novel perspective by utilizing machine learning models that focus +on aesthetic attributes known to influence preferences. Through a data mining +approach, our models process these attributes as inputs to predict the +aesthetic scores of images. Moreover, to delve deeper and obtain interpretable +explanations regarding the factors driving aesthetic preferences, we utilize +the popular Explainable AI (XAI) technique known as SHapley Additive +exPlanations (SHAP). Our methodology involves employing various machine +learning models, including Random Forest, XGBoost, Support Vector Regression, +and Multilayer Perceptron, to compare their performances in accurately +predicting aesthetic scores, and consistently observing results in conjunction +with SHAP. We conduct experiments on three image aesthetic benchmarks, +providing insights into the roles of attributes and their interactions. +Ultimately, our study aims to shed light on the complex nature of aesthetic +preferences in images through machine learning and provides a deeper +understanding of the attributes that influence aesthetic judgements. + +
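A small sketch of the described pipeline under assumed attribute names: fit a tree-based regressor on hand-specified aesthetic attributes and explain its score predictions with SHAP's TreeExplainer. The data and attribute names are synthetic placeholders.

```python
# Sketch of the described pipeline: train a regressor on aesthetic attributes to
# predict an aesthetic score, then use SHAP to attribute predictions to the
# individual attributes. Attribute names and data are hypothetical.
import numpy as np
import shap
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(0)
attributes = ["colorfulness", "contrast", "symmetry", "rule_of_thirds", "sharpness"]
X = rng.random((500, len(attributes)))
y = 0.5 * X[:, 0] + 0.3 * X[:, 1] + 0.1 * rng.standard_normal(500)  # toy scores

model = RandomForestRegressor(n_estimators=200, random_state=0).fit(X, y)

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X[:100])        # per-sample attribute contributions
mean_impact = np.abs(shap_values).mean(axis=0)      # global importance per attribute
for name, impact in sorted(zip(attributes, mean_impact), key=lambda t: -t[1]):
    print(f"{name:15s} {impact:.3f}")
```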
+
+
+
+
+ + ☆ LLamol: A Dynamic Multi-Conditional Generative Transformer for De Novo + Molecular Design + + +
+ Generative models have demonstrated substantial promise in Natural Language +Processing (NLP) and have found application in designing molecules, as seen in +General Pretrained Transformer (GPT) models. In our efforts to develop such a +tool for exploring the organic chemical space in search of potentially +electro-active compounds, we present "LLamol", a single novel generative +transformer model based on the LLama 2 architecture, which was trained on a 13M +superset of organic compounds drawn from diverse public sources. To allow for a +maximum flexibility in usage and robustness in view of potentially incomplete +data, we introduce "Stochastic Context Learning" as a new training procedure. +We demonstrate that the resulting model adeptly handles single- and +multi-conditional organic molecule generation with up to four conditions, yet +more are possible. The model generates valid molecular structures in SMILES +notation while flexibly incorporating three numerical and/or one token sequence +into the generative process, just as requested. The generated compounds are +very satisfactory in all scenarios tested. In detail, we showcase the model's +capability to utilize token sequences for conditioning, either individually or +in combination with numerical properties, making LLamol a potent tool for de +novo molecule design, easily expandable with new properties. + +
+
+
+
+
+ + ☆ BHGNN-RT: Network embedding for directed heterogeneous graphs + + +
+ Networks are one of the most valuable data structures for modeling problems +in the real world. However, the most recent node embedding strategies have +focused on undirected graphs, with limited attention to directed graphs, +especially directed heterogeneous graphs. In this study, we first investigated +the network properties of directed heterogeneous graphs. Based on network +analysis, we proposed an embedding method, a bidirectional heterogeneous graph +neural network with random teleport (BHGNN-RT), for directed heterogeneous +graphs, which leverages a bidirectional message-passing process and network +heterogeneity. By optimizing the teleport proportion, BHGNN-RT helps +overcome the over-smoothing problem. Extensive experiments on +various datasets were conducted to verify the efficacy and efficiency of +BHGNN-RT. Furthermore, we investigated the effects of message components, model +layers, and teleport proportion on model performance. The performance comparison +with all other baselines illustrates that BHGNN-RT achieves state-of-the-art +performance, outperforming the benchmark methods in both node classification +and unsupervised clustering tasks. + +
+
+
+
+
+ + ☆ TEA: Test-time Energy Adaptation + + +
+ Test-time adaptation (TTA) aims to improve model generalizability when test +data diverges from training distribution, offering the distinct advantage of +not requiring access to training data and processes, especially valuable in the +context of large pre-trained models. However, current TTA methods fail to +address the fundamental issue: covariate shift, i.e., the decreased +generalizability can be attributed to the model's reliance on the marginal +distribution of the training data, which may impair model calibration and +introduce confirmation bias. To address this, we propose a novel energy-based +perspective, enhancing the model's perception of target data distributions +without requiring access to training data or processes. Building on this +perspective, we introduce $\textbf{T}$est-time $\textbf{E}$nergy +$\textbf{A}$daptation ($\textbf{TEA}$), which transforms the trained classifier +into an energy-based model and aligns the model's distribution with the test +data's, enhancing its ability to perceive test distributions and thus improving +overall generalizability. Extensive experiments across multiple tasks, +benchmarks and architectures demonstrate TEA's superior generalization +performance against state-of-the-art methods. Further in-depth analyses reveal +that TEA can equip the model with a comprehensive perception of test +distribution, ultimately paving the way toward improved generalization and +calibration. + +
+
+ comment: 16 pages, 10 figures, 7 tables +
+
+
+
+
+ + ☆ Multi-scale Semantic Correlation Mining for Visible-Infrared Person + Re-Identification + + +
+ The main challenge in the Visible-Infrared Person Re-Identification (VI-ReID) +task lies in how to extract discriminative features from different modalities +for matching purposes. While existing works primarily focus on +minimizing modal discrepancies, the modality information cannot be thoroughly +leveraged. To solve this problem, a Multi-scale Semantic Correlation Mining +network (MSCMNet) is proposed to comprehensively exploit semantic features at +multiple scales and simultaneously keep modality information loss as small as +possible during feature extraction. The proposed network contains three novel +components. Firstly, to effectively utilize the +modality information, the Multi-scale Information Correlation Mining Block +(MIMB) is designed to explore semantic correlations across multiple scales. +Secondly, in order to enrich the semantic information that MIMB can utilize, a +quadruple-stream feature extractor (QFE) with non-shared parameters is +specifically designed to extract information from different dimensions of the +dataset. Finally, the Quadruple Center Triplet Loss (QCT) is further proposed +to address the information discrepancy in the comprehensive features. Extensive +experiments on the SYSU-MM01, RegDB, and LLCM datasets demonstrate that the +proposed MSCMNet achieves the highest accuracy. + +
+
+
+
+
+ + ☆ Directly Attention Loss Adjusted Prioritized Experience Replay + + +
+ Prioritized Experience Replay (PER) enables the model to learn more from +relatively important samples by artificially changing their access +frequencies. However, this non-uniform sampling method shifts the state-action +distribution that is originally used to estimate Q-value functions, which +introduces estimation bias. In this article, a novel off-policy +reinforcement learning training framework called Directly Attention Loss +Adjusted Prioritized Experience Replay (DALAP) is proposed, which can directly +quantify the extent of the distribution shift through a Parallel +Self-Attention network, so as to accurately compensate for the error. In addition, +a Priority-Encouragement mechanism is designed to simultaneously optimize the +sample screening criterion and further improve training efficiency. In +order to verify the effectiveness and generality of DALAP, we integrate it with +value-function-based, policy-gradient-based, and multi-agent +reinforcement learning algorithms, respectively. Multiple groups of +comparative experiments show that DALAP has the significant advantages of both +improving the convergence rate and reducing training variance. + +
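For context, the sketch below shows vanilla proportional Prioritized Experience Replay with the usual importance-sampling correction; this is the baseline mechanism whose distribution shift DALAP sets out to quantify more accurately, not the DALAP correction itself.

```python
# Baseline sketch of proportional Prioritized Experience Replay (the mechanism
# DALAP builds on): sample transitions with probability proportional to their
# TD-error priority and correct the induced bias with importance-sampling weights.
import numpy as np

class PrioritizedReplay:
    def __init__(self, capacity, alpha=0.6):
        self.capacity, self.alpha = capacity, alpha
        self.data, self.priorities = [], []

    def add(self, transition, td_error):
        if len(self.data) >= self.capacity:
            self.data.pop(0); self.priorities.pop(0)
        self.data.append(transition)
        self.priorities.append((abs(td_error) + 1e-6) ** self.alpha)

    def sample(self, batch_size, beta=0.4):
        p = np.asarray(self.priorities); p = p / p.sum()
        idx = np.random.choice(len(self.data), batch_size, p=p)
        weights = (len(self.data) * p[idx]) ** (-beta)      # importance-sampling correction
        weights /= weights.max()
        return [self.data[i] for i in idx], idx, weights

buffer = PrioritizedReplay(capacity=1000)
for t in range(200):
    buffer.add({"s": t, "a": 0, "r": 1.0, "s2": t + 1}, td_error=np.random.randn())
batch, idx, w = buffer.sample(32)
```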
+
+
+
+
+ + ☆ A Parameterized Generative Adversarial Network Using Cyclic Projection + for Explainable Medical Image Classification + + +
+ Although current data augmentation methods are successful in alleviating +data insufficiency, conventional augmentation is primarily intra-domain, while +images generated by advanced generative adversarial networks (GANs) remain +uncertain, particularly for small-scale datasets. In this paper, we propose a +parameterized GAN (ParaGAN) that effectively controls the changes of synthetic +samples among domains and highlights the attention regions for downstream +classification. Specifically, ParaGAN incorporates projection distance +parameters in cyclic projection and projects the source images to the decision +boundary to obtain the class-difference maps. Our experiments show that ParaGAN +consistently outperforms existing augmentation methods with explainable +classification on two small-scale medical datasets. + +
+
+ comment: 5 pages, 4 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ Achieving Margin Maximization Exponentially Fast via Progressive Norm + Rescaling + + +
+ In this work, we investigate the margin-maximization bias exhibited by +gradient-based algorithms in classifying linearly separable data. We present an +in-depth analysis of the specific properties of the velocity field associated +with (normalized) gradients, focusing on their role in margin maximization. +Inspired by this analysis, we propose a novel algorithm called Progressive +Rescaling Gradient Descent (PRGD) and show that PRGD can maximize the margin at +an {\em exponential rate}. This stands in stark contrast to all existing +algorithms, which maximize the margin at a slow {\em polynomial rate}. +Specifically, we identify mild conditions on data distribution under which +existing algorithms such as gradient descent (GD) and normalized gradient +descent (NGD) {\em provably fail} in maximizing the margin efficiently. To +validate our theoretical findings, we present both synthetic and real-world +experiments. Notably, PRGD also shows promise in enhancing the generalization +performance when applied to linearly non-separable datasets and deep neural +networks. + +
+
+ comment: 39 pages +
+
+
+
+
+ + ☆ Federated Transformed Learning for a Circular, Secure, and Tiny AI + + +
+ Deep Learning (DL) is penetrating into a diverse range of mass mobility, +smart living, and industrial applications, rapidly transforming the way we live +and work. DL is at the heart of many AI implementations. A key set of +challenges is to produce AI modules that are: (1) "circular" - can solve new +tasks without forgetting how to solve previous ones, (2) "secure" - have +immunity to adversarial data attacks, and (3) "tiny" - implementable in low +power low cost embedded hardware. Clearly it is difficult to achieve all three +aspects on a single horizontal layer of platforms, as the techniques require +transformed deep representations that incur different computation and +communication requirements. Here we set out the vision to achieve transformed +DL representations across a 5G and Beyond networked architecture. We first +detail the cross-sectoral motivations for each challenge area, before +demonstrating recent advances in DL research that can achieve circular, secure, +and tiny AI (CST-AI). Recognising the conflicting demand of each transformed +deep representation, we federate their deep learning transformations and +functionalities across the network to achieve connected run-time capabilities. + +
+
+
+
+
+ + ☆ Deciphering and integrating invariants for neural operator learning with + various physical mechanisms + + +
+ Neural operators have been explored as surrogate models for simulating +physical systems to overcome the limitations of traditional partial +differential equation (PDE) solvers. However, most existing operator learning +methods assume that the data originate from a single physical mechanism, +limiting their applicability and performance in more realistic scenarios. To +this end, we propose Physical Invariant Attention Neural Operator (PIANO) to +decipher and integrate the physical invariants (PI) for operator learning from +the PDE series with various physical mechanisms. PIANO employs self-supervised +learning to extract physical knowledge and attention mechanisms to integrate +them into dynamic convolutional layers. Compared to existing techniques, PIANO +can reduce the relative error by 13.6\%-82.2\% on PDE forecasting tasks across +varying coefficients, forces, or boundary conditions. Additionally, varied +downstream tasks reveal that the PI embeddings deciphered by PIANO align well +with the underlying invariants in the PDE systems, verifying the physical +significance of PIANO. The source code will be publicly available at: +https://github.com/optray/PIANO. + +
+
+
+
+
+ + ☆ Thompson sampling for zero-inflated count outcomes with an application + to the Drink Less mobile health study + + +
+ Mobile health (mHealth) technologies aim to improve distal outcomes, such as +clinical conditions, by optimizing proximal outcomes through just-in-time +adaptive interventions. Contextual bandits provide a suitable framework for +customizing such interventions according to individual time-varying contexts, +intending to maximize cumulative proximal outcomes. However, unique challenges +such as modeling count outcomes within bandit frameworks have hindered the +widespread application of contextual bandits to mHealth studies. The current +work addresses this challenge by leveraging count data models into online +decision-making approaches. Specifically, we combine four common offline count +data models (Poisson, negative binomial, zero-inflated Poisson, and +zero-inflated negative binomial regressions) with Thompson sampling, a popular +contextual bandit algorithm. The proposed algorithms are motivated by and +evaluated on a real dataset from the Drink Less trial, where they are shown to +improve user engagement with the mHealth system. The proposed methods are +further evaluated on simulated data, achieving improvement in maximizing +cumulative proximal outcomes over existing algorithms. Theoretical results on +regret bounds are also derived. A user-friendly R package countts that +implements the proposed methods for assessing contextual bandit algorithms is +made publicly available at https://cran.r-project.org/web/packages/countts. + +
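The simplest instance of combining a count model with Thompson sampling, shown for intuition only: Poisson rewards with conjugate Gamma posteriors per action. The paper's contextual, zero-inflated, and negative-binomial variants (and the released countts package) go beyond this.

```python
# Minimal Thompson-sampling sketch for count outcomes: Poisson rewards per action
# with conjugate Gamma(shape, rate) posteriors. The contextual and zero-inflated
# models studied in the paper extend this basic recipe.
import numpy as np

rng = np.random.default_rng(1)
true_rates = [0.5, 1.5, 3.0]                 # unknown mean counts per action
shape = np.ones(3)                           # Gamma prior shape per action
rate = np.ones(3)                            # Gamma prior rate per action

for t in range(2000):
    sampled_rates = rng.gamma(shape, 1.0 / rate)   # one posterior draw per action
    a = int(np.argmax(sampled_rates))              # act greedily on the draw
    reward = rng.poisson(true_rates[a])            # observe a count outcome
    shape[a] += reward                             # Gamma-Poisson conjugate update
    rate[a] += 1.0

print("posterior means:", shape / rate)            # should approach true_rates
```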
+
+
+
+
+ + ☆ Comparative Analysis of Transformers for Modeling Tabular Data: A + Casestudy using Industry Scale Dataset KDD + + +
+ We perform a comparative analysis of transformer-based models designed for +modeling tabular data, specifically on an industry-scale dataset. While earlier +studies demonstrated promising outcomes on smaller public or synthetic +datasets, the effectiveness did not extend to larger industry-scale datasets. +The challenges identified include handling high-dimensional data, the necessity +for efficient pre-processing of categorical and numerical features, and +addressing substantial computational requirements. + To overcome the identified challenges, the study conducts an extensive +examination of various transformer-based models using both synthetic datasets +and the default prediction Kaggle dataset (2022) from American Express. The +paper presents crucial insights into optimal data pre-processing, compares +pre-training and direct supervised learning methods, discusses strategies for +managing categorical and numerical features, and highlights trade-offs between +computational resources and performance. Focusing on temporal financial data +modeling, the research aims to facilitate the systematic development and +deployment of transformer-based models in real-world scenarios, emphasizing +scalability. + +
+
+ comment: Accepted at the 7th Joint International Conference on Data Science & + Management of Data (11th ACM IKDD CODS and 29th COMAD)
+
+
+
+
+ + ☆ Cycle Invariant Positional Encoding for Graph Representation Learning + + +
+ Cycles are fundamental elements in graph-structured data and have +demonstrated their effectiveness in enhancing graph learning models. To encode +such information into a graph learning framework, prior works often extract a +summary quantity, ranging from the number of cycles to the more sophisticated +persistence diagram summaries. However, more detailed information, such as +which edges are encoded in a cycle, has not yet been used in graph neural +networks. In this paper, we make one step towards addressing this gap, and +propose a structure encoding module, called CycleNet, that encodes cycle +information via edge structure encoding in a permutation invariant manner. To +efficiently encode the space of all cycles, we start with a cycle basis (i.e., +a minimal set of cycles generating the cycle space) which we compute via the +kernel of the 1-dimensional Hodge Laplacian of the input graph. To guarantee +the encoding is invariant w.r.t. the choice of cycle basis, we encode the cycle +information via the orthogonal projector of the cycle basis, which is inspired +by BasisNet proposed by Lim et al. We also develop a more efficient variant +which however requires that the input graph has a unique shortest cycle basis. +To demonstrate the effectiveness of the proposed module, we provide some +theoretical understandings of its expressive power. Moreover, we show via a +range of experiments that networks enhanced by our CycleNet module perform +better in various benchmarks compared to several existing SOTA models. + +
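A small numerical sketch of the basis-invariance argument: for a graph with no 2-cells the 1-Hodge Laplacian reduces to B1^T B1, its kernel is the cycle space, and the orthogonal projector onto that kernel does not depend on which cycle basis is chosen. The graph below is a toy example.

```python
# Sketch of a basis-invariant cycle encoding: the kernel of the graph's 1-Hodge
# Laplacian is the cycle space, and the orthogonal projector onto that kernel is
# independent of the chosen cycle basis. CycleNet feeds such quantities to a GNN.
import numpy as np
from scipy.linalg import null_space

# 4-cycle 0-1-2-3-0 plus a chord 0-2: edges (0,1),(1,2),(2,3),(0,3),(0,2)
B1 = np.array([
    [-1,  0,  0, -1, -1],
    [ 1, -1,  0,  0,  0],
    [ 0,  1, -1,  0,  1],
    [ 0,  0,  1,  1,  0],
])
L1 = B1.T @ B1                      # 1-Hodge Laplacian of a graph (no 2-cells)
H = null_space(L1)                  # orthonormal basis of the cycle space
P = H @ H.T                         # orthogonal projector; invariant to basis choice

print("number of independent cycles:", H.shape[1])     # 2 for this graph
print("projector symmetric & idempotent:",
      np.allclose(P, P.T), np.allclose(P @ P, P))
```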
+
+ comment: Accepted as oral presentation in the Learning on Graphs Conference + (LoG 2023) +
+
+
+
+
+ + ☆ GATGPT: A Pre-trained Large Language Model with Graph Attention Network + for Spatiotemporal Imputation + + +
+ The analysis of spatiotemporal data is increasingly utilized across diverse +domains, including transportation, healthcare, and meteorology. In real-world +settings, such data often contain missing elements due to issues like sensor +malfunctions and data transmission errors. The objective of spatiotemporal +imputation is to estimate these missing values by understanding the inherent +spatial and temporal relationships in the observed multivariate time series. +Traditionally, spatiotemporal imputation has relied on specific, intricate +architectures designed for this purpose, which suffer from limited +applicability and high computational complexity. In contrast, our approach +integrates pre-trained large language models (LLMs) into spatiotemporal +imputation, introducing a groundbreaking framework, GATGPT. This framework +merges a graph attention mechanism with LLMs. We maintain most of the LLM +parameters unchanged to leverage existing knowledge for learning temporal +patterns, while fine-tuning the upper layers tailored to various applications. +The graph attention component enhances the LLM's ability to understand spatial +relationships. Through tests on three distinct real-world datasets, our +innovative approach demonstrates comparable results to established deep +learning benchmarks. + +
+
+
+
+
+ + ☆ Large Language Models as Topological Structure Enhancers for + Text-Attributed Graphs + + +
+ The latest advancements in large language models (LLMs) have revolutionized +the field of natural language processing (NLP). Inspired by the success of LLMs +in NLP tasks, some recent work has begun investigating the potential of +applying LLMs in graph learning tasks. However, most of the existing work +focuses on utilizing LLMs as powerful node feature augmenters, leaving +employing LLMs to enhance graph topological structures an understudied problem. +In this work, we explore how to leverage the information retrieval and text +generation capabilities of LLMs to refine/enhance the topological structure of +text-attributed graphs (TAGs) under the node classification setting. First, we +propose using LLMs to help remove unreliable edges and add reliable ones in the +TAG. Specifically, we first let the LLM output the semantic similarity between +node attributes through delicate prompt designs, and then perform edge deletion +and edge addition based on the similarity. Second, we propose using +pseudo-labels generated by the LLM to improve graph topology, that is, we +introduce the pseudo-label propagation as a regularization to guide the graph +neural network (GNN) in learning proper edge weights. Finally, we incorporate +the two aforementioned LLM-based methods for graph topological refinement into +the process of GNN training, and perform extensive experiments on four +real-world datasets. The experimental results demonstrate the effectiveness of +LLM-based graph topology refinement (achieving a 0.15%--2.47% performance gain +on public benchmarks). + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ New Epochs in AI Supervision: Design and Implementation of an Autonomous + Radiology AI Monitoring System + + +
+ With the increasingly widespread adoption of AI in healthcare, maintaining +the accuracy and reliability of AI models in clinical practice has become +crucial. In this context, we introduce novel methods for monitoring the +performance of radiology AI classification models in practice, addressing the +challenges of obtaining real-time ground truth for performance monitoring. We +propose two metrics - predictive divergence and temporal stability - to be used +for preemptive alerts of AI performance changes. Predictive divergence, +measured using Kullback-Leibler and Jensen-Shannon divergences, evaluates model +accuracy by comparing predictions with those of two supplementary models. +Temporal stability is assessed through a comparison of current predictions +against historical moving averages, identifying potential model decay or data +drift. This approach was retrospectively validated using chest X-ray data from +a single-center imaging clinic, demonstrating its effectiveness in maintaining +AI model reliability. By providing continuous, real-time insights into model +performance, our system ensures the safe and effective use of AI in clinical +decision-making, paving the way for more robust AI integration in healthcare + +
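Illustrative versions of the two proposed signals, with placeholder numbers and thresholds: predictive divergence between the deployed model's prediction distribution and a supplementary model's, and temporal stability of today's predictions against a historical moving average.

```python
# Illustrative monitoring signals: predictive divergence (KL / Jensen-Shannon
# between prediction distributions of the deployed and a supplementary model)
# and temporal stability (today's mean prediction vs. a moving average).
# All numbers and thresholds are placeholders.
import numpy as np

def kl(p, q, eps=1e-12):
    p, q = np.clip(p, eps, 1), np.clip(q, eps, 1)
    return float(np.sum(p * np.log(p / q)))

def js(p, q):
    m = 0.5 * (np.asarray(p) + np.asarray(q))
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

# Average predicted class distributions over today's studies (toy numbers).
primary_model = np.array([0.70, 0.20, 0.10])
supplementary = np.array([0.55, 0.30, 0.15])
if js(primary_model, supplementary) > 0.05:          # placeholder alert threshold
    print("predictive-divergence alert")

# Temporal stability: compare today's positive-finding rate to a moving average.
history = [0.18, 0.20, 0.19, 0.21, 0.20]             # daily rates, most recent last
moving_avg = float(np.mean(history[-5:]))
today = 0.31
if abs(today - moving_avg) > 3 * np.std(history[-5:]):
    print("temporal-stability alert")
```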
+
+ comment: 10 pages, 4 figures, 2 tables +
+
+
+
+
+ + ☆ AdaMedGraph: Adaboosting Graph Neural Networks for Personalized Medicine ML4H + + +
+ Precision medicine tailored to individual patients has gained significant +attention in recent times. Machine learning techniques are now employed to +process personalized data from various sources, including images, genetics, and +assessments. These techniques have demonstrated good outcomes in many clinical +prediction tasks. Notably, the approach of constructing graphs by linking +similar patients and then applying graph neural networks (GNNs) stands out, +because related information from analogous patients is aggregated and +considered for prediction. However, selecting the appropriate edge feature to +define patient similarity and construct the graph is challenging, given that +each patient is depicted by high-dimensional features from diverse sources. +Previous studies rely on human expertise to select the edge feature, which is +neither scalable nor efficient in pinpointing crucial edge features for complex +diseases. In this paper, we propose a novel algorithm named AdaMedGraph, which can +automatically select important features to construct multiple patient +similarity graphs, and train GNNs based on these graphs as weak learners in +adaptive boosting. AdaMedGraph is evaluated on two real-world medical scenarios and +shows superior performance. + +
+
+ comment: Extended Abstract presented at Machine Learning for Health (ML4H) + symposium 2023, December 10th, 2023, New Orleans, United States, 9 pages +
+
+
+
+
+ + ☆ GeoViT: A Versatile Vision Transformer Architecture for Geospatial Image + Analysis + + +
+ Greenhouse gases are pivotal drivers of climate change, necessitating precise +quantification and source identification to foster mitigation strategies. We +introduce GeoViT, a compact vision transformer model adept in processing +satellite imagery for multimodal segmentation, classification, and regression +tasks targeting CO2 and NO2 emissions. Leveraging GeoViT, we attain superior +accuracy in estimating power generation rates, fuel type, plume coverage for +CO2, and high-resolution NO2 concentration mapping, surpassing previous +state-of-the-art models while significantly reducing model size. GeoViT +demonstrates the efficacy of vision transformer architectures in harnessing +satellite-derived data for enhanced GHG emission insights, proving instrumental +in advancing climate change monitoring and emission regulation efforts +globally. + +
+
+ comment: Extended Abstract, Preprint +
+
+
+
+
+ + ☆ CRISP: Hybrid Structured Sparsity for Class-aware Model Pruning DATE + + +
+ Machine learning pipelines for classification tasks often train a universal +model to achieve accuracy across a broad range of classes. However, a typical +user encounters only a limited selection of classes regularly. This disparity +provides an opportunity to enhance computational efficiency by tailoring models +to focus on user-specific classes. Existing works rely on unstructured pruning, +which introduces randomly distributed non-zero values in the model, making it +unsuitable for hardware acceleration. Alternatively, some approaches employ +structured pruning, such as channel pruning, but these tend to provide only +minimal compression and may lead to reduced model accuracy. In this work, we +propose CRISP, a novel pruning framework leveraging a hybrid structured +sparsity pattern that combines both fine-grained N:M structured sparsity and +coarse-grained block sparsity. Our pruning strategy is guided by a +gradient-based class-aware saliency score, allowing us to retain weights +crucial for user-specific classes. CRISP achieves high accuracy with minimal +memory consumption for popular models like ResNet-50, VGG-16, and MobileNetV2 +on ImageNet and CIFAR-100 datasets. Moreover, CRISP delivers up to 14$\times$ +reduction in latency and energy consumption compared to existing pruning +methods while maintaining comparable accuracy. Our code is available at +https://github.com/shivmgg/CRISP/. + +
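A sketch of the fine-grained N:M half of the pattern (here 2:4): within every group of four consecutive weights, keep the two with the largest saliency. CRISP ranks weights with a gradient-based class-aware saliency and adds coarse block sparsity on top; plain magnitude is used below as a stand-in.

```python
# Sketch of fine-grained N:M structured sparsity (here 2:4): within every group
# of 4 consecutive weights keep the 2 with the largest saliency and zero the rest.
# Plain weight magnitude stands in for CRISP's class-aware saliency score.
import torch

def nm_sparsity_mask(weight: torch.Tensor, n: int = 2, m: int = 4) -> torch.Tensor:
    flat = weight.reshape(-1, m)                     # groups of m consecutive weights
    saliency = flat.abs()                            # stand-in for class-aware saliency
    topk = saliency.topk(n, dim=1).indices
    mask = torch.zeros_like(flat)
    mask.scatter_(1, topk, 1.0)                      # keep n weights per group
    return mask.reshape(weight.shape)

layer = torch.nn.Linear(64, 32)
mask = nm_sparsity_mask(layer.weight.detach())
with torch.no_grad():
    layer.weight.mul_(mask)                          # apply the 2:4 pattern
print("kept fraction:", mask.mean().item())          # ~0.5
```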
+
+ comment: 6 pages, accepted in Design, Automation & Test in Europe Conference & + Exhibition (DATE) 2024 +
+
+
+
+
+ + ☆ Segmentation-Based Parametric Painting + + +
+ We introduce a novel image-to-painting method that facilitates the creation +of large-scale, high-fidelity paintings with human-like quality and stylistic +variation. To process large images and gain control over the painting process, +we introduce a segmentation-based painting process and a dynamic attention map +approach inspired by human painting strategies, allowing optimization of brush +strokes to proceed in batches over different image regions, thereby capturing +both large-scale structure and fine details, while also allowing stylistic +control over detail. Our optimized batch processing and patch-based loss +framework enable efficient handling of large canvases, ensuring our painted +outputs are both aesthetically compelling and functionally superior as compared +to previous methods, as confirmed by rigorous evaluations. Code available at: +https://github.com/manuelladron/semantic_based_painting.git + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Out-of-Distribution Generalized Dynamic Graph Neural Network with + Disentangled Intervention and Invariance Promotion + + +
+ Dynamic graph neural networks (DyGNNs) have demonstrated powerful predictive +abilities by exploiting graph structural and temporal dynamics. However, the +existing DyGNNs fail to handle distribution shifts, which naturally exist in +dynamic graphs, mainly because the patterns exploited by DyGNNs may be variant +with respect to labels under distribution shifts. In this paper, we propose +Disentangled Intervention-based Dynamic graph Attention networks with +Invariance Promotion (I-DIDA) to handle spatio-temporal distribution shifts in +dynamic graphs by discovering and utilizing invariant patterns, i.e., +structures and features whose predictive abilities are stable across +distribution shifts. Specifically, we first propose a disentangled +spatio-temporal attention network to capture the variant and invariant +patterns. By utilizing the disentangled patterns, we design a spatio-temporal +intervention mechanism to create multiple interventional distributions and an +environment inference module to infer the latent spatio-temporal environments, +and minimize the variance of predictions among these intervened distributions +and environments, so that our model can make predictions based on invariant +patterns with stable predictive abilities under distribution shifts. Extensive +experiments demonstrate the superiority of our method over state-of-the-art +baselines under distribution shifts. Our work is the first study of +spatio-temporal distribution shifts in dynamic graphs, to the best of our +knowledge. + +
+
+
+
+
+ + ☆ Pseudo-label Correction for Instance-dependent Noise Using + Teacher-student Framework + + +
+ The high capacity of deep learning models to learn complex patterns poses a +significant challenge when confronted with label noise. The inability to +differentiate clean and noisy labels ultimately results in poor generalization. +We approach this problem by reassigning the label for each image using a new +teacher-student based framework termed P-LC (pseudo-label correction). +Traditional teacher-student networks are composed of teacher and student +classifiers for knowledge distillation. In our novel approach, we reconfigure +the teacher network into a triple encoder, leveraging the triplet loss to +establish a pseudo-label correction system. As the student generates pseudo +labels for a set of given images, the teacher learns to choose between the +initially assigned labels and the pseudo labels. Experiments on MNIST, +Fashion-MNIST, and SVHN demonstrate P-LC's superior performance over existing +state-of-the-art methods across all noise levels, most notably in high noise. +In addition, we introduce a noise level estimation to help assess model +performance and inform the need for additional data cleaning procedures. + +
+
+
+
+
+ + ♻ ☆ Visual Dexterity: In-Hand Reorientation of Novel and Complex Object + Shapes + + +
+ In-hand object reorientation is necessary for performing many dexterous +manipulation tasks, such as tool use in less structured environments that +remain beyond the reach of current robots. Prior works built reorientation +systems assuming one or many of the following: reorienting only specific +objects with simple shapes, limited range of reorientation, slow or quasistatic +manipulation, simulation-only results, the need for specialized and costly +sensor suites, and other constraints which make the system infeasible for +real-world deployment. We present a general object reorientation controller +that does not make these assumptions. It uses readings from a single commodity +depth camera to dynamically reorient complex and new object shapes by any +rotation in real-time, with the median reorientation time being close to seven +seconds. The controller is trained using reinforcement learning in simulation +and evaluated in the real world on new object shapes not used for training, +including the most challenging scenario of reorienting objects held in the air +by a downward-facing hand that must counteract gravity during reorientation. +Our hardware platform only uses open-source components that cost less than five +thousand dollars. Although we demonstrate the ability to overcome assumptions +in prior work, there is ample scope for improving absolute performance. For +instance, the challenging duck-shaped object not used for training was dropped +in 56 percent of the trials. When it was not dropped, our controller reoriented +the object within 0.4 radians (23 degrees) 75 percent of the time. Videos are +available at: https://taochenshh.github.io/projects/visual-dexterity. + +
+
+ comment: Published in Science Robotics: + https://www.science.org/doi/10.1126/scirobotics.adc9244 +
+
+
+
+
+ + ♻ ☆ A path-norm toolkit for modern networks: consequences, promises and + challenges + + +
+ This work introduces the first toolkit around path-norms that is fully able +to encompass general DAG ReLU networks with biases, skip connections and any +operation based on the extraction of order statistics: max pooling, GroupSort +etc. This toolkit notably allows us to establish generalization bounds for +modern neural networks that are not only the most widely applicable path-norm +based ones, but also recover or beat the sharpest known bounds of this type. +These extended path-norms further enjoy the usual benefits of path-norms: ease +of computation, invariance under the symmetries of the network, and improved +sharpness on feedforward networks compared to the product of operators' norms, +another complexity measure most commonly used. + The versatility of the toolkit and its ease of implementation allow us to +challenge the concrete promises of path-norm-based generalization bounds, by +numerically evaluating the sharpest known bounds for ResNets on ImageNet. + +
+
+
+
+
+ + ♻ ☆ Provably Efficient High-Dimensional Bandit Learning with Batched + Feedbacks + + +
+ We study high-dimensional multi-armed contextual bandits with batched +feedback where the $T$ steps of online interactions are divided into $L$ +batches. In specific, each batch collects data according to a policy that +depends on previous batches and the rewards are revealed only at the end of the +batch. Such a feedback structure is popular in applications such as +personalized medicine and online advertisement, where the online data often do +not arrive in a fully serial manner. We consider high-dimensional and linear +settings where the reward function of the bandit model admits either a sparse +or low-rank structure and ask how small a number of batches are needed for a +comparable performance with fully dynamic data in which $L = T$. For these +settings, we design a provably sample-efficient algorithm which achieves a $ +\mathcal{\tilde O}(s_0^2 \log^2 T)$ regret in the sparse case and $ +\mathcal{\tilde O} ( r ^2 \log^2 T)$ regret in the low-rank case, using only $L += \mathcal{O}( \log T)$ batches. Here $s_0$ and $r$ are the sparsity and rank +of the reward parameter in sparse and low-rank cases, respectively, and $ +\mathcal{\tilde O}(\cdot)$ omits logarithmic factors involving the feature +dimensions. In other words, our algorithm achieves regret bounds comparable to +those in fully sequential setting with only $\mathcal{O}( \log T)$ batches. Our +algorithm features a novel batch allocation method that adjusts the batch sizes +according to the estimation accuracy within each batch and cumulative regret. +Furthermore, we also conduct experiments with synthetic and real-world data to +validate our theory. + +
+
+
+
+
+ + ♻ ☆ How Over-Parameterization Slows Down Gradient Descent in Matrix Sensing: + The Curses of Symmetry and Initialization + + +
+ This paper rigorously shows how over-parameterization changes the convergence +behaviors of gradient descent (GD) for the matrix sensing problem, where the +goal is to recover an unknown low-rank ground-truth matrix from near-isotropic +linear measurements. First, we consider the symmetric setting with the +symmetric parameterization where $M^* \in \mathbb{R}^{n \times n}$ is a +positive semi-definite unknown matrix of rank $r \ll n$, and one uses a +symmetric parameterization $XX^\top$ to learn $M^*$. Here $X \in \mathbb{R}^{n +\times k}$ with $k > r$ is the factor matrix. We give a novel $\Omega (1/T^2)$ +lower bound of randomly initialized GD for the over-parameterized case ($k >r$) +where $T$ is the number of iterations. This is in stark contrast to the +exact-parameterization scenario ($k=r$) where the convergence rate is $\exp +(-\Omega (T))$. Next, we study asymmetric setting where $M^* \in +\mathbb{R}^{n_1 \times n_2}$ is the unknown matrix of rank $r \ll +\min\{n_1,n_2\}$, and one uses an asymmetric parameterization $FG^\top$ to +learn $M^*$ where $F \in \mathbb{R}^{n_1 \times k}$ and $G \in \mathbb{R}^{n_2 +\times k}$. Building on prior work, we give a global exact convergence result +of randomly initialized GD for the exact-parameterization case ($k=r$) with an +$\exp (-\Omega(T))$ rate. Furthermore, we give the first global exact +convergence result for the over-parameterization case ($k>r$) with an +$\exp(-\Omega(\alpha^2 T))$ rate where $\alpha$ is the initialization scale. +This linear convergence result in the over-parameterization case is especially +significant because one can apply the asymmetric parameterization to the +symmetric setting to speed up from $\Omega (1/T^2)$ to linear convergence. On +the other hand, we propose a novel method that only modifies one step of GD and +obtains a convergence rate independent of $\alpha$, recovering the rate in the +exact-parameterization case. + +
+
+
+
+
+ + ♻ ☆ EGraFFBench: Evaluation of Equivariant Graph Neural Network Force Fields + for Atomistic Simulations + + +
+ Equivariant graph neural network force fields (EGraFFs) have shown great +promise in modelling complex interactions in atomic systems by exploiting the +graphs' inherent symmetries. Recent works have led to a surge in the +development of novel architectures that incorporate equivariance-based +inductive biases alongside architectural innovations like graph transformers +and message passing to model atomic interactions. However, a thorough evaluation of +deploying these EGraFFs for the downstream task of real-world atomistic +simulations is lacking. To this end, here we perform a systematic benchmarking +of 6 EGraFF algorithms (NequIP, Allegro, BOTNet, MACE, Equiformer, TorchMDNet), +with the aim of understanding their capabilities and limitations for realistic +atomistic simulations. In addition to our thorough evaluation and analysis on +eight existing datasets based on the benchmarking literature, we release two +new benchmark datasets and propose four new metrics and three challenging tasks. +The new datasets and tasks evaluate the performance of EGraFFs on +out-of-distribution data, in terms of different crystal structures, +temperatures, and new molecules. Interestingly, evaluation of the EGraFF models +based on dynamic simulations reveals that having a lower error on energy or +force does not guarantee stable or reliable simulation or faithful replication +of the atomic structures. Moreover, we find that no model clearly outperforms +other models on all datasets and tasks. Importantly, we show that the +performance of all the models on out-of-distribution datasets is unreliable, +pointing to the need for the development of a foundation model for force fields +that can be used in real-world simulations. In summary, this work establishes a +rigorous framework for evaluating machine learning force fields in the context +of atomic simulations and points to open research challenges within this +domain. + +
+
+
+
+
+ + ♻ ☆ XAutoML: A Visual Analytics Tool for Understanding and Validating + Automated Machine Learning + + +
+ In the last ten years, various automated machine learning (AutoML) systems +have been proposed to build end-to-end machine learning (ML) pipelines with +minimal human interaction. Even though such automatically synthesized ML +pipelines are able to achieve competitive performance, recent studies have +shown that users do not trust models constructed by AutoML due to missing +transparency of AutoML systems and missing explanations for the constructed ML +pipelines. In a requirements analysis study with 36 domain experts, data +scientists, and AutoML researchers from different professions with vastly +different expertise in ML, we collect detailed informational needs for AutoML. +We propose XAutoML, an interactive visual analytics tool for explaining +arbitrary AutoML optimization procedures and ML pipelines constructed by +AutoML. XAutoML combines interactive visualizations with established techniques +from explainable artificial intelligence (XAI) to make the complete AutoML +procedure transparent and explainable. By integrating XAutoML with JupyterLab, +experienced users can extend the visual analytics with ad-hoc visualizations +based on information extracted from XAutoML. We validate our approach in a user +study with the same diverse user group from the requirements analysis. All +participants were able to extract useful information from XAutoML, leading to a +significantly increased understanding of ML pipelines produced by AutoML and +the AutoML optimization itself. + +
+
+ comment: Revised version accepted at ACM TiiS Special Issue on Human-centered + Explainable AI +
+
+
+
+
+ + ♻ ☆ Dungeons and Data: A Large-Scale NetHack Dataset NeurIPS 2022 + + +
+ Recent breakthroughs in the development of agents to solve challenging +sequential decision making problems such as Go, StarCraft, or DOTA, have relied +on both simulated environments and large-scale datasets. However, progress on +this research has been hindered by the scarcity of open-sourced datasets and +the prohibitive computational cost to work with them. Here we present the +NetHack Learning Dataset (NLD), a large and highly-scalable dataset of +trajectories from the popular game of NetHack, which is both extremely +challenging for current methods and very fast to run. NLD consists of three +parts: 10 billion state transitions from 1.5 million human trajectories +collected on the NAO public NetHack server from 2009 to 2020; 3 billion +state-action-score transitions from 100,000 trajectories collected from the +symbolic bot winner of the NetHack Challenge 2021; and, accompanying code for +users to record, load and stream any collection of such trajectories in a +highly compressed form. We evaluate a wide range of existing algorithms +including online and offline RL, as well as learning from demonstrations, +showing that significant research advances are needed to fully leverage +large-scale datasets for challenging sequential decision making tasks. + +
+
+ comment: 9 pages, published in the Proceedings of the 36th Conference on + Neural Information Processing Systems (NeurIPS 2022) Track on Datasets and + Benchmarks. New links to hosting location. Revised results, same conclusions +
+
+
+
+
+ + ♻ ☆ Knowledge Accumulation in Continually Learned Representations and the + Issue of Feature Forgetting + + +
+ While it is established that neural networks suffer from catastrophic +forgetting ``at the output level'', it is debated whether this is also the case +at the level of representations. Some studies ascribe a certain level of innate +robustness to representations, that they only forget minimally and no critical +information, while others claim that representations are also severely affected +by forgetting. To settle this debate, we first discuss how this apparent +disagreement might stem from the coexistence of two phenomena that affect the +quality of continually learned representations: knowledge accumulation and +feature forgetting. We then show that, even though it is true that feature +forgetting can be small in absolute terms, newly learned information is +forgotten just as catastrophically at the level of representations as it is at +the output level. Next we show that this feature forgetting is problematic as +it substantially slows down knowledge accumulation. We further show that +representations that are continually learned through both supervised and +self-supervised learning suffer from feature forgetting. Finally, we study how +feature forgetting and knowledge accumulation are affected by different types +of continual learning methods. + +
+
+
+
+
+ + ♻ ☆ Navigating the Design Space of Equivariant Diffusion-Based Generative + Models for De Novo 3D Molecule Generation + + +
+ Deep generative diffusion models are a promising avenue for 3D de novo +molecular design in materials science and drug discovery. However, their +utility is still limited by suboptimal performance on large molecular +structures and limited training data. To address this gap, we explore the +design space of E(3)-equivariant diffusion models, focusing on previously +unexplored areas. Our extensive comparative analysis evaluates the interplay +between continuous and discrete state spaces. From this investigation, we +present the EQGAT-diff model, which consistently outperforms established models +for the QM9 and GEOM-Drugs datasets. Significantly, EQGAT-diff treats atom positions as continuous, models chemical elements and bond types as categorical variables, and uses +time-dependent loss weighting, substantially improving training convergence, +the quality of generated samples, and inference time. We also showcase that +including chemically motivated additional features like hybridization states in +the diffusion process enhances the validity of generated molecules. To further +strengthen the applicability of diffusion models to limited training data, we +investigate the transferability of EQGAT-diff trained on the large PubChem3D +dataset with implicit hydrogen atoms to target different data distributions. +Fine-tuning EQGAT-diff for just a few iterations shows an efficient +distribution shift, further improving performance across data sets. +Finally, we test our model on the Crossdocked data set for structure-based de +novo ligand generation, where it achieves state-of-the-art performance on Vina docking scores, underlining the importance of our findings. + +
+
+
+
+
+ + ♻ ☆ Generating and Imputing Tabular Data via Diffusion and Flow-based + Gradient-Boosted Trees + + +
+ Tabular data is hard to acquire and is subject to missing values. This paper +proposes a novel approach to generate and impute mixed-type (continuous and +categorical) tabular data using score-based diffusion and conditional flow +matching. Contrary to previous work that relies on neural networks to learn the +score function or the vector field, we instead rely on XGBoost, a popular +Gradient-Boosted Tree (GBT) method. We empirically show on 27 different +datasets that our approach i) generates highly realistic synthetic data when +the training dataset is either clean or tainted by missing data and ii) +generates diverse plausible data imputations. Furthermore, our method +outperforms deep-learning generation methods on data generation and is +competitive on data imputation. Finally, it can be trained in parallel using +CPUs without the need for a GPU. To make it easily accessible, we release our +code through a Python library and an R package. + +
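To make the idea concrete, here is a rough, hedged sketch of conditional flow matching with gradient-boosted trees: regress the straight-line velocity between a noise sample and a data sample with one XGBoost model per feature, conditioned on the interpolated point and the time. This is only one plausible reading of the abstract, not the released ForestDiffusion implementation (see the linked repository for the actual code):

```python
import numpy as np
from xgboost import XGBRegressor

def fit_flow_matching_gbt(x_data, n_pairs=10_000, seed=0):
    """Fit one GBT per feature to predict the flow-matching velocity x1 - x0 (illustrative only)."""
    rng = np.random.default_rng(seed)
    idx = rng.integers(0, len(x_data), size=n_pairs)
    x1 = x_data[idx]                                  # data samples
    x0 = rng.standard_normal(x1.shape)                # noise samples
    t = rng.uniform(0.0, 1.0, size=(n_pairs, 1))
    xt = (1.0 - t) * x0 + t * x1                      # linear interpolation path
    inputs = np.hstack([xt, t])
    targets = x1 - x0                                 # velocity along the path
    return [XGBRegressor(n_estimators=100).fit(inputs, targets[:, j])
            for j in range(x_data.shape[1])]

def sample(models, n_samples, dim, n_steps=50, seed=1):
    """Integrate the learned velocity field from noise to data with Euler steps."""
    rng = np.random.default_rng(seed)
    x = rng.standard_normal((n_samples, dim))
    for step in range(n_steps):
        t = np.full((n_samples, 1), step / n_steps)
        v = np.column_stack([m.predict(np.hstack([x, t])) for m in models])
        x = x + v / n_steps
    return x
```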
+
+ comment: Code: https://github.com/SamsungSAILMontreal/ForestDiffusion +
+
+
+
+
+ + ♻ ☆ Interpretable and intervenable ultrasonography-based machine learning + models for pediatric appendicitis + + +
+ Appendicitis is among the most frequent reasons for pediatric abdominal +surgeries. Previous decision support systems for appendicitis have focused on +clinical, laboratory, scoring, and computed tomography data and have ignored +abdominal ultrasound, despite its noninvasive nature and widespread +availability. In this work, we present interpretable machine learning models +for predicting the diagnosis, management and severity of suspected appendicitis +using ultrasound images. Our approach utilizes concept bottleneck models (CBM) +that facilitate interpretation and interaction with high-level concepts +understandable to clinicians. Furthermore, we extend CBMs to prediction +problems with multiple views and incomplete concept sets. Our models were +trained on a dataset comprising 579 pediatric patients with 1709 ultrasound +images accompanied by clinical and laboratory data. Results show that our +proposed method enables clinicians to utilize a human-understandable and +intervenable predictive model without compromising performance or requiring +time-consuming image annotation when deployed. For predicting the diagnosis, +the extended multiview CBM attained an AUROC of 0.80 and an AUPR of 0.92, +performing comparably to similar black-box neural networks trained and tested +on the same dataset. + +
+
+ comment: Published in Medical Image Analysis (Elsevier) +
+
+
+
+
+ + ♻ ☆ Fair Data Representation for Machine Learning at the Pareto Frontier + + +
+ As machine learning powered decision-making becomes increasingly important in +our daily lives, it is imperative to strive for fairness in the underlying data +processing. We propose a pre-processing algorithm for fair data representation +via which supervised learning results in estimations of the Pareto frontier +between prediction error and statistical disparity. Particularly, the present +work applies the optimal affine transport to approach the post-processing +Wasserstein-2 barycenter characterization of the optimal fair $L^2$-objective +supervised learning via a pre-processing data deformation. Furthermore, we show +that the Wasserstein-2 geodesics from the conditional (on sensitive +information) distributions of the learning outcome to their barycenter +characterize the Pareto frontier between $L^2$-loss and the average pairwise +Wasserstein-2 distance among sensitive groups on the learning outcome. +Numerical simulations underscore the advantages: (1) the pre-processing step is +compatible with arbitrary conditional expectation estimation supervised +learning methods and unseen data; (2) the fair representation protects the +sensitive information by limiting the inference capability of the remaining +data with respect to the sensitive data; (3) the optimal affine maps are +computationally efficient even for high-dimensional data. + +
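As a toy illustration of the affine transport idea in the simplest possible setting, the sketch below maps each sensitive group's (assumed univariate Gaussian) feature to the groups' Wasserstein-2 barycenter with an affine map. The paper handles the general multivariate, supervised-learning case, so treat this purely as intuition:

```python
import numpy as np

def affine_to_barycenter_1d(x, groups):
    """Toy sketch: map each group's (assumed Gaussian) 1D feature to the groups' W2 barycenter.

    x: 1D feature array; groups: array of sensitive-group labels of the same length.
    """
    labels, counts = np.unique(groups, return_counts=True)
    weights = counts / counts.sum()
    means = np.array([x[groups == g].mean() for g in labels])
    stds = np.array([x[groups == g].std() for g in labels])
    bary_mean, bary_std = weights @ means, weights @ stds   # barycenter of 1D Gaussians
    x_fair = np.empty_like(x, dtype=float)
    for g, m, s in zip(labels, means, stds):
        mask = groups == g
        x_fair[mask] = bary_mean + bary_std * (x[mask] - m) / s   # affine transport map per group
    return x_fair
```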
+
+ comment: 63 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ FIKIT: Priority-Based Real-time GPU Multi-tasking Scheduling with Kernel + Identification + + +
+ Highly parallelized workloads like machine learning training, inference, and +general HPC tasks are greatly accelerated using GPU devices. In a cloud +computing cluster, serving a GPU's computation power through multi-task +sharing is in high demand since there are always more task requests than the +number of GPUs available. Existing GPU sharing solutions focus on reducing +task-level waiting time or task-level switching costs when multiple jobs +compete for a single GPU. Computation requests arrive continuously with +different priorities and have asymmetric impacts on the QoS of sharing a GPU +device. Existing work has missed the kernel-level optimization opportunity brought +by this setting. To address this problem, we present a novel kernel-level +scheduling strategy called FIKIT: Filling Inter-kernel Idle Time. FIKIT +incorporates task-level priority information, fine-grained kernel +identification, and kernel measurement, allowing low-priority tasks to execute +during high-priority tasks' inter-kernel idle time, thereby filling +the GPU's device runtime more fully and reducing the overall impact of GPU sharing on cloud +services. Across a set of ML models, the FIKIT-based inference system +accelerated high-priority tasks by 1.33 to 14.87 times compared to the JCT in +GPU sharing mode, and more than half of the cases are accelerated by more than +3.5 times. Meanwhile, under preemptive sharing, the low-priority tasks have +a JCT comparable to the default GPU sharing mode, with a ratio of 0.84 to 1. We +further limit the kernel measurement and runtime fine-grained kernel scheduling +overhead to less than 10%. + +
+
+ comment: 19 pages, 18 figures. Shorten the introduction section; Move some + content from the introduction to the design section; Add Dataset References +
+
+
+
+
+ + ♻ ☆ Real Robot Challenge 2022: Learning Dexterous Manipulation from Offline + Data in the Real World + + +
+ Experimentation on real robots is demanding in terms of time and costs. For +this reason, a large part of the reinforcement learning (RL) community uses +simulators to develop and benchmark algorithms. However, insights gained in +simulation do not necessarily translate to real robots, in particular for tasks +involving complex interactions with the environment. The Real Robot Challenge +2022 therefore served as a bridge between the RL and robotics communities by +allowing participants to experiment remotely with a real robot - as easily as +in simulation. + In the last years, offline reinforcement learning has matured into a +promising paradigm for learning from pre-collected datasets, alleviating the +reliance on expensive online interactions. We therefore asked the participants +to learn two dexterous manipulation tasks involving pushing, grasping, and +in-hand orientation from provided real-robot datasets. An extensive software +documentation and an initial stage based on a simulation of the real set-up +made the competition particularly accessible. By giving each team plenty of +access budget to evaluate their offline-learned policies on a cluster of seven +identical real TriFinger platforms, we organized an exciting competition for +machine learners and roboticists alike. + In this work we state the rules of the competition, present the methods used +by the winning teams and compare their results with a benchmark of +state-of-the-art offline RL algorithms on the challenge datasets. + +
+
+ comment: Typo in author list fixed +
+
+
+
+
+ + ♻ ☆ On Neural Quantum Support Vector Machines + + +
+ In \cite{simon2023algorithms} we introduced four algorithms for the training +of neural support vector machines (NSVMs) and demonstrated their feasibility. +In this note we introduce neural quantum support vector machines, that is, +NSVMs with a quantum kernel, and extend our results to this setting. + +
+
+ comment: 16 pages, 1 figure. arXiv admin note: substantial text overlap with + arXiv:2308.07204 +
+
+
+
+
+ + ♻ ☆ Regret Analysis of Learning-Based Linear Quadratic Gaussian Control with + Additive Exploration + + +
+ In this paper, we analyze the regret incurred by a computationally efficient +exploration strategy, known as naive exploration, for controlling unknown +partially observable systems within the Linear Quadratic Gaussian (LQG) +framework. We introduce a two-phase control algorithm called LQG-NAIVE, which +involves an initial phase of injecting Gaussian input signals to obtain a +system model, followed by a second phase of an interplay between naive +exploration and control in an episodic fashion. We show that LQG-NAIVE achieves +a regret growth rate of $\tilde{\mathcal{O}}(\sqrt{T})$, i.e., +$\mathcal{O}(\sqrt{T})$ up to logarithmic factors after $T$ time steps, and we +validate its performance through numerical simulations. Additionally, we +propose LQG-IF2E, which extends the exploration signal to a `closed-loop' +setting by incorporating the Fisher Information Matrix (FIM). We provide +compelling numerical evidence of the competitive performance of LQG-IF2E +compared to LQG-NAIVE. + +
+
+
+
+
+ + ♻ ☆ Benchmarking Multimodal Variational Autoencoders: CdSprites+ Dataset and + Toolkit + + +
+ Multimodal Variational Autoencoders (VAEs) have been the subject of intense +research in the past years as they can integrate multiple modalities into a +joint representation and can thus serve as a promising tool for both data +classification and generation. Several approaches toward multimodal VAE +learning have been proposed so far, their comparison and evaluation have +however been rather inconsistent. One reason is that the models differ at the +implementation level, another problem is that the datasets commonly used in +these cases were not initially designed to evaluate multimodal generative +models. This paper addresses both mentioned issues. First, we propose a toolkit +for systematic multimodal VAE training and comparison. The toolkit currently +comprises 4 existing multimodal VAEs and 6 commonly used benchmark datasets +along with instructions on how to easily add a new model or a dataset. Second, +we present a disentangled bimodal dataset designed to comprehensively evaluate +the joint generation and cross-generation capabilities across multiple +difficulty levels. We demonstrate the utility of our dataset by comparing the +implemented state-of-the-art models. + +
+
+
+
+
+ + ♻ ☆ Sharing pattern submodels for prediction with missing values + + +
+ Missing values are unavoidable in many applications of machine learning and +present challenges both during training and at test time. When variables are +missing in recurring patterns, fitting separate pattern submodels has been +proposed as a solution. However, fitting models independently does not make +efficient use of all available data. Conversely, fitting a single shared model +to the full data set relies on imputation, which often leads to biased results +when missingness depends on unobserved factors. We propose an alternative +approach, called sharing pattern submodels, which i) makes predictions that are +robust to missing values at test time, ii) maintains or improves the predictive +power of pattern submodels, and iii) has a short description, enabling improved +interpretability. Parameter sharing is enforced through sparsity-inducing +regularization, which we prove leads to consistent estimation. Finally, we give +conditions for when a sharing model is optimal, even when both missingness and +the target outcome depend on unobserved variables. Classification and +regression experiments on synthetic and real-world data sets demonstrate that +our models achieve a favorable tradeoff between pattern specialization and +information sharing. + +
+
+
+
+
+ + ♻ ☆ PEAR: Primitive enabled Adaptive Relabeling for boosting Hierarchical + Reinforcement Learning + + +
+ Hierarchical reinforcement learning (HRL) has the potential to solve complex +long horizon tasks using temporal abstraction and increased exploration. +However, hierarchical agents are difficult to train due to inherent +non-stationarity. We present primitive enabled adaptive relabeling (PEAR), a +two-phase approach where we first perform adaptive relabeling on a few expert +demonstrations to generate efficient subgoal supervision, and then jointly +optimize HRL agents by employing reinforcement learning (RL) and imitation +learning (IL). We perform theoretical analysis to $(i)$ bound the +sub-optimality of our approach, and $(ii)$ derive a generalized plug-and-play +framework for joint optimization using RL and IL. PEAR uses a handful of expert +demonstrations and makes minimal limiting assumptions on the task structure. +Additionally, it can be easily integrated with typical model free RL algorithms +to produce a practical HRL algorithm. We perform experiments on challenging +robotic environments and show that PEAR is able to solve tasks that require +long term decision making. We empirically show that PEAR exhibits improved +performance and sample efficiency over previous hierarchical and +non-hierarchical approaches. We also perform real world robotic experiments on +complex tasks and demonstrate that PEAR consistently outperforms the baselines. + +
+
+
+
+
+ + ♻ ☆ Physics-Informed Graph Convolutional Networks: Towards a generalized + framework for complex geometries + + +
+ Since the seminal work of [9] and their Physics-Informed neural networks +(PINNs), many efforts have been devoted to solving partial differential +equations (PDEs) with Deep Learning models. However, some challenges remain, +for instance the extension of such models to complex three-dimensional +geometries, and the study of how such approaches could be combined with classical +numerical solvers. In this work, we justify the use of graph neural networks +for these problems, based on the similarity between these architectures and the +meshes used in traditional numerical techniques for solving partial +differential equations. After exposing an issue with the Physics-Informed +framework on complex geometries during the computation of PDE residuals, an +alternative procedure is proposed that combines classical numerical solvers with +the Physics-Informed framework. Finally, we propose an implementation of this +approach, which we test on a three-dimensional problem with an irregular geometry. + +
+
+
+
+
+ + ♻ ☆ An Initialization Schema for Neuronal Networks on Tabular Data + + +
+ Nowadays, many modern applications rely on heterogeneous tabular data, which +remains challenging for both regression and classification. Many +approaches have been proposed to adapt neural networks for this task, but +boosting and bagging of decision trees are still the best-performing methods +for it. In this paper, we show that a binomially initialized neural +network can be used effectively on tabular data. The proposed method is a +simple but effective approach for initializing the first hidden layer in neural +networks. We also show that this initialization schema can be used to jointly +train ensembles by adding gradient masking to batch entries and using the +binomial initialization for the last layer in a neural network. For this +purpose, we modified the hinge binary loss and the softmax loss to make them +applicable for joint ensemble training. We evaluate our approach on multiple +public datasets and showcase the improved performance compared to other neural +network-based approaches. In addition, we discuss the limitations and possible +further research of our approach for improving the applicability of neural +networks to tabular data. + Link: +https://es-cloud.cs.uni-tuebingen.de/d/8e2ab8c3fdd444e1a135/?p=%2FInitializationNeuronalNetworksTabularData&mode=list + +
+
+
+
+
+ + ♻ ☆ Upgrading VAE Training With Unlimited Data Plans Provided by Diffusion + Models + + +
+ Variational autoencoders (VAEs) are popular models for representation +learning but their encoders are susceptible to overfitting (Cremer et al., +2018) because they are trained on a finite training set instead of the true +(continuous) data distribution $p_{\mathrm{data}}(\mathbf{x})$. Diffusion +models, on the other hand, avoid this issue by keeping the encoder fixed. This +makes their representations less interpretable, but it simplifies training, +enabling accurate and continuous approximations of +$p_{\mathrm{data}}(\mathbf{x})$. In this paper, we show that overfitting +encoders in VAEs can be effectively mitigated by training on samples from a +pre-trained diffusion model. These results are somewhat unexpected as recent +findings (Alemohammad et al., 2023; Shumailov et al., 2023) observe a decay in +generative performance when models are trained on data generated by another +generative model. We analyze generalization performance, amortization gap, and +robustness of VAEs trained with our proposed method on three different data +sets. We find improvements in all metrics compared to both normal training and +conventional data augmentation methods, and we show that a modest amount of +samples from the diffusion model suffices to obtain these gains. + +
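The training recipe is easy to state in code: draw every VAE training batch from a pre-trained diffusion model rather than from a finite dataset. A minimal sketch; `diffusion_sampler` and `vae_loss` are placeholders for whichever concrete models one actually uses:

```python
import torch

def train_vae_on_diffusion_samples(vae, vae_loss, diffusion_sampler, optimizer,
                                   steps=1_000, batch_size=128):
    """Train a VAE on fresh samples from a pre-trained diffusion model instead of a fixed dataset."""
    for _ in range(steps):
        with torch.no_grad():
            batch = diffusion_sampler(batch_size)   # "unlimited data plan": new samples every step
        optimizer.zero_grad()
        loss = vae_loss(vae, batch)                 # e.g. reconstruction + KL term (ELBO)
        loss.backward()
        optimizer.step()
```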
+
+ comment: 9 pages + appendix +
+
+
+
+
+ + ♻ ☆ Physics-Constrained Neural Network for Design and Feature-Based + Optimization of Weave Architectures + + +
+ Woven fabrics play an essential role in everyday textiles for +clothing/sportswear, water filtration, and retaining walls, to reinforcements +in stiff composites for lightweight structures like aerospace, sporting, +automotive, and marine industries. Several possible combinations of weave +patterns and material choices, which comprise weave architecture, present a +challenging question about how they could influence the physical and mechanical +properties of woven fabrics and reinforced structures. In this paper, we +present a novel Physics-Constrained Neural Network (PCNN) to predict the +mechanical properties like the modulus of weave architectures and the inverse +problem of predicting pattern/material sequence for a design/target modulus +value. The inverse problem is particularly challenging as it usually requires +many iterations to find the appropriate architecture using traditional +optimization approaches. We show that the proposed PCNN can effectively predict +weave architecture for the desired modulus with higher accuracy than several +baseline models considered. We present a feature-based optimization strategy to +improve the predictions using features in the Grey Level Co-occurrence Matrix +(GLCM) space. We combine PCNN with this feature-based optimization to discover +near-optimal weave architectures to facilitate the initial design of weave +architecture. The proposed frameworks will primarily enable the woven composite +analysis and optimization process, and be a starting point to introduce +Knowledge-guided Neural Networks into the complex structural analysis. + +
+
+
+
+
+ + ♻ ☆ Scalable and Transferable Black-Box Jailbreaks for Language Models via + Persona Modulation + + +
+ Despite efforts to align large language models to produce harmless responses, +they are still vulnerable to jailbreak prompts that elicit unrestricted +behaviour. In this work, we investigate persona modulation as a black-box +jailbreaking method to steer a target model to take on personalities that are +willing to comply with harmful instructions. Rather than manually crafting +prompts for each persona, we automate the generation of jailbreaks using a +language model assistant. We demonstrate a range of harmful completions made +possible by persona modulation, including detailed instructions for +synthesising methamphetamine, building a bomb, and laundering money. These +automated attacks achieve a harmful completion rate of 42.5% in GPT-4, which is +185 times larger than before modulation (0.23%). These prompts also transfer to +Claude 2 and Vicuna with harmful completion rates of 61.0% and 35.9%, +respectively. Our work reveals yet another vulnerability in commercial large +language models and highlights the need for more comprehensive safeguards. + +
+
+
+
+
+ + ♻ ☆ Supervised Feature Compression based on Counterfactual Analysis + + +
+ Counterfactual Explanations are becoming a de-facto standard in post-hoc +interpretable machine learning. For a given classifier and an instance +classified in an undesired class, its counterfactual explanation corresponds to +small perturbations of that instance that allows changing the classification +outcome. This work aims to leverage Counterfactual Explanations to detect the +important decision boundaries of a pre-trained black-box model. This +information is used to build a supervised discretization of the features in the +dataset with a tunable granularity. Using the discretized dataset, an optimal +Decision Tree can be trained that resembles the black-box model, but that is +interpretable and compact. Numerical results on real-world datasets show the +effectiveness of the approach in terms of accuracy and sparsity. + +
+
+ comment: 30 pages, 45 figures
+
+
+
+
+ + ♻ ☆ DAS-N2N: Machine learning Distributed Acoustic Sensing (DAS) signal + denoising without clean data + + +
+ This article presents a weakly supervised machine learning method, which we +call DAS-N2N, for suppressing strong random noise in distributed acoustic +sensing (DAS) recordings. DAS-N2N requires no manually produced labels (i.e., +pre-determined examples of clean event signals or sections of noise) for +training and aims to map random noise processes to a chosen summary statistic, +such as the distribution mean, median or mode, whilst retaining the true +underlying signal. This is achieved by splicing (joining together) two fibres +hosted within a single optical cable, recording two noisy copies of the same +underlying signal corrupted by different independent realizations of random +observational noise. A deep learning model can then be trained using only these +two noisy copies of the data to produce a near fully-denoised copy. Once the +model is trained, only noisy data from a single fibre is required. Using a +dataset from a DAS array deployed on the surface of the Rutford Ice Stream in +Antarctica, we demonstrate that DAS-N2N greatly suppresses incoherent noise and +enhances the signal-to-noise ratios (SNR) of natural microseismic icequake +events. We further show that this approach is inherently more efficient and +effective than standard stop/pass band and white noise (e.g., Wiener) filtering +routines, as well as a comparable self-supervised learning method based on +masking individual DAS channels. Our preferred model for this task is +lightweight, processing 30 seconds of data recorded at a sampling frequency of +1000 Hz over 985 channels (approx. 1 km of fiber) in $<$1 s. Due to the high +noise levels in DAS recordings, efficient data-driven denoising methods, such +as DAS-N2N, will prove essential to time-critical DAS earthquake detection, +particularly in the case of microseismic monitoring. + +
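The training scheme described above is Noise2Noise-style: one noisy fibre recording is the input and the other is the target. A minimal sketch assuming the two recordings are already aligned as paired tensors; the tiny network and shapes are placeholders, not the authors' architecture:

```python
import torch
import torch.nn as nn

# Tiny 1D convolutional denoiser standing in for the lightweight model described in the abstract.
model = nn.Sequential(
    nn.Conv1d(1, 16, kernel_size=9, padding=4), nn.ReLU(),
    nn.Conv1d(16, 16, kernel_size=9, padding=4), nn.ReLU(),
    nn.Conv1d(16, 1, kernel_size=9, padding=4),
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train_step(copy_a, copy_b):
    """copy_a, copy_b: (batch, 1, samples) recordings of the same signal from the two spliced fibres."""
    optimizer.zero_grad()
    loss = nn.functional.mse_loss(model(copy_a), copy_b)  # map one noisy copy to the other
    loss.backward()
    optimizer.step()
    return loss.item()

# Toy usage with random tensors standing in for real DAS channels.
loss = train_step(torch.randn(4, 1, 1024), torch.randn(4, 1, 1024))
```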
+
+ comment: Submitted for publication to Geophysical Journal International. For + the purpose of open access, the author(s) has applied a Creative Commons + Attribution (CC BY) licence to the Author Accepted Manuscript version arising + from this submission +
+
+
+
+
+ + ♻ ☆ Proactive DP: A Multiple Target Optimization Framework for DP-SGD + + +
+ We introduce a multiple target optimization framework for DP-SGD referred to +as proactive DP. In contrast to traditional DP accountants, which are used to +track the expenditure of privacy budgets, the proactive DP scheme allows one +to {\it a-priori} select parameters of DP-SGD based on a fixed privacy budget +(in terms of $\epsilon$ and $\delta$) in such a way as to optimize the anticipated +utility (test accuracy) the most. To achieve this objective, we first propose +significant improvements to the moments accountant method, presenting a closed-form +$(\epsilon,\delta)$-DP guarantee that connects all parameters in the DP-SGD +setup. Generally, DP-SGD is $(\epsilon\leq 1/2,\delta=1/N)$-DP if +$\sigma=\sqrt{2(\epsilon +\ln(1/\delta))/\epsilon}$ with $T$ at least $\approx 2k^2/\epsilon$ and $(2/e)^2k^2-1/2\geq \ln(N)$, where $T$ is the total number +of rounds, and $K=kN$ is the total number of gradient computations where $k$ +measures $K$ in number of epochs of size $N$ of the local data set. We prove +that our expression is close to tight in that if $T$ is more than a constant +factor $\approx 4$ smaller than the lower bound $\approx 2k^2/\epsilon$, then +the $(\epsilon,\delta)$-DP guarantee is violated. Our enhanced DP theory allows +us to create a utility graph and DP calculator. These tools link privacy and +utility objectives and search for optimal experiment setups, efficiently taking +into account both accuracy and privacy objectives, as well as implementation +goals. We furnish a comprehensive implementation flow of our proactive DP, with +rigorous experiments to showcase the proof-of-concept. + +
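The closed-form expressions quoted above translate directly into a small calculator. The helper below only evaluates the stated formulas for $\sigma$, the approximate lower bound on $T$, and the side condition; the paper's full guarantee has additional conditions, so treat this as a sketch:

```python
import math

def proactive_dp_parameters(epsilon, N, k):
    """Evaluate the closed-form DP-SGD parameters quoted in the abstract.

    epsilon : target privacy budget (the abstract assumes epsilon <= 1/2)
    N       : size of the local data set, with delta fixed to 1/N
    k       : number of epochs, so the total number of gradient computations is K = k * N
    """
    delta = 1.0 / N
    sigma = math.sqrt(2.0 * (epsilon + math.log(1.0 / delta)) / epsilon)
    T_min = 2.0 * k ** 2 / epsilon          # approximate lower bound on the number of rounds
    condition = (2.0 / math.e) ** 2 * k ** 2 - 0.5 >= math.log(N)
    return {"sigma": sigma, "delta": delta, "T_min": T_min, "condition_holds": condition}

print(proactive_dp_parameters(epsilon=0.5, N=60_000, k=3))
```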
+
+ comment: arXiv admin note: text overlap with arXiv:2007.09208, changes in + contents and title +
+
+
+
+
+ + ♻ ☆ Neural Algorithmic Reasoning for Combinatorial Optimisation + + +
+ Solving NP-hard/complete combinatorial optimisation (CO) problems with neural networks is a +challenging research area that aims to surpass classical approximate +algorithms. The long-term objective is to outperform hand-designed heuristics +for NP-hard/complete problems by learning to generate superior solutions solely +from training data. Current neural-based methods for solving CO problems often +overlook the inherent "algorithmic" nature of the problems. In contrast, +heuristics designed for CO problems, e.g. TSP, frequently leverage +well-established algorithms, such as those for finding the minimum spanning +tree. In this paper, we propose leveraging recent advancements in neural +algorithmic reasoning to improve the learning of CO problems. Specifically, we +suggest pre-training our neural model on relevant algorithms before training it +on CO instances. Our results demonstrate that by using this learning setup, we +achieve superior performance compared to non-algorithmically informed deep +learning models. + +
+
+
+
+
+ + ♻ ☆ Towards a more inductive world for drug repurposing approaches + + +
+ Drug-target interaction (DTI) prediction is a challenging, albeit essential +task in drug repurposing. Learning on graph models has drawn special attention, +as such models can significantly reduce drug repurposing costs and time commitment. +However, many current approaches require highly demanding additional information +besides DTIs, which complicates their evaluation process and usability. +Additionally, structural differences in the learning architecture of current +models hinder their fair benchmarking. In this work, we first perform an +in-depth evaluation of current DTI datasets and prediction models through a +robust benchmarking process, and show that DTI prediction methods based on +transductive models lack generalization and lead to inflated performance when +evaluated as previously done in the literature, hence not being suited for drug +repurposing approaches. We then propose a novel biologically-driven strategy +for negative edge subsampling and show through in vitro validation that newly +discovered interactions are indeed true. We envision this work as the +underpinning for future fair benchmarking and robust model design. All +generated resources and tools are publicly available as a Python package. + +
+
+
+
+
+ + ♻ ☆ The Noise Geometry of Stochastic Gradient Descent: A Quantitative and + Analytical Characterization + + +
+ Empirical studies have demonstrated that the noise in stochastic gradient +descent (SGD) aligns favorably with the local geometry of the loss landscape. +However, theoretical and quantitative explanations for this phenomenon remain +sparse. In this paper, we offer a comprehensive theoretical investigation into +the aforementioned {\em noise geometry} for over-parameterized linear models +(OLMs) and two-layer neural networks. We scrutinize both average and +directional alignments, paying special attention to how factors like sample +size and input data degeneracy affect the alignment strength. As a specific +application, we leverage our noise geometry characterizations to study how SGD +escapes from sharp minima, revealing that the escape direction has significant +components along flat directions. This is in stark contrast to GD, which +escapes only along the sharpest directions. To substantiate our theoretical +findings, both synthetic and real-world experiments are provided. + +
+
+ comment: 31 pages +
+
+
+
+
+ + ♻ ☆ A Bayesian Take on Gaussian Process Networks + + +
+ Gaussian Process Networks (GPNs) are a class of directed graphical models +which employ Gaussian processes as priors for the conditional expectation of +each variable given its parents in the network. The model allows the +description of continuous joint distributions in a compact but flexible manner +with minimal parametric assumptions on the dependencies between variables. +Bayesian structure learning of GPNs requires computing the posterior over +graphs of the network and is computationally infeasible even in low dimensions. +This work implements Monte Carlo and Markov Chain Monte Carlo methods to sample +from the posterior distribution of network structures. As such, the approach +follows the Bayesian paradigm, comparing models via their marginal likelihood +and computing the posterior probability of the GPN features. Simulation studies +show that our method outperforms state-of-the-art algorithms in recovering the +graphical structure of the network and provides an accurate approximation of +its posterior distribution. + +
+
+
+
+
+ + ♻ ☆ Collective Relational Inference for learning heterogeneous interactions + + +
+ Interacting systems are ubiquitous in nature and engineering, ranging from +particle dynamics in physics to functionally connected brain regions. These +interacting systems can be modeled by graphs where edges correspond to the +interactions between interactive entities. Revealing interaction laws is of +fundamental importance but also particularly challenging due to underlying +configurational complexities. The associated challenges become exacerbated for +heterogeneous systems that are prevalent in reality, where multiple interaction +types coexist simultaneously and relational inference is required. Here, we +propose a novel probabilistic method for relational inference, which possesses +two distinctive characteristics compared to existing methods. First, it infers +the interaction types of different edges collectively, and second, it allows +handling systems with variable topological structure over time. We evaluate the +proposed methodology across several benchmark datasets and demonstrate that it +outperforms existing methods in accurately inferring interaction types. We +further show that when combined with known constraints, it allows us, for +example, to discover physics-consistent interaction laws of particle systems. +Overall the proposed model is data-efficient and generalizable to large systems +when trained on smaller ones. The developed methodology constitutes a key +element for understanding interacting systems and may find application in graph +structure learning. + +
+
+ comment: Under review. Links to the supporting code can be found at the end of + the main content +
+
+
+
+
+ + ♻ ☆ Transport with Support: Data-Conditional Diffusion Bridges + + +
+ The dynamic Schr\"odinger bridge problem provides an appealing setting for +solving constrained time-series data generation tasks posed as optimal +transport problems. It consists of learning non-linear diffusion processes +using efficient iterative solvers. Recent works have demonstrated +state-of-the-art results (eg. in modelling single-cell embryo RNA sequences or +sampling from complex posteriors) but are limited to learning bridges with only +initial and terminal constraints. Our work extends this paradigm by proposing +the Iterative Smoothing Bridge (ISB). We integrate Bayesian filtering and +optimal control into learning the diffusion process, enabling the generation of +constrained stochastic processes governed by sparse observations at +intermediate stages and terminal constraints. We assess the effectiveness of +our method on synthetic and real-world data generation tasks and we show that +the ISB generalises well to high-dimensional data, is computationally +efficient, and provides accurate estimates of the marginals at intermediate and +terminal times. + +
+
+ comment: 27 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Particle Guidance: non-I.I.D. Diverse Sampling with Diffusion Models + + +
+ In light of the widespread success of generative models, a significant amount +of research has gone into speeding up their sampling time. However, generative +models are often sampled multiple times to obtain a diverse set incurring a +cost that is orthogonal to sampling time. We tackle the question of how to +improve diversity and sample efficiency by moving beyond the common assumption +of independent samples. We propose particle guidance, an extension of +diffusion-based generative sampling where a joint-particle time-evolving +potential enforces diversity. We analyze theoretically the joint distribution +that particle guidance generates, how to learn a potential that achieves +optimal diversity, and the connections with methods in other disciplines. +Empirically, we test the framework both in the setting of conditional image +generation, where we are able to increase diversity without affecting quality, +and molecular conformer generation, where we reduce the state-of-the-art median +error by 13% on average. + +
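A rough sketch of the joint-particle potential idea: at each reverse-diffusion step, subtract from the per-particle score the gradient of a pairwise repulsive (RBF) potential so that particles are pushed apart. The kernel choice, guidance scale, and `score_fn` signature below are illustrative assumptions, not the paper's exact formulation:

```python
import torch

def particle_guidance_step(x, t, score_fn, step_size=0.01, guidance_scale=0.1, bandwidth=1.0):
    """One illustrative update for a batch of particles x with an added diversity potential.

    x: (n_particles, dim) current particles; score_fn(x, t) returns the model score per particle.
    """
    x = x.detach().requires_grad_(True)
    sq_dists = torch.cdist(x, x) ** 2                      # pairwise squared distances
    potential = torch.exp(-sq_dists / bandwidth).sum()     # RBF potential: large when particles cluster
    (grad_potential,) = torch.autograd.grad(potential, x)  # subtracting this gradient pushes particles apart
    with torch.no_grad():
        drift = score_fn(x, t) - guidance_scale * grad_potential
        return x + step_size * drift

# Toy usage with a standard-normal score standing in for a trained diffusion model.
particles = torch.randn(16, 2)
particles = particle_guidance_step(particles, t=0.5, score_fn=lambda x, t: -x)
```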
+
+
+
+
+ + ♻ ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
+
+ comment: This paper integrates the works arXiv:2306.01129 and arXiv:2308.16271 + into a complete story. In this paper, we improve the writing and + organization, and also add conceptual, empirical, and theoretical + improvements over the previous work. V2: small typo fixes and formatting + improvements +
+
+
+
+
+ + ♻ ☆ Graph of Thoughts: Solving Elaborate Problems with Large Language Models + + +
+ We introduce Graph of Thoughts (GoT): a framework that advances prompting +capabilities in large language models (LLMs) beyond those offered by paradigms +such as Chain-of-Thought or Tree of Thoughts (ToT). The key idea and primary +advantage of GoT is the ability to model the information generated by an LLM as +an arbitrary graph, where units of information ("LLM thoughts") are vertices, +and edges correspond to dependencies between these vertices. This approach +enables combining arbitrary LLM thoughts into synergistic outcomes, distilling +the essence of whole networks of thoughts, or enhancing thoughts using feedback +loops. We illustrate that GoT offers advantages over state of the art on +different tasks, for example increasing the quality of sorting by 62% over ToT, +while simultaneously reducing costs by >31%. We ensure that GoT is extensible +with new thought transformations and thus can be used to spearhead new +prompting schemes. This work brings the LLM reasoning closer to human thinking +or brain mechanisms such as recurrence, both of which form complex networks. + +
+
+
+
+
+ + ♻ ☆ Non-stationary Transformers: Exploring the Stationarity in Time Series + Forecasting + + +
+ Transformers have shown great power in time series forecasting due to their +global-range modeling ability. However, their performance can degenerate +terribly on non-stationary real-world data in which the joint distribution +changes over time. Previous studies primarily adopt stationarization to +attenuate the non-stationarity of original series for better predictability. +But the stationarized series deprived of inherent non-stationarity can be less +instructive for real-world bursty events forecasting. This problem, termed +over-stationarization in this paper, leads Transformers to generate +indistinguishable temporal attentions for different series and impedes the +predictive capability of deep models. To tackle the dilemma between series +predictability and model capability, we propose Non-stationary Transformers as +a generic framework with two interdependent modules: Series Stationarization +and De-stationary Attention. Concretely, Series Stationarization unifies the +statistics of each input and converts the output with restored statistics for +better predictability. To address the over-stationarization problem, +De-stationary Attention is devised to recover the intrinsic non-stationary +information into temporal dependencies by approximating distinguishable +attentions learned from raw series. Our Non-stationary Transformers framework +consistently boosts mainstream Transformers by a large margin, which reduces +MSE by 49.43% on Transformer, 47.34% on Informer, and 46.89% on Reformer, +making them the state-of-the-art in time series forecasting. Code is available +at this repository: https://github.com/thuml/Nonstationary_Transformers. + +
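Series Stationarization as described above amounts to per-instance normalization before the forecaster and de-normalization of its output; De-stationary Attention is not reproduced here. A minimal sketch with placeholder shapes and a dummy forecaster:

```python
import torch

def stationarize(x, eps=1e-5):
    """Normalize each input series to zero mean and unit variance (per instance, per variable).

    x: tensor of shape (batch, length, variables)
    """
    mean = x.mean(dim=1, keepdim=True)
    std = x.std(dim=1, keepdim=True) + eps
    return (x - mean) / std, (mean, std)

def destationarize(y, stats):
    """Restore the stored statistics on the forecaster's output."""
    mean, std = stats
    return y * std + mean

x = torch.randn(8, 96, 7)                 # toy batch of input series
x_norm, stats = stationarize(x)
forecaster = lambda z: z[:, -24:, :]      # placeholder standing in for a Transformer forecaster
y = destationarize(forecaster(x_norm), stats)
```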
+
+
+
+
+ + ♻ ☆ Using Stochastic Gradient Descent to Smooth Nonconvex Functions: + Analysis of Implicit Graduated Optimization with Optimal Noise Scheduling + + +
+ The graduated optimization approach is a heuristic method for finding +globally optimal solutions for nonconvex functions and has been theoretically +analyzed in several studies. This paper defines a new family of nonconvex +functions for graduated optimization, discusses their sufficient conditions, +and provides a convergence analysis of the graduated optimization algorithm for +them. It shows that stochastic gradient descent (SGD) with mini-batch +stochastic gradients has the effect of smoothing the function, the degree of +which is determined by the learning rate and batch size. This finding provides +theoretical insights on why large batch sizes fall into sharp local minima, why +decaying learning rates and increasing batch sizes are superior to fixed +learning rates and batch sizes, and what the optimal learning rate scheduling +is. To the best of our knowledge, this is the first paper to provide a +theoretical explanation for these aspects. Moreover, a new graduated +optimization framework that uses a decaying learning rate and increasing batch +size is analyzed and experimental results of image classification that support +our theoretical findings are reported. + +
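As a minimal, assumption-laden sketch of the analyzed framework, the schedule below simply decays the learning rate while growing the batch size each epoch (the constants are placeholders, not values from the paper):

```python
def graduated_schedule(epoch, lr0=0.1, batch0=32, lr_decay=0.9, batch_growth=1.2, max_batch=4096):
    """Decaying learning rate and increasing batch size per epoch; constants are illustrative only."""
    lr = lr0 * lr_decay ** epoch
    batch_size = min(int(batch0 * batch_growth ** epoch), max_batch)
    return lr, batch_size

for epoch in range(5):
    print(epoch, graduated_schedule(epoch))
```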
+
+ comment: The latest version was updated on Nov. 24 +
+
+
+
+
+ + ♻ ☆ Fast, Expressive SE$(n)$ Equivariant Networks through Weight-Sharing in + Position-Orientation Space + + +
+ Based on the theory of homogeneous spaces we derive \textit{geometrically +optimal edge attributes} to be used within the flexible message passing +framework. We formalize the notion of weight sharing in convolutional networks +as the sharing of message functions over point-pairs that should be treated +equally. We define equivalence classes of point-pairs that are identical up to +a transformation in the group and derive attributes that uniquely identify +these classes. Weight sharing is then obtained by conditioning message +functions on these attributes. As an application of the theory, we develop an +efficient equivariant group convolutional network for processing 3D point +clouds. The theory of homogeneous spaces tells us how to do group convolutions +with feature maps over the homogeneous space of positions $\mathbb{R}^3$, +position and orientations $\mathbb{R}^3 {\times} S^2$, and the group SE$(3)$ +itself. Among these, $\mathbb{R}^3 {\times} S^2$ is an optimal choice due to +the ability to represent directional information, which $\mathbb{R}^3$ methods +cannot, and it significantly enhances computational efficiency compared to +indexing features on the full SE$(3)$ group. We empirically support this claim +by reaching state-of-the-art results -- in accuracy and speed -- on three +different benchmarks: interatomic potential energy prediction, trajectory +forecasting in N-body systems, and generating molecules via equivariant +diffusion models. + +
+
+ comment: Our code is publicly available at https://github.com/ebekkers/ponita +
+
+
+
+
+ + ♻ ☆ Accurate battery lifetime prediction across diverse aging conditions + with deep learning + + +
+ Accurately predicting the lifetime of battery cells in early cycles holds +tremendous value for battery research and development as well as numerous +downstream applications. This task is rather challenging because diverse +conditions, such as electrode materials, operating conditions, and working +environments, collectively determine complex capacity-degradation behaviors. +However, current prediction methods are developed and validated under limited +aging conditions, resulting in questionable adaptability to varied aging +conditions and an inability to fully benefit from historical data collected +under different conditions. Here we introduce a universal deep learning +approach that is capable of accommodating various aging conditions and +facilitating effective learning under low-resource conditions by leveraging +data from rich conditions. Our key finding is that incorporating inter-cell +feature differences, rather than solely considering single-cell +characteristics, significantly increases the accuracy of battery lifetime +prediction and its cross-condition robustness. Accordingly, we develop a +holistic learning framework accommodating both single-cell and inter-cell +modeling. A comprehensive benchmark is built for evaluation, encompassing 401 +battery cells utilizing 5 prevalent electrode materials across 168 cycling +conditions. We demonstrate remarkable capabilities in learning across diverse +aging conditions, exclusively achieving 10% prediction error using the first +100 cycles, and in facilitating low-resource learning, almost halving the error +of single-cell modeling in many cases. More broadly, by breaking the learning +boundaries among different aging conditions, our approach could significantly +accelerate the development and optimization of lithium-ion batteries. + +
+
+
+
+
+ + ♻ ☆ A New Type Of Upper And Lower Bounds On Right-Tail Probabilities Of + Continuous Random Variables + + +
+ In this paper, I present a completely new type of upper and lower bounds on +the right-tail probabilities of continuous random variables with unbounded +support and with semi-bounded support from the left. The presented upper and +lower right-tail bounds depend only on the probability density function (PDF), +its first derivative, and two parameters that are used for tightening the +bounds. These tail bounds hold under certain conditions that depend on the PDF, +its first and second derivatives, and the two parameters. The new tail bounds +are shown to be tight for a wide range of continuous random variables via +numerical examples. + +
+
+ comment: Minor typos corrected +
+
+
+
+
+ + ♻ ☆ Reward Dropout Improves Control: Bi-objective Perspective on Reinforced + LM + + +
+ We study the theoretical aspects of Reinforced Language Models (RLMs) from a +bi-objective optimization perspective. Specifically, we consider RLMs as a +Pareto optimization problem that simultaneously maximizes two conflicting objectives, +i.e., the reward and likelihood objectives. Our main +contribution consists of three parts. First, we establish the theoretical +foundations of RLMs as a Pareto optimization problem by presenting Reward Upper +BOund (RUBO) and Pareto optimality. Our theoretical outcomes are supported by +not only deductive proofs but also empirical results. Second, we propose Reward +Dropout, a simple yet powerful method that is guaranteed to improve the bi-objective +optimization of RLMs. Lastly, we demonstrate that Reward Dropout is +consistently effective across five benchmark datasets and four benchmark LLMs, +meaning that Reward Dropout significantly improves the optimization +performance of RLMs. + +
+
+ comment: 29 pages, 13 figures, conference +
+
+
+
+
+ + ♻ ☆ Inverse Approximation Theory for Nonlinear Recurrent Neural Networks + + +
+ We prove an inverse approximation theorem for the approximation of nonlinear +sequence-to-sequence relationships using recurrent neural networks (RNNs). This +is a so-called Bernstein-type result in approximation theory, which deduces +properties of a target function under the assumption that it can be effectively +approximated by a hypothesis space. In particular, we show that nonlinear +sequence relationships that can be stably approximated by nonlinear RNNs must +have an exponential decaying memory structure - a notion that can be made +precise. This extends the previously identified curse of memory in linear RNNs +into the general nonlinear setting, and quantifies the essential limitations of +the RNN architecture for learning sequential relationships with long-term +memory. Based on the analysis, we propose a principled reparameterization +method to overcome the limitations. Our theoretical results are confirmed by +numerical experiments. The code has been released in +https://github.com/radarFudan/Curse-of-memory + +
+
+
+
+
+ + ♻ ☆ Task-Robust Pre-Training for Worst-Case Downstream Adaptation + + +
+ Pre-training has achieved remarkable success when transferred to downstream +tasks. In machine learning, we care not only about a model's good performance +but also about its behavior under reasonable shifts of conditions. The same +philosophy holds when pre-training a foundation model. However, the foundation +model may not behave uniformly well across a series of related downstream tasks. +This happens, for example, in mask recovery regression when the +recovery targets or the training instances diverge: pattern features dominate +pre-training, while semantic features are also required +by a downstream task. This paper considers pre-training a model that guarantees +uniformly good performance over the downstream tasks. We call this goal +$\textit{downstream-task robustness}$. Our method first separates the upstream +task into several representative ones and applies a simple minimax loss for +pre-training. We then design an efficient algorithm to solve the minimax loss +and prove its convergence in the convex setting. In the experiments, we show, +on both large-scale natural language processing and computer vision datasets, that +our method improves the metrics on worst-case downstream tasks. Additionally, +some theoretical explanations for why our loss is beneficial are provided. +Specifically, we show fewer samples are inherently required for the most +challenging downstream task in some cases. + +
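The minimax idea can be sketched in a few lines: split the upstream data into representative tasks and minimize the worst per-task loss. The names and shapes below are placeholders, and the paper's actual algorithm for solving the minimax problem is more involved:

```python
import torch

def worst_case_pretraining_loss(model, task_batches, loss_fn):
    """Return the maximum loss over a list of representative upstream tasks.

    task_batches: list of (inputs, targets) pairs, one per representative task.
    """
    per_task = torch.stack([loss_fn(model(x), y) for x, y in task_batches])
    return per_task.max()          # minimizing this is the minimax (worst-case) objective
```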
+
+
+
+
+ + ♻ ☆ Infinite forecast combinations based on Dirichlet process + + +
+ Forecast combination integrates information from various sources by +consolidating multiple forecast results from the target time series. Instead of +the need to select a single optimal forecasting model, this paper introduces a +deep learning ensemble forecasting model based on the Dirichlet process. +Initially, the learning rate is sampled with three basis distributions as +hyperparameters to convert the infinite mixture into a finite one. All +checkpoints are collected to establish a deep learning sub-model pool, and +weight adjustment and diversity strategies are developed during the combination +process. The main advantage of this method is its ability to generate the +required base learners through a single training process, utilizing the +decaying strategy to tackle the challenge posed by the stochastic nature of +gradient descent in determining the optimal learning rate. To ensure the +method's generalizability and competitiveness, this paper conducts an empirical +analysis using the weekly dataset from the M4 competition and explores +sensitivity to the number of models to be combined. The results demonstrate +that the ensemble model proposed offers substantial improvements in prediction +accuracy and stability compared to a single benchmark model. + +
+
+
+
+
+ + ♻ ☆ Signal Processing Meets SGD: From Momentum to Filter + + +
+ In the field of deep learning, Stochastic Gradient Descent (SGD) and its +momentum-based variants are the predominant choices for optimization +algorithms. Despite all that, these momentum strategies, which accumulate +historical gradients by using a fixed $\beta$ hyperparameter to smooth the +optimization process, often neglect the potential impact of the variance of +historical gradients on the current gradient estimation. Fluctuations in the gradient +variance during training indicate that the objective function does not +satisfy the Lipschitz continuity condition at all times, which makes optimization +troublesome. This paper aims to explore the potential +benefits of reducing the variance of historical gradients to make the optimizer +converge to flat solutions. Moreover, we propose a new optimization method +based on variance reduction. We employ Wiener filter theory to enhance +the first moment estimation of SGD, notably introducing an adaptive weight into the +optimizer. Specifically, the adaptive weight dynamically changes along with the +temporal fluctuation of gradient variance during deep learning model training. +Experimental results demonstrate that our proposed adaptive weight optimizer, SGDF +(Stochastic Gradient Descent With Filter), achieves satisfactory performance +compared with state-of-the-art optimizers. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2010.07468 by other authors +
+
+
+
+
+ + ♻ ☆ Why do Angular Margin Losses work well for Semi-Supervised Anomalous + Sound Detection? + + +
+ State-of-the-art anomalous sound detection systems often utilize angular +margin losses to learn suitable representations of acoustic data using an +auxiliary task, which usually is a supervised or self-supervised classification +task. The underlying idea is that, in order to solve this auxiliary task, +specific information about normal data needs to be captured in the learned +representations and that this information is also sufficient to differentiate +between normal and anomalous samples. Especially in noisy conditions, +discriminative models based on angular margin losses tend to significantly +outperform systems based on generative or one-class models. The goal of this +work is to investigate why using angular margin losses with auxiliary tasks +works well for detecting anomalous sounds. To this end, it is shown, both +theoretically and experimentally, that minimizing angular margin losses also +minimizes compactness loss while inherently preventing learning trivial +solutions. Furthermore, multiple experiments are conducted to show that using a +related classification task as an auxiliary task teaches the model to learn +representations suitable for detecting anomalous sounds in noisy conditions. +Among these experiments are performance evaluations, visualizing the embedding +space with t-SNE and visualizing the input representations with respect to the +anomaly score using randomized input sampling for explanation. + +
+
+
+
+
+ + ♻ ☆ Imitation Bootstrapped Reinforcement Learning + + +
+ Despite the considerable potential of reinforcement learning (RL), robotics +control tasks predominantly rely on imitation learning (IL) owing to its better +sample efficiency. However, given the high cost of collecting extensive +demonstrations, RL is still appealing if it can utilize limited imitation data +for efficient autonomous self-improvement. Existing RL methods that utilize +demonstrations either initialize the replay buffer with demonstrations and +oversample them during RL training, which does not benefit from the +generalization potential of modern IL methods, or pretrain the RL policy with +IL on the demonstrations, which requires additional mechanisms to prevent +catastrophic forgetting during RL fine-tuning. We propose imitation +bootstrapped reinforcement learning (IBRL), a novel framework that first trains +an IL policy on a limited number of demonstrations and then uses it to propose +alternative actions for both online exploration and target value bootstrapping. +IBRL achieves SoTA performance and sample efficiency on 7 challenging sparse +reward continuous control tasks in simulation while learning directly from +pixels. As a highlight of our method, IBRL achieves $6.4\times$ higher success +rate than RLPD, a strong method that combines the idea of oversampling +demonstrations with modern RL improvements, under the budget of 10 demos and +100K interactions in the challenging PickPlaceCan task in the Robomimic +benchmark. + +
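One way to read the bootstrapping idea: when forming the TD target, evaluate both the RL actor's action and the IL policy's proposal with the target critic and bootstrap from whichever looks better. The sketch below is a hedged reading of the abstract with hypothetical network handles, not the authors' implementation:

```python
import torch

def ibrl_td_target(reward, next_obs, done, critic_target, rl_actor, il_policy, gamma=0.99):
    """Target value bootstrapping that also considers the imitation policy's proposed action."""
    with torch.no_grad():
        a_rl = rl_actor(next_obs)                     # action proposed by the RL actor
        a_il = il_policy(next_obs)                    # alternative action proposed by the IL policy
        q_rl = critic_target(next_obs, a_rl)
        q_il = critic_target(next_obs, a_il)
        q_next = torch.maximum(q_rl, q_il)            # bootstrap from whichever action looks better
        return reward + gamma * (1.0 - done) * q_next
```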
+
+
+
+
+ + ♻ ☆ Bounding Box-based Multi-objective Bayesian Optimization of Risk + Measures under Input Uncertainty + + +
+ In this study, we propose a novel multi-objective Bayesian optimization
+(MOBO) method to efficiently identify the Pareto front (PF) defined by risk
+measures for black-box functions in the presence of input uncertainty (IU).
+Existing BO methods for Pareto optimization in the presence of IU are
+risk-specific or lack theoretical guarantees, whereas our proposed method
+addresses general risk measures and has theoretical guarantees. The basic idea
+of the proposed method is to assume a Gaussian process (GP) model for the
+black-box function and to construct high-probability bounding boxes for the
+risk measures using the GP model. Furthermore, in order to reduce the
+uncertainty of non-dominated bounding boxes, we propose a method of selecting
+the next evaluation point using a maximin distance defined by the maximum value
+of a quasi distance based on bounding boxes. As a theoretical analysis, we
+prove that the algorithm returns an arbitrarily accurate solution in a finite
+number of iterations with high probability, for various risk measures such as
+Bayes risk, worst-case risk, and value-at-risk. We also give a theoretical
+analysis that accounts for approximation errors, because non-negligible
+approximation errors (e.g., finite approximation of PFs and sampling-based
+approximation of bounding boxes) exist in practice. Through numerical
+experiments, we confirm that the proposed method outperforms existing methods
+not only in the setting with IU but also in the ordinary MOBO setting. + +
+
+ comment: 39 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Towards Reliable Uncertainty Quantification via Deep Ensembles in + Multi-output Regression Task + + +
+ This study aims to comprehensively investigate the deep ensemble approach, an
+approximate Bayesian inference method, in the multi-output regression task of
+predicting the aerodynamic performance of a missile configuration. To this end,
+the effect of the number of neural networks used in the ensemble, which has
+been chosen blindly in previous studies, is scrutinized. As a result, a clear
+trend towards underestimation of uncertainty as the ensemble size increases is
+observed for the first time, and in this context, we propose a deep ensemble
+framework that applies a post-hoc calibration method to improve its uncertainty
+quantification performance. It is compared with Gaussian process regression and
+is shown to have superior performance in terms of regression accuracy
+($\uparrow55\sim56\%$), reliability of estimated uncertainty
+($\uparrow38\sim77\%$), and training efficiency ($\uparrow78\%$). Finally, the
+potential impact of the suggested framework on Bayesian optimization is briefly
+examined, indicating that a deep ensemble without calibration may lead to
+unintended exploratory behavior. This UQ framework can be seamlessly applied
+and extended to any regression task, as no special assumptions have been made
+for the specific problem used in this study. + +
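+ The deep-ensemble prediction being calibrated can be sketched as below; the
+`predict` interface on the member models and the scalar calibration factor are
+our own illustrative assumptions, not the paper's exact calibration scheme.
+
+import numpy as np
+
+def ensemble_predict(models, x, calibration_factor=1.0):
+    # Stack member predictions: shape (n_members, n_samples, n_outputs).
+    preds = np.stack([m.predict(x) for m in models], axis=0)
+    mean = preds.mean(axis=0)                     # point estimate
+    std = preds.std(axis=0) * calibration_factor  # post-hoc rescaled uncertainty
+    return mean, std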
+
+
+
+
+ + ♻ ☆ Soft Random Sampling: A Theoretical and Empirical Analysis + + +
+ Soft random sampling (SRS) is a simple yet effective approach for efficient +training of large-scale deep neural networks when dealing with massive data. +SRS selects a subset uniformly at random with replacement from the full data +set in each epoch. In this paper, we conduct a theoretical and empirical +analysis of SRS. First, we analyze its sampling dynamics including data +coverage and occupancy. Next, we investigate its convergence with non-convex +objective functions and give the convergence rate. Finally, we provide its +generalization performance. We empirically evaluate SRS for image recognition +on CIFAR10 and automatic speech recognition on Librispeech and an in-house +payload dataset to demonstrate its effectiveness. Compared to existing +coreset-based data selection methods, SRS offers a better accuracy-efficiency +trade-off. Especially on real-world industrial scale data sets, it is shown to +be a powerful training strategy with significant speedup and competitive +performance with almost no additional computing cost. + +
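+ The sampling step itself is simple; a minimal sketch of one epoch's index
+selection under a chosen selection ratio:
+
+import numpy as np
+
+def soft_random_sample(dataset_size, ratio=0.3, rng=None):
+    # Draw indices uniformly at random WITH replacement, so some samples may
+    # repeat within an epoch while others are skipped.
+    rng = rng if rng is not None else np.random.default_rng()
+    n_selected = int(ratio * dataset_size)
+    return rng.integers(0, dataset_size, size=n_selected)
+
+epoch_indices = soft_random_sample(50_000, ratio=0.3)  # indices for one epoch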
+
+
+
+
+ + ♻ ☆ Improving Out-of-Distribution Detection in Echocardiographic View + Classification through Enhancing Semantic Features + +
+ In echocardiographic view classification, accurately detecting
+out-of-distribution (OOD) data is essential but challenging, especially given
+the subtle differences between in-distribution and OOD data. While conventional
+OOD detection methods, such as the Mahalanobis distance (MD), are effective in
+far-OOD scenarios with clear distinctions between distributions, they struggle
+to discern the less obvious variations characteristic of echocardiographic
+data. In this study, we introduce a novel use of label smoothing to enhance
+semantic feature representation in echocardiographic images, demonstrating that
+these enriched semantic features are key for significantly improving near-OOD
+instance detection. By combining label smoothing with MD-based OOD detection,
+we establish a new benchmark for accuracy in echocardiographic OOD detection. + +
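+ A minimal sketch of the MD-based scoring half of such a pipeline, assuming
+class means and a shared inverse covariance have already been estimated from
+in-distribution features of the (label-smoothed) classifier:
+
+import numpy as np
+
+def mahalanobis_ood_score(feature, class_means, shared_cov_inv):
+    # Distance to the closest class-conditional Gaussian in feature space;
+    # larger scores are flagged as out-of-distribution after thresholding.
+    dists = [float((feature - mu) @ shared_cov_inv @ (feature - mu))
+             for mu in class_means]
+    return min(dists)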
+
+
+
+
+ + ♻ ☆ BrainWash: A Poisoning Attack to Forget in Continual Learning + + +
+ Continual learning has gained substantial attention within the deep learning +community, offering promising solutions to the challenging problem of +sequential learning. Yet, a largely unexplored facet of this paradigm is its +susceptibility to adversarial attacks, especially with the aim of inducing +forgetting. In this paper, we introduce "BrainWash," a novel data poisoning +method tailored to impose forgetting on a continual learner. By adding the +BrainWash noise to a variety of baselines, we demonstrate how a trained +continual learner can be induced to forget its previously learned tasks +catastrophically, even when using these continual learning baselines. An +important feature of our approach is that the attacker requires no access to +previous tasks' data and is armed merely with the model's current parameters +and the data belonging to the most recent task. Our extensive experiments +highlight the efficacy of BrainWash, showcasing degradation in performance +across various regularization-based continual learning methods. + +
+
+
+
+
+ + ♻ ☆ Proving Test Set Contamination in Black Box Language Models + + +
+ Large language models are trained on vast amounts of internet data, prompting +concerns and speculation that they have memorized public benchmarks. Going from +speculation to proof of contamination is challenging, as the pretraining data +used by proprietary models are often not publicly accessible. We show that it +is possible to provide provable guarantees of test set contamination in +language models without access to pretraining data or model weights. Our +approach leverages the fact that when there is no data contamination, all +orderings of an exchangeable benchmark should be equally likely. In contrast, +the tendency for language models to memorize example order means that a +contaminated language model will find certain canonical orderings to be much +more likely than others. Our test flags potential contamination whenever the +likelihood of a canonically ordered benchmark dataset is significantly higher +than the likelihood after shuffling the examples. We demonstrate that our +procedure is sensitive enough to reliably prove test set contamination in +challenging situations, including models as small as 1.4 billion parameters, on +small test sets of only 1000 examples, and datasets that appear only a few +times in the pretraining corpus. Using our test, we audit five popular publicly +accessible language models for test set contamination and find little evidence +for pervasive contamination. + +
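+ The flagging step can be sketched as a simple permutation test; here
+`log_likelihood` is a hypothetical callable returning the model's
+log-likelihood of the benchmark examples concatenated in a given order, and the
+paper's actual procedure is more refined than this sketch.
+
+import random
+
+def contamination_p_value(log_likelihood, examples, n_shuffles=100, seed=0):
+    rnd = random.Random(seed)
+    canonical = log_likelihood(examples)
+    count = 0
+    for _ in range(n_shuffles):
+        shuffled = list(examples)
+        rnd.shuffle(shuffled)
+        if log_likelihood(shuffled) >= canonical:
+            count += 1
+    # A small p-value means the canonical ordering is unusually likely, which
+    # is evidence that the model memorized the benchmark's example order.
+    return (1 + count) / (1 + n_shuffles)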
+
+
+
+
+ + ♻ ☆ Spatio-Temporal Graph Neural Networks for Predictive Learning in Urban + Computing: A Survey + + +
+ With recent advances in sensing technologies, a myriad of spatio-temporal +data has been generated and recorded in smart cities. Forecasting the evolution +patterns of spatio-temporal data is an important yet demanding aspect of urban +computing, which can enhance intelligent management decisions in various +fields, including transportation, environment, climate, public safety, +healthcare, and others. Traditional statistical and deep learning methods +struggle to capture complex correlations in urban spatio-temporal data. To this +end, Spatio-Temporal Graph Neural Networks (STGNN) have been proposed, +achieving great promise in recent years. STGNNs enable the extraction of +complex spatio-temporal dependencies by integrating graph neural networks +(GNNs) and various temporal learning methods. In this manuscript, we provide a +comprehensive survey on recent progress on STGNN technologies for predictive +learning in urban computing. Firstly, we provide a brief introduction to the +construction methods of spatio-temporal graph data and the prevalent +deep-learning architectures used in STGNNs. We then sort out the primary +application domains and specific predictive learning tasks based on existing +literature. Afterward, we scrutinize the design of STGNNs and their combination +with some advanced technologies in recent years. Finally, we conclude the +limitations of existing research and suggest potential directions for future +work. + +
+
+
+
+
+ + ♻ ☆ Training Multi-Layer Over-Parametrized Neural Network in Subquadratic + Time + + +
+ We consider the problem of training a multi-layer over-parametrized neural +network to minimize the empirical risk induced by a loss function. In the +typical setting of over-parametrization, the network width $m$ is much larger +than the data dimension $d$ and the number of training samples $n$ +($m=\mathrm{poly}(n,d)$), which induces a prohibitive large weight matrix $W\in +\mathbb{R}^{m\times m}$ per layer. Naively, one has to pay $O(m^2)$ time to +read the weight matrix and evaluate the neural network function in both forward +and backward computation. In this work, we show how to reduce the training cost +per iteration. Specifically, we propose a framework that uses $m^2$ cost only +in the initialization phase and achieves \emph{a truly subquadratic cost per +iteration} in terms of $m$, i.e., $m^{2-\Omega(1)}$ per iteration. Our result +has implications beyond standard over-parametrization theory, as it can be +viewed as designing an efficient data structure on top of a pre-trained large +model to further speed up the fine-tuning process, a core procedure to deploy +large language models (LLM). + +
+
+ comment: ITCS 2024 +
+
+
+
+
+ + ♻ ☆ Mitigating Over-Smoothing and Over-Squashing using Augmentations of + Forman-Ricci Curvature + + +
+ While Graph Neural Networks (GNNs) have been successfully leveraged for
+learning on graph-structured data across domains, several potential pitfalls
+have been described recently. Those include the inability to accurately
+leverage information encoded in long-range connections (over-squashing), as
+well as difficulties distinguishing the learned representations of nearby nodes
+with growing network depth (over-smoothing). An effective way to characterize
+both effects is discrete curvature: Long-range connections that underlie
+over-squashing effects have low curvature, whereas edges that contribute to
+over-smoothing have high curvature. This observation has given rise to rewiring
+techniques, which add or remove edges to mitigate over-smoothing and
+over-squashing. Several rewiring approaches utilizing graph characteristics,
+such as curvature or the spectrum of the graph Laplacian, have been proposed.
+However, existing methods, especially those based on curvature, often require
+expensive subroutines and careful hyperparameter tuning, which limits their
+applicability to large-scale graphs. Here we propose a rewiring technique based
+on Augmented Forman-Ricci curvature (AFRC), a scalable curvature notion, which
+can be computed in linear time. We prove that AFRC effectively characterizes
+over-smoothing and over-squashing effects in message-passing GNNs. We
+complement our theoretical results with experiments, which demonstrate that the
+proposed approach achieves state-of-the-art performance while significantly
+reducing the computational cost in comparison with other methods. Utilizing
+fundamental properties of discrete curvature, we propose effective heuristics
+for hyperparameters in curvature-based rewiring, which avoids expensive
+hyperparameter searches, further improving the scalability of the proposed
+approach. + +
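+ For intuition, one common triangle-augmented form of Forman-Ricci curvature
+for an unweighted edge can be computed from local neighborhoods; the sketch
+below uses networkx, and the exact augmentation and curvature thresholds used
+in the paper may differ from this form.
+
+import networkx as nx
+
+def afrc_edge(G, u, v):
+    # One common augmented Forman-Ricci form for an unweighted edge:
+    #   4 - deg(u) - deg(v) + 3 * (#triangles containing the edge).
+    # Strongly negative values suggest over-squashing bottlenecks, strongly
+    # positive values suggest edges that promote over-smoothing.
+    triangles = len(set(G.neighbors(u)) & set(G.neighbors(v)))
+    return 4 - G.degree(u) - G.degree(v) + 3 * triangles
+
+G = nx.karate_club_graph()
+curvatures = {edge: afrc_edge(G, *edge) for edge in G.edges()}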
+
+
+
+
+ + ♻ ☆ Learning Unsupervised World Models for Autonomous Driving via Discrete + Diffusion + + +
+ Learning world models can teach an agent how the world works in an +unsupervised manner. Even though it can be viewed as a special case of sequence +modeling, progress for scaling world models on robotic applications such as +autonomous driving has been somewhat less rapid than scaling language models +with Generative Pre-trained Transformers (GPT). We identify two reasons as +major bottlenecks: dealing with complex and unstructured observation space, and +having a scalable generative model. Consequently, we propose a novel world +modeling approach that first tokenizes sensor observations with VQVAE, then +predicts the future via discrete diffusion. To efficiently decode and denoise +tokens in parallel, we recast Masked Generative Image Transformer into the +discrete diffusion framework with a few simple changes, resulting in notable +improvement. When applied to learning world models on point cloud observations, +our model reduces prior SOTA Chamfer distance by more than 65% for 1s +prediction, and more than 50% for 3s prediction, across NuScenes, KITTI +Odometry, and Argoverse2 datasets. Our results demonstrate that discrete +diffusion on tokenized agent experience can unlock the power of GPT-like +unsupervised learning for robotic agents. + +
+
+
+
+
+
+
+
+ + Multimedia 3 + +
+
+
+ + ♻ ☆ GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio + Pretraining for Accurate Speech Emotion Recognition + + +
+ Contrastive cross-modality pretraining has recently exhibited impressive +success in diverse fields, whereas there is limited research on their merits in +speech emotion recognition (SER). In this paper, we propose GEmo-CLAP, a kind +of gender-attribute-enhanced contrastive language-audio pretraining (CLAP) +method for SER. Specifically, we first construct an effective emotion CLAP +(Emo-CLAP) for SER, using pre-trained text and audio encoders. Second, given +the significance of gender information in SER, two novel multi-task learning +based GEmo-CLAP (ML-GEmo-CLAP) and soft label based GEmo-CLAP (SL-GEmo-CLAP) +models are further proposed to incorporate gender information of speech +signals, forming more reasonable objectives. Experiments on IEMOCAP indicate +that our proposed two GEmo-CLAPs consistently outperform Emo-CLAP with +different pre-trained models. Remarkably, the proposed WavLM-based SL-GEmo-CLAP +obtains the best UAR of 81.43\% and WAR of 83.16\%, which performs better than +state-of-the-art SER methods. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ CASR: Refining Action Segmentation via Marginalizing Frame-level Causal + Relationships + +
+ Integrating deep learning and causal discovery has increased the
+interpretability of Temporal Action Segmentation (TAS) tasks. However,
+frame-level causal relationships contain many complicated sources of noise
+beyond the segment level, making it infeasible to directly express macro action
+semantics. Thus, we propose the Causal Abstraction Segmentation Refiner (CASR),
+which can refine TAS results from various models by enhancing video causality
+through marginalizing frame-level causal relationships. Specifically, we define
+equivalent frame-level and segment-level causal models, so that the causal
+adjacency matrix constructed from marginalized frame-level causal relationships
+is able to represent the segment-level causal relationships. CASR works by
+reducing the difference between the causal adjacency matrix we construct and
+the pre-segmentation results of backbone models. In addition, we propose a
+novel evaluation metric, Causal Edit Distance (CED), to evaluate causal
+interpretability. Extensive experimental results on mainstream datasets
+indicate that CASR significantly surpasses various existing methods in action
+segmentation performance, as well as in causal explainability and
+generalization. + +
+
+
+
+
+ + ♻ ☆ Self-Supervised Visual Acoustic Matching NeurIPS 2023 + + +
+ Acoustic matching aims to re-synthesize an audio clip to sound as if it were +recorded in a target acoustic environment. Existing methods assume access to +paired training data, where the audio is observed in both source and target +environments, but this limits the diversity of training data or requires the +use of simulated data or heuristics to create paired samples. We propose a +self-supervised approach to visual acoustic matching where training samples +include only the target scene image and audio -- without acoustically +mismatched source audio for reference. Our approach jointly learns to +disentangle room acoustics and re-synthesize audio into the target environment, +via a conditional GAN framework and a novel metric that quantifies the level of +residual acoustic information in the de-biased audio. Training with either +in-the-wild web data or simulated data, we demonstrate it outperforms the +state-of-the-art on multiple challenging datasets and a wide variety of +real-world audio and environments. + +
+
+ comment: Project page: https://vision.cs.utexas.edu/projects/ss_vam/ . + Accepted at NeurIPS 2023 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 36 + +
+
+
+ + ☆ Annotation Sensitivity: Training Data Collection Methods Affect Model + Performance EMNLP 2023 + + +
+ When training data are collected from human annotators, the design of the +annotation instrument, the instructions given to annotators, the +characteristics of the annotators, and their interactions can impact training +data. This study demonstrates that design choices made when creating an +annotation instrument also impact the models trained on the resulting +annotations. + We introduce the term annotation sensitivity to refer to the impact of +annotation data collection methods on the annotations themselves and on +downstream model performance and predictions. + We collect annotations of hate speech and offensive language in five +experimental conditions of an annotation instrument, randomly assigning +annotators to conditions. We then fine-tune BERT models on each of the five +resulting datasets and evaluate model performance on a holdout portion of each +condition. We find considerable differences between the conditions for 1) the +share of hate speech/offensive language annotations, 2) model performance, 3) +model predictions, and 4) model learning curves. + Our results emphasize the crucial role played by the annotation instrument +which has received little attention in the machine learning literature. We call +for additional research into how and why the instrument impacts the annotations +to inform the development of best practices in instrument design. + +
+
+ comment: EMNLP 2023 Findings +
+
+
+
+
+ + ☆ A Systematic Review of Deep Learning-based Research on Radiology Report + Generation + + +
+ Radiology report generation (RRG) aims to automatically generate free-text
+descriptions from clinical radiographs, e.g., chest X-ray images. RRG plays an
+essential role in promoting clinical automation, providing practical assistance
+to inexperienced doctors and alleviating radiologists' workloads. Given this
+meaningful potential, research on RRG has experienced explosive growth over the
+past half-decade, especially with the rapid development of deep learning
+approaches. Existing studies perform RRG from the perspective of enhancing
+different modalities, provide insights on optimizing the report generation
+process with elaborated features from both visual and textual information, and
+further facilitate RRG with the cross-modal interactions among them. In this
+paper, we present a comprehensive review of deep learning-based RRG from
+various perspectives. Specifically, we first cover pivotal RRG approaches based
+on the task-specific features of radiographs, reports, and the cross-modal
+relations between them, then illustrate the benchmark datasets conventionally
+used for this task together with their evaluation metrics, subsequently analyze
+the performance of different approaches, and finally offer our summary of the
+challenges and trends in future directions. Overall, the goal of this paper is
+to serve as a tool for understanding the existing literature and inspiring
+potentially valuable research in the field of RRG. + +
+
+ comment: 26 pages, 6 figures +
+
+
+
+
+ + ☆ Evaluating GPT-4's Vision Capabilities on Brazilian University Admission + Exams + + +
+ Recent advancements in language models have showcased human-comparable +performance in academic entrance exams. However, existing studies often +overlook questions that require the integration of visual comprehension, thus +compromising the full spectrum and complexity inherent in real-world scenarios. +To address this gap, we present a comprehensive framework to evaluate language +models on entrance exams, which incorporates both textual and visual elements. +We evaluate the two most recent editions of Exame Nacional do Ensino M\'edio +(ENEM), the main standardized entrance examination adopted by Brazilian +universities. Our study not only reaffirms the capabilities of GPT-4 as the +state of the art for handling complex multidisciplinary questions, but also +pioneers in offering a realistic assessment of multimodal language models on +Portuguese examinations. One of the highlights is that text captions +transcribing visual content outperform the direct use of images, suggesting +that the vision model has room for improvement. Yet, despite improvements +afforded by images or captions, mathematical questions remain a challenge for +these state-of-the-art models. The code and data used on experiments are +available at https://github.com/piresramon/gpt-4-enem. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.17003 +
+
+
+
+
+ + ☆ Towards Auditing Large Language Models: Improving Text-based Stereotype + Detection NeurIPS + + +
+ Large Language Models (LLMs) have made significant advances in the recent
+past, becoming more mainstream in Artificial Intelligence (AI) enabled
+human-facing applications. However, LLMs often generate stereotypical output
+inherited from historical data, amplifying societal biases and raising ethical
+concerns. This work introduces i) the Multi-Grain Stereotype Dataset, which
+includes 52,751 instances of gender, race, profession and religion stereotypic
+text, and ii) a novel stereotype classifier for English text. We design several
+experiments to rigorously test the proposed model trained on the novel dataset.
+Our experiments show that training the model in a multi-class setting can
+outperform the one-vs-all binary counterpart. Consistent feature importance
+signals from different eXplainable AI tools demonstrate that the new model
+exploits relevant text features. We utilise the newly created model to assess
+the stereotypic behaviour of the popular GPT family of models and observe a
+reduction of bias over time. In summary, our work establishes a robust and
+practical framework for auditing and evaluating stereotypic bias in LLMs. + +
+
+ comment: 2023 NeurIPS SoLaR Workshop Accepted +
+
+
+
+
+ + ☆ A density estimation perspective on learning from pairwise human + preferences + + +
+ Learning from human feedback (LHF) -- and in particular learning from +pairwise preferences -- has recently become a crucial ingredient in training +large language models (LLMs), and has been the subject of much research. Most +recent works frame it as a reinforcement learning problem, where a reward +function is learned from pairwise preference data and the LLM is treated as a +policy which is adapted to maximize the rewards, often under additional +regularization constraints. We propose an alternative interpretation which +centers on the generative process for pairwise preferences and treats LHF as a +density estimation problem. We provide theoretical and empirical results +showing that for a family of generative processes defined via preference +behavior distribution equations, training a reward function on pairwise +preferences effectively models an annotator's implicit preference distribution. +Finally, we discuss and present findings on "annotator misspecification" -- +failure cases where wrong modeling assumptions are made about annotator +behavior, resulting in poorly-adapted models -- suggesting that approaches that +learn from pairwise human preferences could have trouble learning from a +population of annotators with diverse viewpoints. + +
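+ The reward-fitting step discussed here is typically the Bradley-Terry
+objective; under the density-estimation view, minimizing it models the
+annotator's implicit preference distribution. A minimal PyTorch sketch of that
+objective (the scores below are toy inputs, not data from the paper):
+
+import torch
+import torch.nn.functional as F
+
+def pairwise_preference_loss(r_chosen, r_rejected):
+    # Bradley-Terry negative log-likelihood: push the reward of the preferred
+    # response above that of the rejected one.
+    return -F.logsigmoid(r_chosen - r_rejected).mean()
+
+loss = pairwise_preference_loss(torch.randn(8), torch.randn(8))  # toy scores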
+
+
+
+
+ + ☆ Auditing and Mitigating Cultural Bias in LLMs + + +
+ Culture fundamentally shapes people's reasoning, behavior, and communication.
+Generative artificial intelligence (AI) technologies may cause a shift towards
+a dominant culture. As people increasingly use AI to expedite and even automate
+various professional and personal tasks, cultural values embedded in AI models
+may bias authentic expression. We audit large language models for cultural
+bias, comparing their responses to nationally representative survey data, and
+evaluate country-specific prompting as a mitigation strategy. We find that
+GPT-4, 3.5 and 3 exhibit cultural values resembling those of English-speaking
+and Protestant European countries. Our mitigation strategy reduces cultural
+bias in recent models but not for all countries/territories. To avoid cultural
+bias in generative AI, especially in high-stakes contexts, we suggest using
+culture matching and ongoing cultural audits. + +
+
+
+
+
+ + ☆ Question Answering in Natural Language: the Special Case of Temporal + Expressions + + +
+ Although general question answering has been well explored in recent years,
+temporal question answering is a task which has not received as much focus. Our
+work aims to leverage a popular approach used for general question answering,
+answer extraction, in order to find answers to temporal questions within a
+paragraph. To train our model, we propose a new dataset, inspired by SQuAD,
+specifically tailored to provide rich temporal information. We chose to adapt
+the corpus WikiWars, which contains several documents on history's greatest
+conflicts. Our evaluation shows that a deep learning model trained to perform
+pattern matching, often used in general question answering, can be adapted to
+temporal question answering, provided we restrict ourselves to questions whose
+answers are directly present within a text. + +
+
+ comment: Accepted at Student Research Workshop associated with RANLP-2021 +
+
+
+
+
+ + ☆ Searching for Snippets of Open-Domain Dialogue in Task-Oriented Dialogue + Datasets + + +
+ Most existing dialogue corpora and models have been designed to fit into two
+predominant categories: task-oriented dialogues portray functional goals, such
+as making a restaurant reservation or booking a plane ticket, while
+chit-chat/open-domain dialogues focus on holding a socially engaging talk with
+a user. However, humans tend to seamlessly switch between modes and even use
+chitchat to enhance task-oriented conversations. To bridge this gap, new
+datasets have recently been created, blending both communication modes into
+conversation examples. The approaches used tend to rely on adding chit-chat
+snippets to pre-existing, human-generated task-oriented datasets. Given the
+tendencies observed in humans, we wonder, however, whether the latter do not
+\textit{already} hold chit-chat sequences. By using topic modeling and
+searching for topics which are most similar to a set of keywords related to
+social talk, we explore the training sets of Schema-Guided Dialogues and
+MultiWOZ. Our study shows that sequences related to social talk are indeed
+naturally present, motivating further research on ways chitchat is combined
+into task-oriented dialogues. + +
+
+
+
+
+ + ☆ Enhancing Task-Oriented Dialogues with Chitchat: a Comparative Study + Based on Lexical Diversity and Divergence + + +
+ As a recent development, task-oriented dialogues (TODs) have been enriched +with chitchat in an effort to make dialogues more diverse and engaging. This +enhancement is particularly valuable as TODs are often confined to narrow +domains, making the mitigation of repetitive and predictable responses a +significant challenge. This paper presents a comparative analysis of three +chitchat enhancements, aiming to identify the most effective approach in terms +of diversity. Additionally, we quantify the divergence between the added +chitchat, the original task-oriented language, and chitchat typically found in +chitchat datasets, highlighting the top 20 divergent keywords for each +comparison. Our findings drive a discussion on future enhancements for +augmenting TODs, emphasizing the importance of grounding dialogues beyond the +task to achieve more diverse and natural exchanges. + +
+
+ comment: Accepted at ASRU 2023 +
+
+
+
+
+ + ☆ Do VSR Models Generalize Beyond LRS3? + + +
+ The Lip Reading Sentences-3 (LRS3) benchmark has primarily been the focus of
+intense research in visual speech recognition (VSR) during the last few years.
+As a result, there is an increased risk of overfitting to its excessively used
+test set, which is only one hour in duration. To alleviate this issue, we build
+a new VSR test set named WildVSR, closely following the LRS3 dataset creation
+process. We then evaluate and analyse the extent to which current VSR models
+generalize to the new test data. We evaluate a broad range of publicly
+available VSR models and find significant drops in performance on our test set,
+compared to their corresponding LRS3 results. Our results suggest that the
+increase in word error rates is caused by the models' inability to generalize
+to lip sequences that are slightly harder and more in-the-wild than those found
+in the LRS3 test set. Our new test benchmark is made public in order to enable
+future research towards more robust VSR models. + +
+
+
+
+
+ + ☆ Jam-ALT: A Formatting-Aware Lyrics Transcription Benchmark + + +
+ Current automatic lyrics transcription (ALT) benchmarks focus exclusively on +word content and ignore the finer nuances of written lyrics including +formatting and punctuation, which leads to a potential misalignment with the +creative products of musicians and songwriters as well as listeners' +experiences. For example, line breaks are important in conveying information +about rhythm, emotional emphasis, rhyme, and high-level structure. To address +this issue, we introduce Jam-ALT, a new lyrics transcription benchmark based on +the JamendoLyrics dataset. Our contribution is twofold. Firstly, a complete +revision of the transcripts, geared specifically towards ALT evaluation by +following a newly created annotation guide that unifies the music industry's +guidelines, covering aspects such as punctuation, line breaks, spelling, +background vocals, and non-word sounds. Secondly, a suite of evaluation metrics +designed, unlike the traditional word error rate, to capture such phenomena. We +hope that the proposed benchmark contributes to the ALT task, enabling more +precise and reliable assessments of transcription systems and enhancing the +user experience in lyrics applications such as subtitle renderings for live +captioning or karaoke. + +
+
+ comment: 6 pages (3 pages main content); website: + https://audioshake.github.io/jam-alt/; data: + https://huggingface.co/datasets/audioshake/jam-alt; code: + https://github.com/audioshake/alt-eval/ +
+
+
+
+
+ + ☆ Probabilistic Tree-of-thought Reasoning for Answering + Knowledge-intensive Complex Questions EMNLP 2023 + + +
+ Large language models (LLMs) are capable of answering knowledge-intensive +complex questions with chain-of-thought (CoT) reasoning. However, they tend to +generate factually incorrect reasoning steps when the required knowledge is not +available or up-to-date in models' parameters. Recent works turn to retrieving +external knowledge to augment CoT reasoning. Despite being promising, these +chain-based methods suffer from: 1) Negative retrieval. Unnecessary or +incorrect retrieval may mislead the reasoning; 2) Limited sight. Lacking the +ability to look backward or forward, a local error in one step will propagate +along the chain. + In this paper, we propose a novel approach: Probabilistic Tree-of-thought +Reasoning (ProbTree). First, LLMs translate a complex question into a query +tree, in which each non-root node denotes a sub-question of its parent node. +Then, probabilistic reasoning is conducted over the tree, by solving questions +from leaf to root considering the confidence of both question decomposing and +answering. During reasoning, for leaf nodes, LLMs choose a more confident +answer from Closed-book QA that employs parametric knowledge and Open-book QA +that employs retrieved external knowledge, thus eliminating the negative +retrieval problem. For non-leaf nodes, with the hierarchical structure, LLMs +have broader sights and are able to globally reason with the information from +child nodes, thus recovering from local errors. The experiments on three +Complex QA datasets under the open-domain setting show that our approach +outperforms SOTA methods significantly, demonstrating the effect of +probabilistic tree-of-thought reasoning. + +
+
+ comment: Accepted by EMNLP 2023 +
+
+
+
+
+ + ☆ Efficient Trigger Word Insertion + + +
+ With the boom in the natural language processing (NLP) field these years, +backdoor attacks pose immense threats against deep neural network models. +However, previous works hardly consider the effect of the poisoning rate. In +this paper, our main objective is to reduce the number of poisoned samples +while still achieving a satisfactory Attack Success Rate (ASR) in text backdoor +attacks. To accomplish this, we propose an efficient trigger word insertion +strategy in terms of trigger word optimization and poisoned sample selection. +Extensive experiments on different datasets and models demonstrate that our +proposed method can significantly improve attack effectiveness in text +classification tasks. Remarkably, our approach achieves an ASR of over 90% with +only 10 poisoned samples in the dirty-label setting and requires merely 1.5% of +the training data in the clean-label setting. + +
+
+
+
+
+ + ☆ MLLM-Bench, Evaluating Multi-modal LLMs using GPT-4V + + +
+ In the pursuit of Artificial General Intelligence (AGI), the integration of
+vision in language models has marked a significant milestone. The advent of
+multi-modal large language models (MLLMs) like GPT-4V has expanded AI
+applications, aligning with the multi-modal capabilities of the human brain.
+However, evaluating the efficacy of MLLMs poses a substantial challenge due to
+the subjective nature of tasks that lack definitive answers. Existing automatic
+evaluation methodologies for multi-modal large language models rely on
+objective queries that have standard answers, inadequately addressing the
+nuances of creative and associative multi-modal tasks. To address this, we
+introduce MLLM-Bench, an innovative benchmark inspired by Vicuna, spanning a
+diverse array of scenarios, including Perception, Understanding, Applying,
+Analyzing, Evaluating, and Creation, along with ethical considerations.
+MLLM-Bench is designed to reflect user experience more accurately and provide a
+more holistic assessment of model performance. Comparative evaluations indicate
+a significant performance gap between existing open-source models and GPT-4V.
+We posit that MLLM-Bench will catalyze progress in the open-source community
+towards developing user-centric vision-language models that meet a broad
+spectrum of real-world applications. See the online leaderboard at
+\url{https://mllm-bench.llmzoo.com}. + +
+
+
+
+
+ + ☆ Exploring Methods for Cross-lingual Text Style Transfer: The Case of + Text Detoxification AACL 2023 + + +
+ Text detoxification is the task of transferring the style of text from toxic
+to neutral. While there are approaches yielding promising results in the
+monolingual setup, e.g., (Dale et al., 2021; Hallinan et al., 2022),
+cross-lingual transfer for this task remains a challenging open problem
+(Moskovskiy et al., 2022). In this work, we present a large-scale study of
+strategies for cross-lingual text detoxification: given a parallel
+detoxification corpus for one language, the goal is to transfer detoxification
+ability to another language for which we do not have such a corpus. Moreover,
+we are the first to explore a new task where text translation and
+detoxification are performed simultaneously, providing several strong baselines
+for this task. Finally, we introduce new automatic detoxification evaluation
+metrics with higher correlations with human judgments than previous benchmarks.
+We also assess the most promising approaches with manual markup, determining
+the best strategy for transferring knowledge of text detoxification between
+languages. + +
+
+ comment: AACL 2023, main conference, long paper +
+
+
+
+
+ + ☆ Some Like It Small: Czech Semantic Embedding Models for Industry + Applications + + +
+ This article focuses on the development and evaluation of Small-sized Czech +sentence embedding models. Small models are important components for real-time +industry applications in resource-constrained environments. Given the limited +availability of labeled Czech data, alternative approaches, including +pre-training, knowledge distillation, and unsupervised contrastive fine-tuning, +are investigated. Comprehensive intrinsic and extrinsic analyses are conducted, +showcasing the competitive performance of our models compared to significantly +larger counterparts, with approximately 8 times smaller size and 5 times faster +speed than conventional Base-sized models. To promote cooperation and +reproducibility, both the models and the evaluation pipeline are made publicly +accessible. Ultimately, this article presents practical applications of the +developed sentence embedding models in Seznam.cz, the Czech search engine. +These models have effectively replaced previous counterparts, enhancing the +overall search experience for instance, in organic search, featured snippets, +and image search. This transition has yielded improved performance. + +
+
+ comment: Accepted at the Thirty-Sixth Annual Conference on Innovative + Applications of Artificial Intelligence (IAAI-24). IAAI Innovative + Application Award. 9 pages +
+
+
+
+
+ + ☆ Dialogue Quality and Emotion Annotations for Customer Support + Conversations EMNLP + + +
+ Task-oriented conversational datasets often lack topic variability and +linguistic diversity. However, with the advent of Large Language Models (LLMs) +pretrained on extensive, multilingual and diverse text data, these limitations +seem overcome. Nevertheless, their generalisability to different languages and +domains in dialogue applications remains uncertain without benchmarking +datasets. This paper presents a holistic annotation approach for emotion and +conversational quality in the context of bilingual customer support +conversations. By performing annotations that take into consideration the +complete instances that compose a conversation, one can form a broader +perspective of the dialogue as a whole. Furthermore, it provides a unique and +valuable resource for the development of text classification models. To this +end, we present benchmarks for Emotion Recognition and Dialogue Quality +Estimation and show that further research is needed to leverage these models in +a production setting. + +
+
+ comment: Accepted at GEM (EMNLP Workshop) +
+
+
+
+
+ + ☆ General Phrase Debiaser: Debiasing Masked Language Models at a + Multi-Token Level + + +
+ The social biases and unwelcome stereotypes revealed by pretrained language +models are becoming obstacles to their application. Compared to numerous +debiasing methods targeting word level, there has been relatively less +attention on biases present at phrase level, limiting the performance of +debiasing in discipline domains. In this paper, we propose an automatic +multi-token debiasing pipeline called \textbf{General Phrase Debiaser}, which +is capable of mitigating phrase-level biases in masked language models. +Specifically, our method consists of a \textit{phrase filter stage} that +generates stereotypical phrases from Wikipedia pages as well as a \textit{model +debias stage} that can debias models at the multi-token level to tackle bias +challenges on phrases. The latter searches for prompts that trigger model's +bias, and then uses them for debiasing. State-of-the-art results on standard +datasets and metrics show that our approach can significantly reduce gender +biases on both career and multiple disciplines, across models with varying +parameter sizes. + +
+
+
+
+
+ + ☆ Minimizing Factual Inconsistency and Hallucination in Large Language + Models + + +
+ Large Language Models (LLMs) are widely used in critical fields such as +healthcare, education, and finance due to their remarkable proficiency in +various language-related tasks. However, LLMs are prone to generating factually +incorrect responses or "hallucinations," which can lead to a loss of +credibility and trust among users. To address this issue, we propose a +multi-stage framework that generates the rationale first, verifies and refines +incorrect ones, and uses them as supporting references to generate the answer. +The generated rationale enhances the transparency of the answer and our +framework provides insights into how the model arrived at this answer, by using +this rationale and the references to the context. In this paper, we demonstrate +its effectiveness in improving the quality of responses to drug-related +inquiries in the life sciences industry. Our framework improves traditional +Retrieval Augmented Generation (RAG) by enabling OpenAI GPT-3.5-turbo to be +14-25% more faithful and 16-22% more accurate on two datasets. Furthermore, +fine-tuning samples based on our framework improves the accuracy of smaller +open-access LLMs by 33-42% and competes with RAG on commercial models. + +
+
+
+
+
+ + ☆ Challenges of Large Language Models for Mental Health Counseling + + +
+ The global mental health crisis is looming with a rapid increase in mental +disorders, limited resources, and the social stigma of seeking treatment. As +the field of artificial intelligence (AI) has witnessed significant +advancements in recent years, large language models (LLMs) capable of +understanding and generating human-like text may be used in supporting or +providing psychological counseling. However, the application of LLMs in the +mental health domain raises concerns regarding the accuracy, effectiveness, and +reliability of the information provided. This paper investigates the major +challenges associated with the development of LLMs for psychological +counseling, including model hallucination, interpretability, bias, privacy, and +clinical effectiveness. We explore potential solutions to these challenges that +are practical and applicable to the current paradigm of AI. From our experience +in developing and deploying LLMs for mental health, AI holds a great promise +for improving mental health care, if we can carefully navigate and overcome +pitfalls of LLMs. + +
+
+
+
+
+ + ☆ Grammatical Error Correction via Mixed-Grained Weighted Training EMNLP2023 + + +
+ The task of Grammatical Error Correction (GEC) aims to automatically correct +grammatical errors in natural texts. Almost all previous works treat annotated +training data equally, but inherent discrepancies in data are neglected. In +this paper, the inherent discrepancies are manifested in two aspects, namely, +accuracy of data annotation and diversity of potential annotations. To this +end, we propose MainGEC, which designs token-level and sentence-level training +weights based on inherent discrepancies in accuracy and potential diversity of +data annotation, respectively, and then conducts mixed-grained weighted +training to improve the training effect for GEC. Empirical evaluation shows +that whether in the Seq2Seq or Seq2Edit manner, MainGEC achieves consistent and +significant performance improvements on two benchmark datasets, demonstrating +the effectiveness and superiority of the mixed-grained weighted training. +Further ablation experiments verify the effectiveness of designed weights of +both granularities in MainGEC. + +
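+ A minimal sketch of how token-level and sentence-level weights can be combined
+in the training loss; how the weights themselves are derived from estimated
+annotation accuracy and diversity (the core of MainGEC) is not reproduced here.
+
+import torch
+import torch.nn.functional as F
+
+def mixed_grained_weighted_loss(logits, targets, token_weights, sent_weights,
+                                pad_id=0):
+    # logits: (B, T, V); targets, token_weights: (B, T); sent_weights: (B,)
+    ce = F.cross_entropy(logits.transpose(1, 2), targets,
+                         ignore_index=pad_id, reduction="none")   # (B, T)
+    mask = (targets != pad_id).float()
+    weighted = ce * token_weights * sent_weights.unsqueeze(1) * mask
+    return weighted.sum() / mask.sum().clamp(min=1.0)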
+
+ comment: EMNLP2023 Findings +
+
+
+
+
+ + ☆ Lego: Learning to Disentangle and Invert Concepts Beyond Object + Appearance in Text-to-Image Diffusion Models + + +
+ Diffusion models have revolutionized generative content creation and +text-to-image (T2I) diffusion models in particular have increased the creative +freedom of users by allowing scene synthesis using natural language. T2I models +excel at synthesizing concepts such as nouns, appearances, and styles. To +enable customized content creation based on a few example images of a concept, +methods such as Textual Inversion and DreamBooth invert the desired concept and +enable synthesizing it in new scenes. However, inverting more general concepts +that go beyond object appearance and style (adjectives and verbs) through +natural language, remains a challenge. Two key characteristics of these +concepts contribute to the limitations of current inversion methods. 1) +Adjectives and verbs are entangled with nouns (subject) and can hinder +appearance-based inversion methods, where the subject appearance leaks into the +concept embedding and 2) describing such concepts often extends beyond single +word embeddings (being frozen in ice, walking on a tightrope, etc.) that +current methods do not handle. + In this study, we introduce Lego, a textual inversion method designed to +invert subject entangled concepts from a few example images. Lego disentangles +concepts from their associated subjects using a simple yet effective Subject +Separation step and employs a Context Loss that guides the inversion of +single/multi-embedding concepts. In a thorough user study, Lego-generated +concepts were preferred over 70% of the time when compared to the baseline. +Additionally, visual question answering using a large language model suggested +Lego-generated concepts are better aligned with the text description of the +concept. + +
+
+
+
+
+ + ☆ AdaTyper: Adaptive Semantic Column Type Detection VLDB'24 + + +
+ Understanding the semantics of relational tables is instrumental for +automation in data exploration and preparation systems. A key source for +understanding a table is the semantics of its columns. With the rise of deep +learning, learned table representations are now available, which can be applied +for semantic type detection and achieve good performance on benchmarks. +Nevertheless, we observe a gap between this performance and its applicability +in practice. In this paper, we propose AdaTyper to address one of the most +critical deployment challenges: adaptation. AdaTyper uses weak-supervision to +adapt a hybrid type predictor towards new semantic types and shifted data +distributions at inference time, using minimal human feedback. The hybrid type +predictor of AdaTyper combines rule-based methods and a light machine learning +model for semantic column type detection. We evaluate the adaptation +performance of AdaTyper on real-world database tables hand-annotated with +semantic column types through crowdsourcing and find that the f1-score improves +for new and existing types. AdaTyper approaches an average precision of 0.6 +after only seeing 5 examples, significantly outperforming existing adaptation +methods based on human-provided regular expressions or dictionaries. + +
+
+ comment: Submitted to VLDB'24 +
+
+
+
+
+ + ☆ DaG LLM ver 1.0: Pioneering Instruction-Tuned Language Modeling for + Korean NLP + + +
+ This paper presents the DaG LLM (David and Goliath Large Language Model), a +language model specialized for Korean and fine-tuned through Instruction Tuning +across 41 tasks within 13 distinct categories. + +
+
+
+
+
+ + ☆ Transformer-based Named Entity Recognition in Construction Supply Chain + Risk Management in Australia + + +
+ The construction industry in Australia is characterized by its intricate +supply chains and vulnerability to myriad risks. As such, effective supply +chain risk management (SCRM) becomes imperative. This paper employs different +transformer models, and train for Named Entity Recognition (NER) in the context +of Australian construction SCRM. Utilizing NER, transformer models identify and +classify specific risk-associated entities in news articles, offering a +detailed insight into supply chain vulnerabilities. By analysing news articles +through different transformer models, we can extract relevant entities and +insights related to specific risk taxonomies local (milieu) to the Australian +construction landscape. This research emphasises the potential of NLP-driven +solutions, like transformer models, in revolutionising SCRM for construction in +geo-media specific contexts. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be acceptable +
+
+
+
+
+ + ♻ ☆ A Comprehensive Overview of Large Language Models + + +
+ Large Language Models (LLMs) have recently demonstrated remarkable +capabilities in natural language processing tasks and beyond. This success of +LLMs has led to a large influx of research contributions in this direction. +These works encompass diverse topics such as architectural innovations, better +training strategies, context length improvements, fine-tuning, multi-modal +LLMs, robotics, datasets, benchmarking, efficiency, and more. With the rapid +development of techniques and regular breakthroughs in LLM research, it has +become considerably challenging to perceive the bigger picture of the advances +in this direction. Considering the rapidly emerging plethora of literature on +LLMs, it is imperative that the research community is able to benefit from a +concise yet comprehensive overview of the recent developments in this field. +This article provides an overview of the existing literature on a broad range +of LLM-related concepts. Our self-contained comprehensive overview of LLMs +discusses relevant background concepts along with covering the advanced topics +at the frontier of research in LLMs. This review article is intended to not +only provide a systematic survey but also a quick comprehensive reference for +the researchers and practitioners to draw insights from extensive informative +summaries of the existing works to advance the LLM research. + +
+
+ comment: Work in-progress +
+
+
+
+
+ + ♻ ☆ Clembench: Using Game Play to Evaluate Chat-Optimized Language Models as + Conversational Agents EMNLP 2023 + + +
+ Recent work has proposed a methodology for the systematic evaluation of
+"Situated Language Understanding Agents", i.e., agents that operate in rich
+linguistic and non-linguistic contexts, through testing them in carefully
+constructed interactive settings. Other recent work has argued that Large
+Language Models (LLMs), if suitably set up, can be understood as (simulators
+of) such agents. A connection suggests itself, which this paper explores: Can
+LLMs be evaluated meaningfully by exposing them to constrained game-like
+settings that are built to challenge specific capabilities? As a proof of
+concept, this paper investigates five interaction settings, showing that
+current chat-optimised LLMs are, to an extent, capable of following game-play
+instructions. Both this capability and the quality of the game play, measured
+by how well the objectives of the different games are met, follow the
+development cycle, with newer models performing better. The metrics, even for
+the comparatively simple example games, are far from being saturated,
+suggesting that the proposed instrument will retain its diagnostic value. Our
+general framework for implementing and evaluating games with LLMs is available
+at https://github.com/clembench . + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ LLM aided semi-supervision for Extractive Dialog Summarization EMNLP + + +
+ Generating high-quality summaries for chat dialogs often requires large
+labeled datasets. We propose a method to efficiently use unlabeled data for
+extractive summarization of customer-agent dialogs. In our method, we frame
+summarization as a question-answering problem and use state-of-the-art large
+language models (LLMs) to generate pseudo-labels for a dialog. We then use
+these pseudo-labels to fine-tune a chat summarization model, effectively
+transferring knowledge from the large LLM into a smaller specialized model. We
+demonstrate our method on the TweetSumm dataset, and show that using 10% of
+the original labelled data set we can achieve 65.9/57.0/61.0 ROUGE-1/-2/-L,
+whereas the current state-of-the-art trained on the entire training data set
+obtains 65.16/55.81/64.37 ROUGE-1/-2/-L. In other words, in the worst case
+(i.e., ROUGE-L) we still effectively retain 94.7% of the performance while
+using only 10% of the data. + +
+
+ comment: to be published in EMNLP Findings +
+
+
+
+
+ + ♻ ☆ Rethinking Attention: Exploring Shallow Feed-Forward Neural Networks as + an Alternative to Attention Layers in Transformers AAAI24 + + +
+ This work presents an analysis of the effectiveness of using standard shallow +feed-forward networks to mimic the behavior of the attention mechanism in the +original Transformer model, a state-of-the-art architecture for +sequence-to-sequence tasks. We substitute key elements of the attention +mechanism in the Transformer with simple feed-forward networks, trained using +the original components via knowledge distillation. Our experiments, conducted +on the IWSLT2017 dataset, reveal the capacity of these "attentionless +Transformers" to rival the performance of the original architecture. Through +rigorous ablation studies, and experimenting with various replacement network +types and sizes, we offer insights that support the viability of our approach. +This not only sheds light on the adaptability of shallow feed-forward networks +in emulating attention mechanisms but also underscores their potential to +streamline complex architectures for sequence-to-sequence tasks. + +
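+ A minimal sketch of the kind of substitution studied: a shallow feed-forward
+block acting on the flattened, fixed-length sequence in place of a
+self-attention sub-layer, which would then be trained to mimic the attention
+output via knowledge distillation. This is an illustrative stand-in, not the
+exact replacement architectures evaluated in the paper.
+
+import torch
+import torch.nn as nn
+
+class AttentionlessMixer(nn.Module):
+    def __init__(self, seq_len, d_model, hidden=2048):
+        super().__init__()
+        self.seq_len, self.d_model = seq_len, d_model
+        # Shallow MLP over the flattened sequence replaces token mixing that
+        # self-attention would otherwise perform.
+        self.mlp = nn.Sequential(nn.Linear(seq_len * d_model, hidden),
+                                 nn.ReLU(),
+                                 nn.Linear(hidden, seq_len * d_model))
+
+    def forward(self, x):  # x: (batch, seq_len, d_model) with fixed seq_len
+        batch = x.size(0)
+        out = self.mlp(x.reshape(batch, -1))
+        return out.reshape(batch, self.seq_len, self.d_model)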
+
+ comment: Accepted at AAAI24(https://aaai.org/aaai-conference/) +
+
+
+
+
+ + ♻ ☆ ProAgent: From Robotic Process Automation to Agentic Process Automation + + +
+ From ancient water wheels to robotic process automation (RPA), automation
+technology has evolved throughout history to liberate human beings from arduous
+tasks. Yet, RPA struggles with tasks needing human-like intelligence,
+especially the elaborate design of workflows and dynamic decision-making during
+workflow execution. As Large Language Models (LLMs) have begun to exhibit
+human-like intelligence, this paper introduces Agentic Process Automation
+(APA), a groundbreaking automation paradigm that uses LLM-based agents for
+advanced automation by offloading the human labor of workflow construction and
+execution to agents. We then instantiate ProAgent, an LLM-based agent designed
+to craft workflows from human instructions and make intricate decisions by
+coordinating specialized agents. Empirical experiments detail its workflow
+construction and execution procedure, showcasing the feasibility of APA and
+unveiling the possibility of a new paradigm of automation driven by agents. Our
+code is publicly available at https://github.com/OpenBMB/ProAgent. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Causal Inference from Text: Unveiling Interactions between Variables EMNLP 2023 + + +
+ Adjusting for latent covariates is crucial for estimating causal effects from +observational textual data. Most existing methods only account for confounding +covariates that affect both treatment and outcome, potentially leading to +biased causal effects. This bias arises from insufficient consideration of +non-confounding covariates, which are relevant only to either the treatment or +the outcome. In this work, we aim to mitigate the bias by unveiling +interactions between different variables to disentangle the non-confounding +covariates when estimating causal effects from text. The disentangling process +ensures covariates only contribute to their respective objectives, enabling +independence between variables. Additionally, we impose a constraint to balance +representations from the treatment group and control group to alleviate +selection bias. We conduct experiments on two different treatment factors under +various scenarios, and the proposed model significantly outperforms recent +strong baselines. Furthermore, our thorough analysis on earnings call +transcripts demonstrates that our model can effectively disentangle the +variables, and further investigations into real-world scenarios provide +guidance for investors to make informed decisions. + +
+
+ comment: EMNLP 2023 Findings (mark typo corrected) +
+
+
+
+
+ + ♻ ☆ TrainerAgent: Customizable and Efficient Model Training through + LLM-Powered Multi-Agent System + + +
+ Training AI models has always been challenging, especially when there is a +need for custom models to provide personalized services. Algorithm engineers +often face a lengthy process to iteratively develop models tailored to specific +business requirements, making it even more difficult for non-experts. The quest +for high-quality and efficient model development, along with the emergence of +Large Language Model (LLM) Agents, has become a key focus in the industry. +Leveraging the powerful analytical, planning, and decision-making capabilities +of LLMs, we propose a TrainerAgent system comprising a multi-agent framework +including Task, Data, Model and Server agents. These agents analyze +user-defined tasks, input data, and requirements (e.g., accuracy, speed), +optimizing them comprehensively from both data and model perspectives to obtain +satisfactory models, and finally deploying these models as an online service. +Experimental evaluations on classical discriminative and generative tasks in +computer vision and natural language processing domains demonstrate that our +system consistently produces models that meet the desired criteria. +Furthermore, the system exhibits the ability to critically identify and reject +unattainable tasks, such as fantastical scenarios or unethical requests, +ensuring robustness and safety. This research presents a significant +advancement in achieving desired models with increased efficiency and quality +compared to traditional model development, facilitated by the integration of +LLM-powered analysis, decision-making, and execution capabilities, as well as +the collaboration among the four agents. We anticipate that our work will +contribute to the advancement of research on TrainerAgent in both academic and +industry communities, potentially establishing it as a new paradigm for model +development in the field of AI. + +
+
+
+
+
+ + ♻ ☆ ChiMed-GPT: A Chinese Medical Large Language Model with Full Training + Regime and Better Alignment to Human Preferences + + +
+ Recently, the increasing demand for superior medical services has highlighted +the discrepancies in the medical infrastructure. With big data, especially +texts, forming the foundation of medical services, there is an exigent need for +effective natural language processing (NLP) solutions tailored to the +healthcare domain. Conventional approaches leveraging pre-trained models +present promising results in this domain, and current large language models +(LLMs) offer an advanced foundation for medical text processing. However, most +medical LLMs are trained only with supervised fine-tuning (SFT); although SFT +efficiently empowers LLMs to understand and respond to medical instructions, it +is ineffective in learning domain knowledge and aligning with human preferences. +Another engineering barrier that prevents current medical LLMs from achieving better text +processing is their restricted context length (e.g., 2,048 tokens), +making it hard for the LLMs to process long context, which is frequently +required in the medical domain. In this work, we propose ChiMed-GPT, a new +benchmark LLM designed explicitly for the Chinese medical domain, with its +context length enlarged to 4,096 tokens and a comprehensive training regime +covering pre-training, SFT, and RLHF. Evaluations on real-world tasks including +information extraction, question answering, and dialogue generation demonstrate +ChiMed-GPT's superior performance over general domain LLMs. Furthermore, we +analyze possible biases by prompting ChiMed-GPT to complete attitude scales +regarding discrimination against patients, so as to contribute to the further +responsible development of LLMs in the medical domain. The code and model are +released at https://github.com/synlp/ChiMed-GPT. + +
+
+ comment: 17 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Exploration with Principles for Diverse AI Supervision + + +
+ Training large transformers using next-token prediction has given rise to +groundbreaking advancements in AI. While this generative AI approach has +produced impressive results, it heavily leans on human supervision. Even +state-of-the-art AI models like ChatGPT depend on fine-tuning through human +demonstrations, demanding extensive human input and domain expertise. This +strong reliance on human oversight poses a significant hurdle to the +advancement of AI innovation. To address this limitation, we propose a novel +paradigm termed Exploratory AI (EAI) aimed at autonomously generating +high-quality training data. Drawing inspiration from unsupervised reinforcement +learning (RL) pretraining, EAI achieves exploration within the natural language +space. We accomplish this by harnessing large language models to assess the +novelty of generated content. Our approach employs two key components: an actor +that generates novel content following exploration principles and a critic that +evaluates the generated content, offering critiques to guide the actor. +Empirical evaluations demonstrate that EAI significantly boosts model +performance on complex reasoning tasks, addressing the limitations of +human-intensive supervision. + +
+
+
+
+
+ + ♻ ☆ Fewer is More: Trojan Attacks on Parameter-Efficient Fine-Tuning + + +
+ Parameter-efficient fine-tuning (PEFT) enables efficient adaptation of +pre-trained language models (PLMs) to specific tasks. By tuning only a minimal +set of (extra) parameters, PEFT achieves performance comparable to full +fine-tuning. However, despite its prevalent use, the security implications of +PEFT remain largely unexplored. In this paper, we conduct a pilot study +revealing that PEFT exhibits unique vulnerability to trojan attacks. +Specifically, we present PETA, a novel attack that accounts for downstream +adaptation through bilevel optimization: the upper-level objective embeds the +backdoor into a PLM while the lower-level objective simulates PEFT to retain +the PLM's task-specific performance. With extensive evaluation across a variety +of downstream tasks and trigger designs, we demonstrate PETA's effectiveness in +terms of both attack success rate and unaffected clean accuracy, even after the +victim user performs PEFT over the backdoored PLM using untainted data. +Moreover, we empirically provide possible explanations for PETA's efficacy: the +bilevel optimization inherently 'orthogonalizes' the backdoor and PEFT modules, +thereby retaining the backdoor throughout PEFT. Based on this insight, we +explore a simple defense that omits PEFT in selected layers of the backdoored +PLM and unfreezes a subset of these layers' parameters, which is shown to +effectively neutralize PETA. + +
+
+ comment: 20 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ An evaluation of GPT models for phenotype concept recognition + + +
+ Objective: Clinical deep phenotyping and phenotype annotation play a critical +role in both the diagnosis of patients with rare disorders as well as in +building computationally-tractable knowledge in the rare disorders field. These +processes rely on using ontology concepts, often from the Human Phenotype +Ontology, in conjunction with a phenotype concept recognition task (supported +usually by machine learning methods) to curate patient profiles or existing +scientific literature. With the significant shift in the use of large language +models (LLMs) for most NLP tasks, we examine the performance of the latest +Generative Pre-trained Transformer (GPT) models underpinning ChatGPT as a +foundation for the tasks of clinical phenotyping and phenotype annotation. +Materials and Methods: The experimental setup of the study included seven +prompts of various levels of specificity, two GPT models (gpt-3.5-turbo and +gpt-4.0) and two established gold standard corpora for phenotype recognition, +one consisting of publication abstracts and the other clinical observations. +Results: Our results show that, with an appropriate setup, these models can +achieve state of the art performance. The best run, using few-shot learning, +achieved 0.58 macro F1 score on publication abstracts and 0.75 macro F1 score +on clinical observations, the former being comparable with the state of the +art, while the latter surpassing the current best in class tool. Conclusion: +While the results are promising, the non-deterministic nature of the outcomes, +the high cost and the lack of concordance between different runs using the same +prompt and input make the use of these LLMs challenging for this particular +task. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 74 + +
+
+
+ + ☆ Robust and Interpretable COVID-19 Diagnosis on Chest X-ray Images using + Adversarial Training + + +
+ The novel 2019 Coronavirus disease (COVID-19) global pandemic is a defining +health crisis. Recent efforts have been increasingly directed towards achieving +quick and accurate detection of COVID-19 across symptomatic patients to +mitigate the intensity and spread of the disease. Artificial intelligence (AI) +algorithms applied to chest X-ray (CXR) images have emerged as promising +diagnostic tools, and previous work has demonstrated impressive classification +performances. However, such methods have faced criticisms from physicians due +to their black-box reasoning process and unpredictable nature. In contrast to +professional radiologist diagnosis, AI systems often lack generalizability, +explainability, and robustness in the clinical decision making process. In our +work, we address these issues by first proposing an extensive baseline study, +training and evaluating 21 convolutional neural network (CNN) models on a +diverse set of 33,000+ CXR images to classify between healthy, COVID-19, and +non-COVID-19 pneumonia CXRs. Our resulting models achieved a 3-way +classification accuracy, recall, and precision of up to 97.03\%, 97.97\%, and +99.95\%, respectively. Next, we investigate the effectiveness of adversarial +training on model robustness and explainability via Gradient-weighted Class +Activation Mapping (Grad-CAM) heatmaps. We find that adversarially trained +models not only significantly outperform their standard counterparts on +classifying perturbed images, but also yield saliency maps that 1) better +specify clinically relevant features, 2) are robust against extraneous +artifacts, and 3) agree considerably more with expert radiologist findings. + +
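A minimal sketch of the adversarial training ingredient mentioned above: a single FGSM-style training step that mixes clean and perturbed images. The backbone, class count, perturbation budget, and random data are placeholder assumptions; the paper's exact attack and schedule may differ.

```python
# Hedged sketch: one FGSM-style adversarial training step for an image
# classifier. Backbone, epsilon and data are placeholders; the paper's exact
# attack, budget and training schedule may differ.
import torch
import torch.nn as nn
import torchvision

model = torchvision.models.resnet18(num_classes=3)   # e.g. healthy / COVID-19 / other pneumonia
criterion = nn.CrossEntropyLoss()
opt = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.9)
epsilon = 2.0 / 255.0                                 # L-inf perturbation budget (assumed)

def fgsm(images, labels):
    """Craft FGSM adversarial examples against the current model."""
    images = images.clone().detach().requires_grad_(True)
    loss = criterion(model(images), labels)
    grad = torch.autograd.grad(loss, images)[0]
    adv = images + epsilon * grad.sign()
    return adv.clamp(0.0, 1.0).detach()

# toy batch standing in for preprocessed chest X-rays
x = torch.rand(4, 3, 224, 224)
y = torch.randint(0, 3, (4,))

adv_x = fgsm(x, y)
loss = 0.5 * criterion(model(x), y) + 0.5 * criterion(model(adv_x), y)
opt.zero_grad()
loss.backward()
opt.step()
```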
+
+
+
+
+ + ☆ A New Benchmark and Model for Challenging Image Manipulation Detection + + +
+ The ability to detect manipulation in multimedia data is vital in digital +forensics. Existing Image Manipulation Detection (IMD) methods are mainly based +on detecting anomalous features arising from image editing or double compression +artifacts. All existing IMD techniques encounter challenges when it comes to +detecting small tampered regions in a large image. Moreover, +compression-based IMD approaches face difficulties in cases of double +compression with identical quality factors. To investigate the State-of-The-Art +(SoTA) IMD methods in those challenging conditions, we introduce a new +Challenging Image Manipulation Detection (CIMD) benchmark dataset, which +consists of two subsets, for evaluating editing-based and compression-based IMD +methods, respectively. The dataset images were manually captured and tampered with, +and come with high-quality annotations. In addition, we propose a new two-branch network +model based on HRNet that can better detect both the image-editing and +compression artifacts in those challenging conditions. Extensive experiments on +the CIMD benchmark show that our model significantly outperforms SoTA IMD +methods on CIMD. + +
+
+ comment: 8 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ ECRF: Entropy-Constrained Neural Radiance Fields Compression with + Frequency Domain Optimization + + +
+ Explicit feature-grid based NeRF models have shown promising results in terms +of rendering quality and significant speed-up in training. However, these +methods often require a significant amount of data to represent a single scene +or object. In this work, we present a compression model that aims to minimize +the entropy in the frequency domain in order to effectively reduce the data +size. First, we propose using the discrete cosine transform (DCT) on the +tensorial radiance fields to compress the feature-grid. This feature-grid is +transformed into coefficients, which are then quantized and entropy encoded, +following a similar approach to the traditional video coding pipeline. +Furthermore, to achieve a higher level of sparsity, we propose using an entropy +parameterization technique for the frequency domain, specifically for DCT +coefficients of the feature-grid. Since the transformed coefficients are +optimized during the training phase, the proposed model does not require any +fine-tuning or additional information. Our model only requires a lightweight +compression pipeline for encoding and decoding, making it easier to apply +volumetric radiance field methods for real-world applications. Experimental +results demonstrate that our proposed frequency domain entropy model can +achieve superior compression performance across various datasets. The source +code will be made publicly available. + +
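A small sketch of the transform-and-quantize step described above: a toy feature grid is transformed with a multi-dimensional DCT, uniformly quantized, and given a crude entropy estimate before reconstruction. The grid shape, quantization step, and entropy proxy are assumptions for illustration only, not the paper's pipeline.

```python
# Hedged sketch: DCT transform, uniform quantization and a simple entropy
# estimate for a toy feature grid. Shapes, step size and the entropy proxy
# are illustrative assumptions only.
import numpy as np
from scipy.fft import dctn, idctn

rng = np.random.default_rng(0)
feature_grid = rng.normal(size=(32, 32, 16)).astype(np.float32)  # stand-in feature grid

coeffs = dctn(feature_grid, norm="ortho")       # transform to the frequency domain
step = 0.1                                      # uniform quantization step (assumed)
q = np.round(coeffs / step).astype(np.int32)    # quantized coefficients to be entropy coded

# crude entropy estimate of the quantized symbols (bits per coefficient)
values, counts = np.unique(q, return_counts=True)
p = counts / counts.sum()
entropy_bits = -(p * np.log2(p)).sum()

recon = idctn(q * step, norm="ortho")           # decoder side: dequantize and inverse DCT
mse = float(np.mean((recon - feature_grid) ** 2))
print(f"~{entropy_bits:.2f} bits/coeff, reconstruction MSE {mse:.5f}")
```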
+
+ comment: 10 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ A Systematic Review of Deep Learning-based Research on Radiology Report + Generation + + +
+ Radiology report generation (RRG) aims to automatically generate free-text +descriptions from clinical radiographs, e.g., chest X-Ray images. RRG plays an +essential role in promoting clinical automation, providing practical assistance +for inexperienced doctors and alleviating radiologists' workloads. Therefore, +considering this meaningful potential, research on RRG has experienced explosive +growth over the past half-decade, +especially with the rapid development of deep learning approaches. Existing +studies perform RRG from the perspective of enhancing different modalities, +provide insights on optimizing the report generation process with elaborated +features from both visual and textual information, and further facilitate RRG +with the cross-modal interactions among them. In this paper, we present a +comprehensive review of deep learning-based RRG from various perspectives. +Specifically, we first cover pivotal RRG approaches based on the +task-specific features of radiographs, reports, and the cross-modal relations +between them, then illustrate the benchmark datasets conventionally used +for this task together with evaluation metrics, subsequently analyze the performance of +different approaches, and finally offer our summary of the challenges and the +trends in future directions. Overall, the goal of this paper is to serve as a +tool for understanding the existing literature and inspiring potentially valuable +research in the field of RRG. + +
+
+ comment: 26 pages, 6 figures +
+
+
+
+
+ + ☆ Enhancing mTBI Diagnosis with Residual Triplet Convolutional Neural + Network Using 3D CT + + +
+ Mild Traumatic Brain Injury (mTBI) is a common and challenging condition to +diagnose accurately. Timely and precise diagnosis is essential for effective +treatment and improved patient outcomes. Traditional diagnostic methods for +mTBI often have limitations in terms of accuracy and sensitivity. In this +study, we introduce an innovative approach to enhance mTBI diagnosis using 3D +Computed Tomography (CT) images and a metric learning technique trained with +triplet loss. To address these challenges, we propose a Residual Triplet +Convolutional Neural Network (RTCNN) model to distinguish between mTBI cases +and healthy ones by embedding 3D CT scans into a feature space. The triplet +loss function maximizes the margin between similar and dissimilar image pairs, +optimizing feature representations. This facilitates better context placement +of individual cases, aids informed decision-making, and has the potential to +improve patient outcomes. Our RTCNN model shows promising performance in mTBI +diagnosis, achieving an average accuracy of 94.3%, a sensitivity of 94.1%, and +a specificity of 95.2%, as confirmed through a five-fold cross-validation. +Importantly, when compared to the conventional Residual Convolutional Neural +Network (RCNN) model, the RTCNN exhibits a significant improvement, showcasing +a remarkable 22.5% increase in specificity, a notable 16.2% boost in accuracy, +and an 11.3% enhancement in sensitivity. Moreover, RTCNN requires lower memory +resources, making it not only highly effective but also resource-efficient in +minimizing false positives while maximizing its diagnostic accuracy in +distinguishing normal CT scans from mTBI cases. The quantitative performance +metrics provided and utilization of occlusion sensitivity maps to visually +explain the model's decision-making process further enhance the +interpretability and transparency of our approach. + +
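A minimal sketch of the metric-learning ingredient described above: a small 3D encoder trained with a triplet margin loss so that same-class volumes embed close together. The tiny network, margin, and random volumes are placeholders, not the paper's RTCNN architecture or CT data.

```python
# Hedged sketch: training a small 3D-CNN embedding with a triplet margin loss.
# Network, margin and random volumes are placeholders, not the paper's setup.
import torch
import torch.nn as nn

embed = nn.Sequential(            # tiny 3D encoder standing in for the residual CNN
    nn.Conv3d(1, 8, 3, stride=2, padding=1), nn.ReLU(),
    nn.Conv3d(8, 16, 3, stride=2, padding=1), nn.ReLU(),
    nn.AdaptiveAvgPool3d(1), nn.Flatten(),
    nn.Linear(16, 32),
)
triplet = nn.TripletMarginLoss(margin=1.0)
opt = torch.optim.Adam(embed.parameters(), lr=1e-3)

# toy triplets: anchor/positive from one class, negative from the other class
anchor   = torch.randn(4, 1, 32, 32, 32)
positive = torch.randn(4, 1, 32, 32, 32)
negative = torch.randn(4, 1, 32, 32, 32)

loss = triplet(embed(anchor), embed(positive), embed(negative))
opt.zero_grad()
loss.backward()
opt.step()
```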
+
+
+
+
+ + ☆ HACD: Hand-Aware Conditional Diffusion for Monocular Hand-Held Object + Reconstruction + + +
+ Reconstructing hand-held objects from a single RGB image without known 3D +object templates, category prior, or depth information is a vital yet +challenging problem in computer vision. In contrast to prior works that utilize +deterministic modeling paradigms, which make it hard to account for the +uncertainties introduced by hand- and self-occlusion, we employ a probabilistic +point cloud denoising diffusion model to tackle the above challenge. In this +work, we present Hand-Aware Conditional Diffusion for monocular hand-held +object reconstruction (HACD), modeling the hand-object interaction in two +aspects. First, we introduce hand-aware conditioning to model hand-object +interaction from both semantic and geometric perspectives. Specifically, a +unified hand-object semantic embedding compensates for the 2D local feature +deficiency induced by hand occlusion, and a hand articulation embedding further +encodes the relationship between object vertices and hand joints. Second, we +propose a hand-constrained centroid fixing scheme, which utilizes hand vertices +priors to restrict the centroid deviation of partially denoised point cloud +during diffusion and reverse process. Removing the centroid bias interference +allows the diffusion models to focus on the reconstruction of shape, thus +enhancing the stability and precision of local feature projection. Experiments +on the synthetic ObMan dataset and two real-world datasets, HO3D and MOW, +demonstrate our approach surpasses all existing methods by a large margin. + +
+
+
+
+
+ + ☆ TCuPGAN: A novel framework developed for optimizing human-machine + interactions in citizen science ECML + + +
+ In the era of big data in scientific research, there is a necessity to +leverage techniques which reduce human effort in labeling and categorizing +large datasets by involving sophisticated machine tools. To combat this +problem, we present a novel, general purpose model for 3D segmentation that +leverages patch-wise adversariality and Long Short-Term Memory to encode +sequential information. Using this model alongside citizen science projects +which use 3D datasets (image cubes) on the Zooniverse platforms, we propose an +iterative human-machine optimization framework where only a fraction of the 2D +slices from these cubes are seen by the volunteers. We leverage the patch-wise +discriminator in our model to provide an estimate of which slices within these +image cubes have poorly generalized feature representations, and +correspondingly poor machine performance. These images with corresponding +machine proposals would be presented to volunteers on Zooniverse for +correction, leading to a drastic reduction in the volunteer effort on citizen +science projects. We trained our model on ~2300 liver tissue 3D electron +micrographs. Lipid droplets were segmented within these images through human +annotation via the `Etch A Cell - Fat Checker' citizen science project, hosted +on the Zooniverse platform. In this work, we demonstrate this framework and the +selection methodology which resulted in a measured reduction in volunteer +effort by more than 60%. We envision this type of joint human-machine +partnership will be of great use on future Zooniverse projects. + +
+
+ comment: 5 pages, 1 figure, accepted for publication at HLDM '23 (ECML PKDD + 2023 workshop) +
+
+
+
+
+ + ☆ Appearance-based gaze estimation enhanced with synthetic images using + deep neural networks + + +
+ Human eye gaze estimation is an important cognitive ingredient for successful +human-robot interaction, enabling the robot to read and predict human behavior. +We approach this problem using artificial neural networks and build a modular +system estimating gaze from separately cropped eyes, taking advantage of +existing well-functioning components for face detection (RetinaFace) and head +pose estimation (6DRepNet). Our proposed method does not require any special +hardware or infrared filters but uses a standard notebook built-in RGB camera, +as is common for appearance-based methods. Using the MetaHuman tool, we +also generated a large synthetic dataset of more than 57,000 human faces and +made it publicly available. The inclusion of this dataset (with eye gaze and +head pose information), on top of the standard Columbia Gaze dataset, in +training the model led to better accuracy with a mean average error below two +degrees in eye pitch and yaw directions, which compares favourably to related +methods. We also verified the feasibility of our model by preliminary +testing in a real-world setting using the built-in 4K camera in the NICO semi-humanoid +robot's eye. + +
+
+ comment: 6 pages, 10 figures, accepted to 2023 IEEE Symposium Series on + Computational Intelligence +
+
+
+
+
+ + ☆ GigaPose: Fast and Robust Novel Object Pose Estimation via One + Correspondence + + +
+ We present GigaPose, a fast, robust, and accurate method for CAD-based novel +object pose estimation in RGB images. GigaPose first leverages discriminative +templates, rendered images of the CAD models, to recover the out-of-plane +rotation and then uses patch correspondences to estimate the four remaining +parameters. Our approach samples templates in only a two-degrees-of-freedom +space instead of the usual three and matches the input image to the templates +using fast nearest neighbor search in feature space, resulting in a speedup +factor of 38x compared to the state of the art. Moreover, GigaPose is +significantly more robust to segmentation errors. Our extensive evaluation on +the seven core datasets of the BOP challenge demonstrates that it achieves +state-of-the-art accuracy and can be seamlessly integrated with a refinement +method. Additionally, we show the potential of GigaPose with 3D models +predicted by recent work on 3D reconstruction from a single image, relaxing the +need for CAD models and making 6D object pose estimation much more convenient. +Our source code and trained models are publicly available at +https://github.com/nv-nguyen/gigaPose + +
+
+
+
+
+ + ☆ Automated 3D Tumor Segmentation using Temporal Cubic PatchGAN (TCuP-GAN) + + +
+ Development of robust general purpose 3D segmentation frameworks using the +latest deep learning techniques is one of the active topics in various +bio-medical domains. In this work, we introduce Temporal Cubic PatchGAN +(TCuP-GAN), a volume-to-volume translational model that marries the concepts of +a generative feature learning framework with Convolutional Long Short-Term +Memory Networks (LSTMs), for the task of 3D segmentation. We demonstrate the +capabilities of our TCuP-GAN on the data from four segmentation challenges +(Adult Glioma, Meningioma, Pediatric Tumors, and Sub-Saharan Africa subset) +featured within the 2023 Brain Tumor Segmentation (BraTS) Challenge and +quantify its performance using LesionWise Dice similarity and $95\%$ Hausdorff +Distance metrics. We demonstrate the successful learning of our framework to +predict robust multi-class segmentation masks across all the challenges. This +benchmarking work serves as a stepping stone for future efforts towards +applying TCuP-GAN on other multi-class tasks such as multi-organelle +segmentation in electron microscopy imaging. + +
+
+ comment: Submitted as a short paper to the proceedings of the 2023 Brain Tumor + Segmentation (BraTS) Challenge +
+
+
+
+
+ + ☆ Class Balanced Dynamic Acquisition for Domain Adaptive Semantic + Segmentation using Active Learning NeurIPS 2023 + + +
+ Domain adaptive active learning is leading the charge in label-efficient +training of neural networks. For semantic segmentation, state-of-the-art models +jointly use two criteria of uncertainty and diversity to select training +labels, combined with a pixel-wise acquisition strategy. However, we show that +such methods currently suffer from a class imbalance issue which degrades their +performance for larger active learning budgets. We then introduce Class +Balanced Dynamic Acquisition (CBDA), a novel active learning method that +mitigates this issue, especially in high-budget regimes. The more balanced +labels increase minority class performance, which in turn allows the model to +outperform the previous baseline by 0.6, 1.7, and 2.4 mIoU for budgets of 5%, +10%, and 20%, respectively. Additionally, the focus on minority classes leads +to improvements of the minimum class performance of 0.5, 2.9, and 4.6 IoU +respectively. The top-performing model even exceeds the fully supervised +baseline, showing that a more balanced label than the entire ground truth can +be beneficial. + +
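A minimal sketch of one plausible reading of class-balanced acquisition: given per-pixel uncertainty and predicted classes, request labels for the most uncertain pixels under an equal per-class budget. The balancing rule and random inputs are illustrative assumptions, not necessarily the exact CBDA acquisition function.

```python
# Hedged sketch: class-balanced selection of pixels to annotate from per-pixel
# uncertainty scores. The equal per-class budget is an illustrative assumption.
import numpy as np

rng = np.random.default_rng(0)
num_classes, h, w = 5, 64, 64
uncertainty = rng.random((h, w))                    # e.g. predictive entropy per pixel
pred = rng.integers(0, num_classes, size=(h, w))    # current model's predicted class map
budget = 500                                        # total pixels we may ask to annotate

per_class = budget // num_classes
selected = np.zeros((h, w), dtype=bool)
for c in range(num_classes):
    idx = np.flatnonzero(pred == c)                 # candidate pixels predicted as class c
    if idx.size == 0:
        continue
    k = min(per_class, idx.size)
    top = idx[np.argsort(uncertainty.ravel()[idx])[-k:]]   # most uncertain pixels of class c
    selected.ravel()[top] = True                    # mark them for annotation

print("pixels requested:", int(selected.sum()))
```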
+
+ comment: NeurIPS 2023 Workshop on Adaptive Experimental Design and Active + Learning in the Real World +
+
+
+
+
+ + ☆ ACT: Adversarial Consistency Models + + +
+ Though diffusion models excel in image generation, their step-by-step +denoising leads to slow generation speeds. Consistency training addresses this +issue with single-step sampling but often produces lower-quality generations +and requires high training costs. In this paper, we show that optimizing the +consistency training loss minimizes the Wasserstein distance between target and +generated distributions. As the timestep increases, the upper bound accumulates +previous consistency training losses. Therefore, larger batch sizes are needed +to reduce both current and accumulated losses. We propose Adversarial +Consistency Training (ACT), which directly minimizes the Jensen-Shannon (JS) +divergence between distributions at each timestep using a discriminator. +Theoretically, ACT enhances generation quality and convergence. By +incorporating a discriminator into the consistency training framework, our +method achieves improved FID scores on CIFAR10 and ImageNet 64$\times$64, +retains zero-shot image inpainting capabilities, and uses less than $1/6$ of +the original batch size and fewer than $1/2$ of the model parameters and +training steps compared to the baseline method, which leads to a substantial +reduction in resource consumption. + +
+
+
+
+
+ + ☆ Video Anomaly Detection using GAN + + +
+ Given the increased concern for public safety, automatic abnormal +event detection and recognition in a surveillance scene is crucial. It remains an +open research subject because of its intricacy and utility. Automatically +identifying aberrant events is a difficult undertaking +because everyone's idea of abnormality is different. A typical occurrence in +one circumstance could be seen as aberrant in another. Automatic anomaly +identification becomes particularly challenging in surveillance footage +with a large crowd due to congestion and high occlusion. Using +machine learning techniques, this thesis aims to offer a solution for +this use case so that human resources are not required to monitor the +surveillance recordings for unusual activity. We have developed a +novel generative adversarial network (GAN) based anomaly detection model. This +model is trained to jointly learn the construction of a high-dimensional +image space and the estimation of the latent space from the video's +context. The generator uses a residual autoencoder architecture made up of a +multi-stage channel attention-based decoder and a two-stream, deep +convolutional encoder that can capture both spatial and temporal information. We have +also offered a technique for refining the GAN model that reduces training time +while also generalising the model by utilising transfer learning between +datasets. Using a variety of assessment measures, we compare our model to +current state-of-the-art techniques on four benchmark datasets. The empirical +findings indicate that, in comparison to existing techniques, our network +performs favourably on all datasets. + +
+
+
+
+
+ + ☆ Class Uncertainty: A Measure to Mitigate Class Imbalance + + +
+ Class-wise characteristics of training examples affect the performance of +deep classifiers. A well-studied example is when the number of training +examples of classes follows a long-tailed distribution, a situation that is +likely to yield sub-optimal performance for under-represented classes. This +class imbalance problem is conventionally addressed by approaches relying on +the class-wise cardinality of training examples, such as data resampling. In +this paper, we demonstrate that considering solely the cardinality of classes +does not cover all issues causing class imbalance. To measure class imbalance, +we propose "Class Uncertainty" as the average predictive uncertainty of the +training examples, and we show that this novel measure captures the differences +across classes better than cardinality. We also curate SVCI-20 as a novel +dataset in which the classes have equal number of training examples but they +differ in terms of their hardness; thereby causing a type of class imbalance +which cannot be addressed by the approaches relying on cardinality. We +incorporate our "Class Uncertainty" measure into a diverse set of ten class +imbalance mitigation methods to demonstrate its effectiveness on long-tailed +datasets as well as on our SVCI-20. Code and datasets will be made available. + +
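A minimal sketch of the measure described above, computed as the average predictive entropy of each class's training examples. The model outputs and labels are random placeholders; the paper's exact uncertainty estimator may differ.

```python
# Hedged sketch: per-class "uncertainty" as the mean predictive entropy of that
# class's training examples. Logits and labels are random placeholders.
import torch
import torch.nn.functional as F

num_classes, n = 10, 256
logits = torch.randn(n, num_classes)              # model outputs on training examples
labels = torch.randint(0, num_classes, (n,))

probs = F.softmax(logits, dim=1)
entropy = -(probs * probs.clamp_min(1e-12).log()).sum(dim=1)   # per-example entropy

class_uncertainty = torch.zeros(num_classes)
for c in range(num_classes):
    mask = labels == c
    if mask.any():
        class_uncertainty[c] = entropy[mask].mean()            # average over class c

# classes with higher scores would receive stronger re-weighting / re-sampling
print(class_uncertainty)
```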
+
+
+
+
+ + ☆ Brain MRI Screening Tool with Federated Learning + + +
+ In clinical practice, we often see significant delays between MRI scans and +the diagnosis made by radiologists, even for severe cases. In some cases, this +may be caused by the lack of additional information and clues, so even the +severe cases need to wait in the queue for diagnosis. This can be avoided if +there is an automatic software tool, which would supplement additional +information, alerting radiologists that the particular patient may be a severe +case. + We are presenting an automatic brain MRI Screening Tool and we are +demonstrating its capabilities for detecting tumor-like pathologies. It is the +first version on the path toward a robust multi-pathology screening solution. +The tool supports Federated Learning, so multiple institutions may contribute +to the model without disclosing their private data. + +
+
+ comment: 5 pages, 2 figures. Submitted to ISBI 2024 conference +
+
+
+
+
+ + ☆ AI-Generated Images Introduce Invisible Relevance Bias to Text-Image + Retrieval + + +
+ With the advancement of generation models, AI-generated content (AIGC) is +becoming more realistic, flooding the Internet. A recent study suggests that +this phenomenon has elevated the issue of source bias in text retrieval for web +searches. Specifically, neural retrieval models tend to rank generated texts +higher than human-written texts. In this paper, we extend the study of this +bias to cross-modal retrieval. Firstly, we successfully construct a suitable +benchmark to explore the existence of the bias. Subsequent extensive +experiments on this benchmark reveal that AI-generated images introduce an +invisible relevance bias to text-image retrieval models. Specifically, our +experiments show that text-image retrieval models tend to rank the AI-generated +images higher than the real images, even though the AI-generated images do not +exhibit more visually relevant features to the query than real images. This +invisible relevance bias is prevalent across retrieval models with varying +training data and architectures. Furthermore, our subsequent exploration +reveals that the inclusion of AI-generated images in the training data of the +retrieval models exacerbates the invisible relevance bias. The above phenomenon +triggers a vicious cycle, which makes the invisible relevance bias become more +and more serious. To elucidate the potential causes of invisible relevance and +address the aforementioned issues, we introduce an effective training method +aimed at alleviating the invisible relevance bias. Subsequently, we apply our +proposed debiasing method to retroactively identify the causes of invisible +relevance, revealing that the AI-generated images induce the image encoder to +embed additional information into their representation. This information +exhibits a certain consistency across generated images with different semantics +and can make the retriever estimate a higher relevance score. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ You Only Explain Once + + +
+ In this paper, we propose a new black-box explainability algorithm and tool, +YO-ReX, for efficient explanation of the outputs of object detectors. The new +algorithm computes explanations for all objects detected in the image +simultaneously. Hence, compared to the baseline, the new algorithm reduces the +number of queries by a factor of 10X for the case of ten detected objects. The +speedup increases further with the number of objects. Our experimental +results demonstrate that YO-ReX can explain the outputs of YOLO with a +negligible overhead over the running time of YOLO. We also demonstrate similar +results for explaining SSD and Faster R-CNN. The speedup is achieved by +avoiding backtracking through a combination of aggressive pruning and causal analysis. + +
+
+
+
+
+ + ☆ Learning Saliency From Fixations + + +
+ We present a novel approach for saliency prediction in images, leveraging +parallel decoding in transformers to learn saliency solely from fixation maps. +Models typically rely on continuous saliency maps to overcome the difficulty +of optimizing for the discrete fixation map. We attempt to replicate the +experimental setup that generates saliency datasets. Our approach treats +saliency prediction as a direct set prediction problem, via a global loss that +enforces unique fixation predictions through bipartite matching and a +transformer encoder-decoder architecture. By utilizing a fixed set of learned +fixation queries, the cross-attention reasons over the image features to +directly output the fixation points, distinguishing it from other modern +saliency predictors. Our approach, named Saliency TRansformer (SalTR), achieves +metric scores on par with state-of-the-art approaches on the Salicon and MIT300 +benchmarks. + +
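A minimal sketch of the bipartite-matching idea that such set-prediction losses rely on: predicted fixation points are matched one-to-one to ground-truth fixations with the Hungarian algorithm before a location cost is averaged. The point counts and coordinates are toy assumptions, not the paper's loss.

```python
# Hedged sketch: Hungarian matching between predicted and ground-truth
# fixation points, as used in set-prediction style losses. Toy data only.
import numpy as np
from scipy.optimize import linear_sum_assignment

rng = np.random.default_rng(0)
pred = rng.random((10, 2))    # 10 predicted fixation coordinates in [0, 1]^2
gt = rng.random((6, 2))       # 6 ground-truth fixations for this image

# pairwise L2 cost between every prediction and every ground-truth fixation
cost = np.linalg.norm(pred[:, None, :] - gt[None, :, :], axis=-1)

rows, cols = linear_sum_assignment(cost)       # one-to-one assignment
matched_cost = cost[rows, cols].mean()         # location term of a set loss
print(f"matched predictions: {len(rows)}, mean matched distance: {matched_cost:.3f}")
```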
+
+
+
+
+ + ☆ HGCLIP: Exploring Vision-Language Models with Graph Representations for + Hierarchical Understanding + + +
+ Object categories are typically organized into a multi-granularity taxonomic +hierarchy. When classifying categories at different hierarchy levels, +traditional uni-modal approaches focus primarily on image features, revealing +limitations in complex scenarios. Recent studies integrating Vision-Language +Models (VLMs) with class hierarchies have shown promise, yet they fall short of +fully exploiting the hierarchical relationships. These efforts are constrained +by their inability to perform effectively across varied granularity of +categories. To tackle this issue, we propose a novel framework (HGCLIP) that +effectively combines CLIP with a deeper exploitation of the Hierarchical class +structure via Graph representation learning. We explore constructing the class +hierarchy into a graph, with its nodes representing the textual or image +features of each category. After passing through a graph encoder, the textual +features incorporate hierarchical structure information, while the image +features emphasize class-aware features derived from prototypes through the +attention mechanism. Our approach demonstrates significant improvements on both +generic and fine-grained visual recognition benchmarks. Our codes are fully +available at https://github.com/richard-peng-xia/HGCLIP. + +
+
+
+
+
+ + ☆ Do VSR Models Generalize Beyond LRS3? + + +
+ The Lip Reading Sentences-3 (LRS3) benchmark has primarily been the focus of +intense research in visual speech recognition (VSR) during the last few years. +As a result, there is an increased risk of overfitting to its excessively used +test set, which is only one hour in duration. To alleviate this issue, we build a +new VSR test set named WildVSR, by closely following the LRS3 dataset creation +processes. We then evaluate and analyse the extent to which the current VSR +models generalize to the new test data. We evaluate a broad range of publicly +available VSR models and find significant drops in performance on our test set, +compared to their corresponding LRS3 results. Our results suggest that the +increase in word error rates is caused by the models' inability to generalize to +slightly harder, in-the-wild lip sequences than those found in the LRS3 test +set. Our new test benchmark is made public in order to enable future research +towards more robust VSR models. + +
+
+
+
+
+ + ☆ Hardware Resilience Properties of Text-Guided Image Classifiers NeurIPS 2023 + + +
+ This paper presents a novel method to enhance the reliability of image +classification models during deployment in the face of transient hardware +errors. By utilizing enriched text embeddings derived from GPT-3 with question +prompts per class and CLIP pretrained text encoder, we investigate their impact +as an initialization for the classification layer. Our approach achieves a +remarkable $5.5\times$ average increase in hardware reliability (and up to 14x) +across various architectures in the most critical layer, with minimal accuracy +drop (0.3% on average) compared to baseline PyTorch models. Furthermore, our +method seamlessly integrates with any image classification backbone, showcases +results across various network architectures, decreases parameter and FLOPs +overhead, and follows a consistent training recipe. This research offers a +practical and efficient solution to bolster the robustness of image +classification models against hardware failures, with potential implications +for future studies in this domain. Our code and models are released at +https://github.com/TalalWasim/TextGuidedResilience. + +
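A minimal sketch of the text-guided initialization idea: class-wise text embeddings are normalized and copied into a linear classification head before training. Here the embeddings are random placeholders; in practice they would come from a pretrained text encoder (e.g. CLIP) applied to per-class prompts, as the abstract describes.

```python
# Hedged sketch: initializing a classification head from class-wise text
# embeddings. Embeddings are random stand-ins for text-encoder outputs.
import torch
import torch.nn as nn

num_classes, feat_dim = 1000, 512

# stand-in for text embeddings of enriched per-class descriptions
text_embeddings = torch.randn(num_classes, feat_dim)
text_embeddings = nn.functional.normalize(text_embeddings, dim=1)

classifier = nn.Linear(feat_dim, num_classes, bias=False)
with torch.no_grad():
    classifier.weight.copy_(text_embeddings)      # text-guided initialization

# the head is then trained (or fine-tuned) on image features as usual
image_features = torch.randn(8, feat_dim)
logits = classifier(image_features)
print(logits.shape)   # torch.Size([8, 1000])
```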
+
+ comment: Accepted at NeurIPS 2023 +
+
+
+
+
+ + ☆ Assessment of Deep Learning Segmentation for Real-Time Free-Breathing + Cardiac Magnetic Resonance Imaging + + +
+ In recent years, a variety of deep learning networks for cardiac MRI (CMR) +segmentation have been developed and analyzed. However, nearly all of them are +focused on cine CMR under breath-hold. In this work, the accuracy of deep learning +methods is assessed for volumetric analysis (via segmentation) of the left +ventricle in real-time free-breathing CMR at rest and under exercise stress. +Data from healthy volunteers (n=15) for cine and real-time free-breathing CMR +were analyzed retrospectively. Segmentations from commercial software (comDL) +and a freely available neural network (nnU-Net) were compared to a reference +created via the manual correction of comDL segmentation. Segmentation of left +ventricular endocardium (LV), left ventricular myocardium (MYO), and right +ventricle (RV) is evaluated for both end-systolic and end-diastolic phases and +analyzed with Dice's coefficient (DC). The volumetric analysis includes LV +end-diastolic volume (EDV), LV end-systolic volume (ESV), and LV ejection +fraction (EF). For cine CMR, nnU-Net and comDL achieve a DC above 0.95 for LV +and above 0.9 for MYO and RV. For real-time CMR, the accuracy of nnU-Net exceeds +that of comDL overall. For real-time CMR at rest, nnU-Net achieves a DC of 0.94 +for LV, 0.89 for MYO, and 0.90 for RV; mean absolute differences between +nnU-Net and reference are 2.9mL for EDV, 3.5mL for ESV and 2.6% for EF. For +real-time CMR under exercise stress, nnU-Net achieves a DC of 0.92 for LV, 0.85 +for MYO, and 0.83 for RV; mean absolute differences between nnU-Net and +reference are 11.4mL for EDV, 2.9mL for ESV and 3.6% for EF. Deep learning +methods designed or trained for cine CMR segmentation can perform well on +real-time CMR. For real-time free-breathing CMR at rest, the performance of +deep learning methods is comparable to inter-observer variability in cine CMR +and is usable for fully automatic segmentation. + +
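A minimal sketch of the evaluation metric used above: Dice's coefficient between a predicted and a reference binary mask. The toy random masks stand in for network and reference segmentations.

```python
# Hedged sketch: Dice similarity coefficient for binary segmentation masks.
import numpy as np

def dice(pred: np.ndarray, ref: np.ndarray, eps: float = 1e-8) -> float:
    """Dice similarity coefficient between two binary masks."""
    pred = pred.astype(bool)
    ref = ref.astype(bool)
    intersection = np.logical_and(pred, ref).sum()
    return float(2.0 * intersection / (pred.sum() + ref.sum() + eps))

# toy masks standing in for network and reference segmentations
rng = np.random.default_rng(0)
pred = rng.random((256, 256)) > 0.5
ref = rng.random((256, 256)) > 0.5
print(f"DC = {dice(pred, ref):.3f}")
```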
+
+ comment: *These authors contributed equally to this work +
+
+
+
+
+ + ☆ Understanding the Vulnerability of CLIP to Image Compression NeurIPS 2023 + + +
+ CLIP is a widely used foundational vision-language model that is used for +zero-shot image recognition and other image-text alignment tasks. We +demonstrate that CLIP is vulnerable to change in image quality under +compression. This surprising result is further analysed using an attribution +method-Integrated Gradients. Using this attribution method, we are able to +better understand both quantitatively and qualitatively exactly the nature in +which the compression affects the zero-shot recognition accuracy of this model. +We evaluate this extensively on CIFAR-10 and STL-10. Our work provides the +basis to understand this vulnerability of CLIP and can help us develop more +effective methods to improve the robustness of CLIP and other vision-language +models. + +
+
+ comment: R0-FoMo: Workshop on Robustness of Few-shot and Zero-shot Learning in + Foundation Models at NeurIPS 2023 +
+
+
+
+
+ + ☆ Continual Learning of Diffusion Models with Generative Distillation + + +
+ Diffusion models are powerful generative models that achieve state-of-the-art +performance in tasks such as image synthesis. However, training them demands +substantial amounts of data and computational resources. Continual learning +would allow for incrementally learning new tasks and accumulating knowledge, +thus reusing already trained models would be possible. One potentially suitable +approach is generative replay, where a copy of a generative model trained on +previous tasks produces synthetic data that are interleaved with data from the +current task. However, standard generative replay applied to diffusion models +results in a catastrophic loss in denoising capabilities. In this paper, we +propose generative distillation, an approach that distils the entire reverse +process of a diffusion model. We demonstrate that our approach significantly +improves the continual learning performance of generative replay with only a +moderate increase in the computational costs. + +
+
+
+
+
+ + ☆ Creating and Benchmarking a Synthetic Dataset for Cloud Optical + Thickness Estimation + + +
+ Cloud formations often obscure optical satellite-based monitoring of the +Earth's surface, thus limiting Earth observation (EO) activities such as land +cover mapping, ocean color analysis, and cropland monitoring. The integration +of machine learning (ML) methods within the remote sensing domain has +significantly improved performance on a wide range of EO tasks, including cloud +detection and filtering, but there is still much room for improvement. A key +bottleneck is that ML methods typically depend on large amounts of annotated +data for training, which is often difficult to come by in EO contexts. This is +especially true for the task of cloud optical thickness (COT) estimation. A +reliable estimation of COT enables more fine-grained and application-dependent +control compared to using pre-specified cloud categories, as is commonly done +in practice. To alleviate the COT data scarcity problem, in this work we +propose a novel synthetic dataset for COT estimation, where top-of-atmosphere +radiances have been simulated for 12 of the spectral bands of the +Multi-Spectral Instrument (MSI) sensor onboard Sentinel-2 platforms. These data +points have been simulated under consideration of different cloud types, COTs, +and ground surface and atmospheric profiles. Extensive experimentation of +training several ML models to predict COT from the measured reflectivity of the +spectral bands demonstrates the usefulness of our proposed dataset. +Generalization to real data is also demonstrated on two satellite image +datasets -- one that is publicly available, and one which we have collected and +annotated. The synthetic data, the newly collected real dataset, code and +models have been made publicly available at +https://github.com/aleksispi/ml-cloud-opt-thick. + +
+
+ comment: Code, data and models available at + https://github.com/aleksispi/ml-cloud-opt-thick +
+
+
+
+
+ + ☆ Shadow: A Novel Loss Function for Efficient Training in Siamese Networks + + +
+ Despite significant recent advances in similarity detection tasks, existing +approaches pose substantial challenges under memory constraints. One of the +primary reasons for this is the use of computationally expensive metric +learning loss functions such as Triplet Loss in Siamese networks. In this +paper, we present a novel loss function called Shadow Loss that compresses the +dimensions of an embedding space during loss calculation without loss of +performance. The distance between the projections of the embeddings is learned +from inputs on a compact projection space where distances directly correspond +to a measure of class similarity. Projecting onto a lower-dimensional +space, our loss function converges faster, and the resulting classified image +clusters have higher inter-class and smaller intra-class distances. Shadow Loss +not only reduces embedding dimensions, favoring memory-constrained devices, but +also consistently performs better than the state-of-the-art Triplet Margin Loss +by 5\%-10\% in accuracy across diverse datasets. The proposed loss function +is also model agnostic, upholding its performance across several tested models. +Its effectiveness and robustness across balanced, imbalanced, medical, and +non-medical image datasets suggest that it is not specific to a particular +model or dataset but demonstrates superior performance consistently while using +less memory and computation. + +
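A minimal sketch of the general idea of comparing embeddings after projecting them into a much lower-dimensional space, so the distance computation stays cheap. The margin-based formulation below is an illustrative interpretation under stated assumptions, not the exact Shadow Loss from the paper.

```python
# Hedged sketch: compare embeddings in a learnable low-dimensional projection
# space. The contrastive-style margin term is an illustrative interpretation.
import torch
import torch.nn as nn
import torch.nn.functional as F

embed_dim, shadow_dim, margin = 512, 32, 1.0
projector = nn.Linear(embed_dim, shadow_dim, bias=False)   # learnable compression

def shadow_style_loss(emb_a, emb_b, same_class):
    """Pull same-class pairs together / push different-class pairs apart
    in the compact projection space."""
    d = F.pairwise_distance(projector(emb_a), projector(emb_b))
    pos = same_class * d.pow(2)
    neg = (1 - same_class) * F.relu(margin - d).pow(2)
    return (pos + neg).mean()

emb_a = torch.randn(16, embed_dim)
emb_b = torch.randn(16, embed_dim)
same_class = torch.randint(0, 2, (16,)).float()
print(shadow_style_loss(emb_a, emb_b, same_class))
```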
+
+
+
+
+ + ☆ High-resolution Population Maps Derived from Sentinel-1 and Sentinel-2 + + +
+ Detailed population maps play an important role in diverse fields ranging +from humanitarian action to urban planning. Generating such maps in a timely +and scalable manner presents a challenge, especially in data-scarce regions. To +address this, we have developed POPCORN, a population mapping method whose only +inputs are free, globally available satellite images from Sentinel-1 and +Sentinel-2; and a small number of aggregate population counts over coarse +census districts for calibration. Despite the minimal data requirements, our +approach surpasses the mapping accuracy of existing schemes, including several +that rely on building footprints derived from high-resolution imagery. E.g., we +were able to produce population maps for Rwanda with 100m GSD based on less +than 400 regional census counts. In Kigali, those maps reach an $R^2$ score of +66% w.r.t. a ground truth reference map, with an average error of only $\pm$10 +inhabitants/ha. Conveniently, POPCORN retrieves explicit maps of built-up areas +and of local building occupancy rates, making the mapping process interpretable +and offering additional insights, for instance about the distribution of +built-up, but unpopulated areas, e.g., industrial warehouses. Moreover, we find +that, once trained, the model can be applied repeatedly to track population +changes; and that it can be transferred to geographically similar regions +(e.g., from Uganda to Rwanda). With our work, we aim to democratize access to +up-to-date and high-resolution population maps, recognizing that some regions +faced with particularly strong population dynamics may lack the resources for +costly micro-census campaigns. + +
+
+ comment: 17 pages, 10 tables, 7 Figures +
+
+
+
+
+ + ☆ GRJointNET: Synergistic Completion and Part Segmentation on 3D + Incomplete Point Clouds + + +
+ Segmentation of three-dimensional (3D) point clouds is an important task for +autonomous systems. However, the success of segmentation algorithms depends greatly +on the quality of the underlying point clouds (resolution, completeness, etc.). +In particular, incomplete point clouds might reduce a downstream model's +performance. GRNet is proposed as a novel and recent deep learning solution to +complete point clouds, but it is not capable of part segmentation. On the other +hand, our proposed solution, GRJointNet, is an architecture that can perform +joint completion and segmentation on point clouds as a successor of GRNet. +Features extracted for the two tasks are also utilized by each other to +increase the overall performance. We evaluated our proposed network on the +ShapeNet-Part dataset and compared its performance to GRNet. Our results +demonstrate that GRJointNet can outperform GRNet on point completion. It should also +be noted that GRNet is not capable of segmentation while GRJointNet is. This +study, therefore, holds promise to enhance the practicality and utility of point +clouds in 3D vision for autonomous systems. + +
+
+
+
+
+ + ☆ EIGEN: Expert-Informed Joint Learning Aggregation for High-Fidelity + Information Extraction from Document Images + + +
+ Information Extraction (IE) from document images is challenging due to the +high variability of layout formats. Deep models such as LayoutLM and BROS have +been proposed to address this problem and have shown promising results. +However, they still require a large amount of field-level annotations for +training. Other approaches using rule-based methods have also been +proposed based on an understanding of the layout and semantics of a form, such +as the geometric position or type of the fields. In this work, we propose a +novel approach, EIGEN (Expert-Informed Joint Learning aGgrEatioN), which +combines rule-based methods with deep learning models using data programming +approaches to circumvent the requirement of annotating large amounts of +training data. Specifically, EIGEN consolidates weak labels induced from +multiple heuristics through generative models and uses them along with a small +number of annotated labels to jointly train a deep model. In our framework, we +propose the use of labeling functions that incorporate contextual +information, thus capturing the visual and language context of a word for +accurate categorization. We empirically show that our EIGEN framework can +significantly improve the performance of state-of-the-art deep models with the +availability of very few labeled data instances. The source code is available +at +https://github.com/ayushayush591/EIGEN-High-Fidelity-Extraction-Document-Images. + +
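A minimal sketch of the weak-labeling ingredient mentioned above: a few rule-based labeling functions whose votes are combined into a weak label per token. The rules, label names, and the simple majority vote are illustrative assumptions standing in for the generative label model described in the abstract.

```python
# Hedged sketch: combining rule-based labeling functions by majority vote to
# produce weak labels. Rules and labels are illustrative placeholders.
from collections import Counter

ABSTAIN = None

def lf_looks_like_date(token: str):
    return "DATE" if any(ch.isdigit() for ch in token) and "/" in token else ABSTAIN

def lf_numeric_amount(token: str):
    return "AMOUNT" if token.replace(".", "", 1).isdigit() else ABSTAIN

def lf_uppercase_header(token: str):
    return "HEADER" if token.isupper() and len(token) > 3 else ABSTAIN

LABELING_FUNCTIONS = [lf_looks_like_date, lf_numeric_amount, lf_uppercase_header]

def weak_label(token: str):
    votes = [lf(token) for lf in LABELING_FUNCTIONS]
    votes = [v for v in votes if v is not ABSTAIN]
    return Counter(votes).most_common(1)[0][0] if votes else ABSTAIN

for tok in ["INVOICE", "12/05/2023", "149.99", "widget"]:
    print(tok, "->", weak_label(tok))
```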
+
+ comment: In Proceedings of ML for Health Conference, 2023 (co-located with + Neurips) +
+
+
+
+
+ + ☆ FViT-Grasp: Grasping Objects With Using Fast Vision Transformers + + +
+ This study addresses the challenge of manipulation, a prominent issue in +robotics. We have devised a novel methodology for swiftly and precisely +identifying the optimal grasp point for a robot to manipulate an object. Our +approach leverages a Fast Vision Transformer (FViT), a type of neural network +designed for processing visual data and predicting the most suitable grasp +location. Demonstrating state-of-the-art performance in terms of speed while +maintaining a high level of accuracy, our method holds promise for potential +deployment in real-time robotic grasping applications. We believe that this +study provides a baseline for future research in vision-based robotic grasp +applications. Its high speed and accuracy bring researchers closer to real-life +applications. + +
+
+
+
+
+ + ☆ Low Latency Instance Segmentation by Continuous Clustering for Rotating + LiDAR Sensors + + +
+ Low-latency instance segmentation of LiDAR point clouds is crucial in +real-world applications because it serves as an initial and frequently-used +building block in a robot's perception pipeline, where every task adds further +delay. Particularly in dynamic environments, this total delay can result in +significant positional offsets of dynamic objects, as seen in highway +scenarios. To address this issue, we employ continuous clustering of obstacle +points in order to obtain an instance-segmented point cloud. Unlike most +existing approaches, which use a full revolution of the LiDAR sensor, we +process the data stream in a continuous and seamless fashion. More +specifically, each column of a range image is processed as soon as it is +available. Obstacle points are clustered to existing instances in real time, and +it is checked at high frequency which instances are completed and ready +to be published. An additional advantage is that no problematic discontinuities +between the points of the start and the end of a scan are observed. In this +work, we describe the two-layered data structure and the corresponding algorithm +for continuous clustering, which is able to cluster the incoming data in real +time. We explain the importance of a large perceptive field of view. +Furthermore, we describe and evaluate important architectural design choices, +which could be relevant for designing an architecture for deep-learning-based +low-latency instance segmentation. We are publishing the source code at +https://github.com/UniBwTAS/continuous_clustering. + +
+
+ comment: Accompanying Video: https://www.youtube.com/watch?v=DZKuAQBngNE +
+
+
+
+
+ + ☆ Deep Interactive Segmentation of Medical Images: A Systematic Review and + Taxonomy + + +
+ Interactive segmentation is a crucial research area in medical image analysis +aiming to boost the efficiency of costly annotations by incorporating human +feedback. This feedback takes the form of clicks, scribbles, or masks and +allows for iterative refinement of the model output so as to efficiently guide +the system towards the desired behavior. In recent years, deep learning-based +approaches have propelled results to a new level causing a rapid growth in the +field with 121 methods proposed in the medical imaging domain alone. In this +review, we provide a structured overview of this emerging field featuring a +comprehensive taxonomy, a systematic review of existing methods, and an +in-depth analysis of current practices. Based on these contributions, we +discuss the challenges and opportunities in the field. For instance, we find +that there is a severe lack of comparison across methods which needs to be +tackled by standardized baselines and benchmarks. + +
+
+ comment: 26 pages, 8 figures, 10 tables; Zdravko Marinov and Paul F. J\"ager + and co-first authors; This work has been submitted to the IEEE for possible + publication. Copyright may be transferred without notice, after which this + version may no longer be accessible +
+
+
+
+
+ + ☆ Investigating the use of publicly available natural videos to learn + Dynamic MR image reconstruction + + +
+ Purpose: To develop and assess a deep learning (DL) pipeline to learn dynamic +MR image reconstruction from publicly available natural videos (Inter4K). + Materials and Methods: Learning was performed for a range of DL architectures +(VarNet, 3D UNet, FastDVDNet) and corresponding sampling patterns (Cartesian, +radial, spiral) either from true multi-coil cardiac MR data (N=692) or from +pseudo-MR data simulated from Inter4K natural videos (N=692). Real-time +undersampled dynamic MR images were reconstructed using DL networks trained +with cardiac data and natural videos, and compressed sensing (CS). Differences +were assessed in simulations (N=104 datasets) in terms of MSE, PSNR, and SSIM +and prospectively for cardiac (short axis, four chambers, N=20) and speech +(N=10) data in terms of subjective image quality ranking, SNR and Edge +sharpness. Friedman Chi Square tests with post-hoc Nemenyi analysis were +performed to assess statistical significance. + Results: For all simulation metrics, DL networks trained with cardiac data +outperformed DL networks trained with natural videos, which outperformed CS +(p<0.05). However, in prospective experiments DL reconstructions using both +training datasets were ranked similarly (and higher than CS) and presented no +statistical differences in SNR and Edge Sharpness for most conditions. +Additionally, high SSIM was measured between the DL methods with cardiac data +and natural videos (SSIM>0.85). + Conclusion: The developed pipeline enabled learning dynamic MR reconstruction +from natural videos preserving DL reconstruction advantages such as high +quality fast and ultra-fast reconstructions while overcoming some limitations +(data scarcity or sharing). The natural video dataset, code and pre-trained +networks are made readily available on github. + Key Words: real-time; dynamic MRI; deep learning; image reconstruction; +machine learning; + +
+
+
+
+
+ + ☆ RankFeat\&RankWeight: Rank-1 Feature/Weight Removal for + Out-of-distribution Detection + + +
+ The task of out-of-distribution (OOD) detection is crucial for deploying +machine learning models in real-world settings. In this paper, we observe that +the singular value distributions of the in-distribution (ID) and OOD features +are quite different: the OOD feature matrix tends to have a larger dominant +singular value than the ID feature, and the class predictions of OOD samples +are largely determined by it. This observation motivates us to propose +\texttt{RankFeat}, a simple yet effective \emph{post hoc} approach for OOD +detection by removing the rank-1 matrix composed of the largest singular value +and the associated singular vectors from the high-level feature. +\texttt{RankFeat} achieves \emph{state-of-the-art} performance and reduces the +average false positive rate (FPR95) by 17.90\% compared with the previous best +method. The success of \texttt{RankFeat} motivates us to investigate whether a +similar phenomenon would exist in the parameter matrices of neural networks. We +thus propose \texttt{RankWeight}, which removes the rank-1 weight from the +parameter matrices of a single deep layer. Our \texttt{RankWeight} is also +\emph{post hoc} and only requires computing the rank-1 matrix once. As a +standalone approach, \texttt{RankWeight} has very competitive performance +against other methods across various backbones. Moreover, \texttt{RankWeight} +enjoys flexible compatibility with a wide range of OOD detection methods. The +combination of \texttt{RankWeight} and \texttt{RankFeat} sets a new +\emph{state-of-the-art}, achieving an FPR95 as low as 16.13\% on +the ImageNet-1k benchmark. Extensive ablation studies and comprehensive +theoretical analyses are presented to support the empirical results. + +
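For readers who want the gist of the rank-1 removal step, here is a minimal NumPy sketch of a RankFeat-style operation on a batch of high-level features; the function name, shapes, and the choice of downstream score are illustrative assumptions rather than the authors' implementation.

```python
import numpy as np

def rankfeat(features):
    """Subtract the rank-1 component (largest singular value) from (N, C) features."""
    u, s, vt = np.linalg.svd(features, full_matrices=False)
    return features - s[0] * np.outer(u[:, 0], vt[0])

# An OOD score (e.g., an energy or max-logit score) would then be computed
# on the classifier output obtained from the perturbed features.
```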
+
+ comment: submitted to T-PAMI +
+
+
+
+
+ + ☆ High-Order Tensor Recovery with A Tensor $U_1$ Norm + + +
+ Recently, numerous tensor SVD (t-SVD)-based tensor recovery methods have +emerged, showing promise in processing visual data. However, these methods +often suffer from performance degradation when confronted with high-order +tensor data exhibiting non-smooth changes, commonly observed in real-world +scenarios but ignored by the traditional t-SVD-based methods. Our objective in +this study is to provide an effective tensor recovery technique for handling +non-smooth changes in tensor data and to efficiently explore the correlations of +high-order tensor data across its various dimensions without introducing +numerous variables and weights. To this end, we introduce a new tensor +decomposition and a new tensor norm called the Tensor $U_1$ norm. We utilize +these novel techniques to solve the high-order tensor completion +problem and provide theoretical guarantees for the exact recovery of the +resulting tensor completion models. An optimization algorithm is proposed to +solve the resulting tensor completion model iteratively by combining the +proximal algorithm with the Alternating Direction Method of Multipliers. +Theoretical analysis showed the convergence of the algorithm to the +Karush-Kuhn-Tucker (KKT) point of the optimization problem. Numerical +experiments demonstrated the effectiveness of the proposed method in high-order +tensor completion, especially for tensor data with non-smooth changes. + +
+
+
+
+
+ + ☆ Electric Network Frequency Optical Sensing Devices + + +
+ Electric Network Frequency (ENF) acts as a fingerprint in multimedia +forensics applications. In indoor environments, ENF variations affect the +intensity of light sources connected to power mains. Accordingly, the light +intensity variations captured by sensing devices can be exploited to estimate +the ENF. A first optical sensing device based on a photodiode is developed for +capturing ENF variations in indoor lighting environments. In addition, a device +that captures the ENF directly from power mains is implemented. This device +serves as a ground truth ENF collector. Video recordings captured by a camera +are also employed to estimate the ENF. The camera serves as a second optical +sensor. The factors affecting the ENF estimation are thoroughly studied. The +maximum correlation coefficient between the ENF estimated by the two optical +sensors and that estimated directly from power mains is used to measure the +estimation accuracy. The paper's major contribution is in the disclosure of +extensive experimental evidence on ENF estimation in scenes ranging from static +ones capturing a white wall to non-static ones, including human activity. + +
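To make the estimation idea concrete, a simplified Python sketch of ENF tracking from a digitized light-intensity signal is shown below; it tracks the dominant spectral peak near the nominal mains frequency in each short-time window, whereas the paper's actual pipeline (sensor specifics, harmonic choice, interpolation) may differ. In practice, light intensity often flickers at twice the mains frequency, so the 100 Hz harmonic is commonly tracked instead.

```python
import numpy as np
from scipy.signal import stft

def estimate_enf(signal, fs, nominal=50.0, band=1.0, nperseg=None):
    """Return time stamps and per-window peak frequency near the nominal ENF."""
    nperseg = nperseg or int(fs * 2)              # ~2 s windows by default
    f, t, Z = stft(signal, fs=fs, nperseg=nperseg)
    mask = (f >= nominal - band) & (f <= nominal + band)
    peaks = f[mask][np.argmax(np.abs(Z[mask, :]), axis=0)]
    return t, peaks

# Example (hypothetical data): t, enf = estimate_enf(light_signal, fs=1000.0)
```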
+
+
+
+
+ + ☆ Robustness-Reinforced Knowledge Distillation with Correlation Distance + and Network Pruning + + +
+ The improvement in the performance of efficient and lightweight models (i.e., +the student model) is achieved through knowledge distillation (KD), which +involves transferring knowledge from more complex models (i.e., the teacher +model). However, most existing KD techniques rely on Kullback-Leibler (KL) +divergence, which has certain limitations. First, if the teacher distribution +has high entropy, the KL divergence's mode-averaging nature hinders the +transfer of sufficient target information. Second, when the teacher +distribution has low entropy, the KL divergence tends to excessively focus on +specific modes, which fails to convey an abundant amount of valuable knowledge +to the student. Consequently, when dealing with datasets that contain numerous +confounding or challenging samples, student models may struggle to acquire +sufficient knowledge, resulting in subpar performance. Furthermore, in previous +KD approaches, we observed that data augmentation, a technique aimed at +enhancing a model's generalization, can have an adverse impact. Therefore, we +propose a Robustness-Reinforced Knowledge Distillation (R2KD) that leverages +correlation distance and network pruning. This approach enables KD to +effectively incorporate data augmentation for performance improvement. +Extensive experiments on various datasets, including CIFAR-100, FGVR, +TinyImagenet, and ImageNet, demonstrate our method's superiority over current +state-of-the-art methods. + +
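Since the abstract does not spell out the loss, here is one plausible, hedged reading of a correlation-distance distillation term in PyTorch: one minus the Pearson correlation between the softened teacher and student distributions, averaged over the batch. The exact form used in R2KD may differ.

```python
import torch
import torch.nn.functional as F

def correlation_distance_kd(student_logits, teacher_logits, T=4.0):
    """Hypothetical correlation-distance KD loss between softened distributions."""
    p_s = F.softmax(student_logits / T, dim=1)
    p_t = F.softmax(teacher_logits / T, dim=1)
    p_s = p_s - p_s.mean(dim=1, keepdim=True)     # center per sample
    p_t = p_t - p_t.mean(dim=1, keepdim=True)
    corr = (p_s * p_t).sum(dim=1) / (p_s.norm(dim=1) * p_t.norm(dim=1) + 1e-8)
    return (1.0 - corr).mean()
```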
+
+ comment: 11 pages, 7 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ☆ Periodically Exchange Teacher-Student for Source-Free Object Detection ICCV 2023 + + +
+ Source-free object detection (SFOD) aims to adapt the source detector to +unlabeled target domain data in the absence of source domain data. Most SFOD +methods follow the same self-training paradigm using the mean-teacher (MT) +framework, where the student model is guided by a single teacher model. +However, such a paradigm can easily fall into a training instability problem: +when the teacher model collapses uncontrollably due to the domain shift, the +student model also suffers drastic performance degradation. To address this +issue, we propose the Periodically Exchange Teacher-Student (PETS) method, a +simple yet novel approach that introduces a multiple-teacher framework +consisting of a static teacher, a dynamic teacher, and a student model. During +the training phase, we periodically exchange the weights between the static +teacher and the student model. Then, we update the dynamic teacher using the +moving average of the student model that has already been exchanged with the +static teacher. In this way, the dynamic teacher can integrate knowledge from +past periods, effectively reducing error accumulation and enabling a more +stable training process within the MT-based framework. Further, we develop a +consensus mechanism to merge the predictions of the two teacher models to provide +higher-quality pseudo labels for the student model. Extensive experiments on +multiple SFOD benchmarks show that the proposed method achieves +state-of-the-art performance compared with other related methods, demonstrating +the effectiveness and superiority of our method on the SFOD task. + +
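The periodic exchange and the dynamic-teacher update described above can be summarized by the following PyTorch sketch; the schedule, momentum value, and function names are assumptions, not the authors' code.

```python
import torch

@torch.no_grad()
def periodic_exchange(static_teacher, student):
    """Swap the parameters of the static teacher and the student (once per period)."""
    for p_t, p_s in zip(static_teacher.parameters(), student.parameters()):
        tmp = p_t.data.clone()
        p_t.data.copy_(p_s.data)
        p_s.data.copy_(tmp)

@torch.no_grad()
def ema_update(dynamic_teacher, student, momentum=0.999):
    """Dynamic teacher follows the (exchanged) student by exponential moving average."""
    for p_d, p_s in zip(dynamic_teacher.parameters(), student.parameters()):
        p_d.data.mul_(momentum).add_(p_s.data, alpha=1.0 - momentum)

# Typical loop (hypothetical): call ema_update every iteration and
# periodic_exchange every K iterations, i.e., once per training period.
```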
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ☆ MetaFBP: Learning to Learn High-Order Predictor for Personalized Facial + Beauty Prediction ACM MM 2023 + + +
+ Predicting individual aesthetic preferences holds significant practical +applications and academic implications for human society. However, existing +studies mainly focus on learning and predicting the commonality of facial +attractiveness, with little attention given to Personalized Facial Beauty +Prediction (PFBP). PFBP aims to develop a machine that can adapt to individual +aesthetic preferences with only a few images rated by each user. In this paper, +we formulate this task from a meta-learning perspective that each user +corresponds to a meta-task. To address such PFBP task, we draw inspiration from +the human aesthetic mechanism that visual aesthetics in society follows a +Gaussian distribution, which motivates us to disentangle user preferences into +a commonality and an individuality part. To this end, we propose a novel +MetaFBP framework, in which we devise a universal feature extractor to capture +the aesthetic commonality and then optimize to adapt the aesthetic +individuality by shifting the decision boundary of the predictor via a +meta-learning mechanism. Unlike conventional meta-learning methods that may +struggle with slow adaptation or overfitting to tiny support sets, we propose a +novel approach that optimizes a high-order predictor for fast adaptation. In +order to validate the performance of the proposed method, we build several PFBP +benchmarks by using existing facial beauty prediction datasets rated by +numerous users. Extensive experiments on these benchmarks demonstrate the +effectiveness of the proposed MetaFBP method. + +
+
+ comment: Accepted by ACM MM 2023. Source code: + https://github.com/MetaVisionLab/MetaFBP +
+
+
+
+
+ + ☆ Parameter Exchange for Robust Dynamic Domain Generalization ACM MM 2023 + + +
+ Agnostic domain shift is the main reason for model degradation on the unknown +target domains, which brings an urgent need to develop Domain Generalization +(DG). Recent advances in DG use dynamic networks to achieve training-free +adaptation on the unknown target domains, termed Dynamic Domain Generalization +(DDG), which compensates for the lack of self-adaptability in static models +with fixed weights. The parameters of dynamic networks can be decoupled into a +static and a dynamic component, which are designed to learn domain-invariant +and domain-specific features, respectively. Building on prior art, in this +work, we try to push the limits of DDG by disentangling the static and dynamic +components more thoroughly from an optimization perspective. Our main +consideration is that we can enable the static component to learn +domain-invariant features more comprehensively by augmenting the +domain-specific information. As a result, the more comprehensive +domain-invariant features learned by the static component can then force the +dynamic component to focus more on learning adaptive domain-specific features. +To this end, we propose a simple yet effective Parameter Exchange (PE) method +to perturb the combination between the static and dynamic components. We +optimize the model using the gradients from both the perturbed and +non-perturbed feed-forward passes jointly to implicitly achieve the aforementioned +disentanglement. In this way, the two components can be optimized in a +mutually-beneficial manner, which can resist the agnostic domain shifts and +improve the self-adaptability on the unknown target domain. Extensive +experiments show that PE can be easily plugged into existing dynamic networks +to improve their generalization ability without bells and whistles. + +
+
+ comment: Accepted by ACM MM 2023. Source code: + https://github.com/MetaVisionLab/PE +
+
+
+
+
+ + ☆ Predicting Recovery or Decease of COVID-19 Patients with Clinical and + RT-PCR Using Machine Learning Classification Algorithms + + +
+ The COVID-19 pandemic has disrupted the global economy and people's daily +lives in unprecedented ways. To make appropriate decisions, it is necessary to +diagnose COVID-19 rapidly and accurately. Clinical decision making is +influenced by data collected from patients. With the aid of artificial +intelligence, COVID-19 has been diagnosed quickly by analyzing symptoms, +polymerase chain reaction (PCR), computed tomography scans, chest X-rays, +routine laboratory blood tests and even cough sounds. Furthermore, these data +can be used to predict a patient's mortality, although there is a question about +which data makes the most accurate predictions. Therefore, this study consists +of two parts. Our first objective is to examine whether machine learning +algorithms can predict the outcome of COVID-19 cases (recovery or death), based +on the features present in the dataset. In the second part of the research, we +investigated the impact of clinical and RT-PCR data on the prediction of recovery and +decease to determine which one is more reliable. We defined four stages with +different feature sets and used six machine learning methods to build prediction +models. With an accuracy of 78.7%, random forest showed promising results for +predicting death and recovery of patients. Based on this, it appears that +recovery and decease of patients are predictable using machine learning. For the +second objective, results indicate that clinical data alone (without RT-PCR), +trained with the AdaBoost algorithm, is the most accurate, with an accuracy of +82.1%. This study can provide guidance for medical professionals in the event +of a crisis or outbreak similar to COVID-19. + +
+
+
+
+
+ + ☆ Expanding the deep-learning model to diagnosis LVNC: Limitations and + trade-offs + + +
+ Hyper-trabeculation or non-compaction in the left ventricle of the myocardium +(LVNC) is a recently classified form of cardiomyopathy. Several methods have +been proposed to quantify the trabeculae accurately in the left ventricle, but +there is no general agreement in the medical community on a particular +approach. In previous work, we proposed DL-LVTQ, a deep learning approach for +left ventricular trabecular quantification based on a U-Net CNN architecture. +DL-LVTQ was an automatic diagnosis tool developed from a dataset of patients +with the same cardiomyopathy (hypertrophic cardiomyopathy). + In this work, we have extended and adapted DL-LVTQ to cope with patients with +different cardiomyopathies. The dataset consists of up to 379 patients in three +groups with different particularities and cardiomyopathies. Patient images were +taken from different scanners and hospitals. We have modified and adapted the +U-Net convolutional neural network to account for the different particularities +of a heterogeneous group of patients with various unclassifiable or mixed and +inherited cardiomyopathies. + The inclusion of new groups of patients has increased the accuracy, +specificity and kappa values while maintaining the sensitivity of the automatic +deep learning method proposed. Therefore, a better-prepared diagnosis tool is +ready for various cardiomyopathies with different characteristics. +Cardiologists have considered that 98.9% of the evaluated outputs are verified +clinically for diagnosis. Therefore, the high precision in segmenting the +different cardiac structures allows us to build a diagnostic system that is robust, +objective and faster, decreasing human error and time spent. + +
+
+
+
+
+ + ☆ Query by Activity Video in the Wild ICIP 2023 + + +
+ This paper focuses on activity retrieval from a video query in an imbalanced +scenario. In current query-by-activity-video literature, a common assumption is +that all activities have sufficient labelled examples when learning an +embedding. In practice, however, this assumption does not hold, as only a portion +of activities have many examples, while other activities are described by only a +few examples. In this paper, we propose a visual-semantic embedding network +that explicitly deals with the imbalanced scenario for activity retrieval. Our +network contains two novel modules. The visual alignment module performs a +global alignment between the input video and fixed-sized visual bank +representations for all activities. The semantic module performs an alignment +between the input video and fixed-sized semantic activity representations. By +matching videos with both visual and semantic activity representations that are +of equal size over all activities, we no longer ignore infrequent activities +during retrieval. Experiments on a new imbalanced activity retrieval benchmark +show the effectiveness of our approach for all types of activities. + +
+
+ comment: An extended version of ICIP 2023 +
+
+
+
+
+ + ☆ PointPCA+: Extending PointPCA objective quality assessment metric ICIP 2023 + + +
+ A computationally-simplified and descriptor-richer Point Cloud Quality +Assessment (PCQA) metric, namely PointPCA+, is proposed in this paper, which is +an extension of PointPCA. PointPCA proposed a set of perceptually-relevant +descriptors based on PCA decomposition that were applied to both the geometry +and texture data of point clouds for full reference PCQA. PointPCA+ employs PCA +only on the geometry data while enriching existing geometry and texture +descriptors, that are computed more efficiently. Similarly to PointPCA, a total +quality score is obtained through a learning-based fusion of individual +predictions from geometry and texture descriptors that capture local shape and +appearance properties, respectively. Before feature fusion, a feature selection +module is introduced to choose the most effective features from a proposed +super-set. Experimental results show that PointPCA+ achieves high predictive +performance against subjective ground truth scores obtained from publicly +available datasets. The code is available at +\url{https://github.com/cwi-dis/pointpca_suite/}. + +
+
+ comment: ICIP 2023 +
+
+
+
+
+ + ☆ Language-guided Few-shot Semantic Segmentation ICASSP2024 + + +
+ Few-shot learning is a promising way to reduce the labeling cost when adapting to new +categories with the guidance of a small, well-labeled support set. +But for few-shot semantic segmentation, the pixel-level annotations of support +images are still expensive. In this paper, we propose an innovative solution to +tackle the challenge of few-shot semantic segmentation using only language +information, i.e., image-level text labels. Our approach involves a +vision-language-driven mask distillation scheme, which contains a +vision-language pretraining (VLP) model and a mask refiner, to generate high-quality +pseudo-semantic masks from text prompts. We additionally introduce a +distributed prototype supervision method and complementary correlation matching +module to guide the model in mining precise semantic relations among support +and query images. The experiments on two benchmark datasets demonstrate that +our method establishes a new baseline for language-guided few-shot semantic +segmentation and achieves results competitive with recent vision-guided methods. + +
+
+ comment: Expanded version for a pending ICASSP2024 submission +
+
+
+
+
+ + ☆ Perceptual Image Compression with Cooperative Cross-Modal Side + Information + + +
+ The explosion of data has resulted in more and more associated text being +transmitted along with images. Inspired by distributed source coding, many +works utilize image side information to enhance image compression. However, +existing methods generally do not consider using text as side information to +enhance perceptual compression of images, even though the benefits of +multimodal synergy have been widely demonstrated in research. This raises the +following question: How can we effectively transfer text-level semantic +dependencies, which are only available to the decoder, to help image compression? +In this work, we propose a novel deep image compression method with text-guided +side information to achieve a better rate-perception-distortion tradeoff. +Specifically, we employ the CLIP text encoder and an effective Semantic-Spatial +Aware block to fuse the text and image features. This is done by predicting a +semantic mask to guide the learned text-adaptive affine transformation at the +pixel level. Furthermore, we design a text-conditional generative adversarial +network to improve the perceptual quality of reconstructed images. Extensive +experiments involving four datasets and ten image quality assessment metrics +demonstrate that the proposed approach achieves superior results in terms of +rate-perception trade-off and semantic distortion. + +
+
+
+
+
+ + ☆ Progressive Learning with Visual Prompt Tuning for Variable-Rate Image + Compression + + +
+ In this paper, we propose a progressive learning paradigm for +transformer-based variable-rate image compression. Our approach covers a wide +range of compression rates with the assistance of the Layer-adaptive Prompt +Module (LPM). Inspired by visual prompt tuning, we use LPM to extract prompts +for input images and hidden features at the encoder side and decoder side, +respectively, which are fed as additional information into the Swin Transformer +layer of a pre-trained transformer-based image compression model to affect the +allocation of attention regions and bits, which in turn changes the target +compression ratio of the model. To keep the network lightweight, we +integrate prompt networks with fewer convolutional layers. +Exhaustive experiments show that compared to methods based on multiple models, +which are optimized separately for different target rates, the proposed method +arrives at the same performance with 80% savings in parameter storage and 90% +savings in datasets. Meanwhile, our model outperforms all current variable +bitrate image compression methods in terms of rate-distortion performance and approaches +the state-of-the-art fixed bitrate image compression methods trained from +scratch. + +
+
+
+
+
+ + ☆ Lego: Learning to Disentangle and Invert Concepts Beyond Object + Appearance in Text-to-Image Diffusion Models + + +
+ Diffusion models have revolutionized generative content creation and +text-to-image (T2I) diffusion models in particular have increased the creative +freedom of users by allowing scene synthesis using natural language. T2I models +excel at synthesizing concepts such as nouns, appearances, and styles. To +enable customized content creation based on a few example images of a concept, +methods such as Textual Inversion and DreamBooth invert the desired concept and +enable synthesizing it in new scenes. However, inverting more general concepts +that go beyond object appearance and style (adjectives and verbs) through +natural language, remains a challenge. Two key characteristics of these +concepts contribute to the limitations of current inversion methods. 1) +Adjectives and verbs are entangled with nouns (subject) and can hinder +appearance-based inversion methods, where the subject appearance leaks into the +concept embedding and 2) describing such concepts often extends beyond single +word embeddings (being frozen in ice, walking on a tightrope, etc.) that +current methods do not handle. + In this study, we introduce Lego, a textual inversion method designed to +invert subject entangled concepts from a few example images. Lego disentangles +concepts from their associated subjects using a simple yet effective Subject +Separation step and employs a Context Loss that guides the inversion of +single/multi-embedding concepts. In a thorough user study, Lego-generated +concepts were preferred over 70% of the time when compared to the baseline. +Additionally, visual question answering using a large language model suggested +Lego-generated concepts are better aligned with the text description of the +concept. + +
+
+
+
+
+ + ♻ ☆ Automatically Score Tissue Images Like a Pathologist by Transfer + Learning + + +
+ Cancer is the second leading cause of death in the world. Diagnosing cancer +early on can save many lives. Pathologists have to look at tissue microarray +(TMA) images manually to identify tumors, which can be time-consuming, +inconsistent and subjective. Existing automatic algorithms either have not +achieved the accuracy level of a pathologist or require substantial human +involvement. A major challenge is that TMA images with different shapes, +sizes, and locations can have the same score. Learning staining patterns in TMA +images requires a huge number of images, which are severely limited due to +privacy and regulation concerns in medical organizations. TMA images from +different cancer types may share certain common characteristics, but combining +them directly harms the accuracy due to heterogeneity in their staining +patterns. Transfer learning is an emerging learning paradigm that allows +borrowing strength from similar problems. However, existing approaches +typically require a large sample from similar learning problems, while TMA +images of different cancer types are often available only in small sample sizes; furthermore, +existing algorithms are limited to transfer learning from one similar +problem. We propose a new transfer learning algorithm that could learn from +multiple related problems, where each problem has a small sample and can have a +substantially different distribution from the original one. The proposed +algorithm has made it possible to break the critical accuracy barrier (the 75% +accuracy level of pathologists), with a reported accuracy of 75.9% on breast +cancer TMA images from the Stanford Tissue Microarray Database. It is supported +by recent developments in transfer learning theory and empirical evidence in +clustering technology. This will allow pathologists to confidently adopt +automatic algorithms in recognizing tumors consistently with a higher accuracy +in real time. + +
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Are "Hierarchical" Visual Representations Hierarchical? + + +
+ Learned visual representations often capture large amounts of semantic +information for accurate downstream applications. Human understanding of the +world is fundamentally grounded in hierarchy. To mimic this and further improve +representation capabilities, the community has explored "hierarchical" visual +representations that aim at modeling the underlying hierarchy of the visual +world. In this work, we set out to investigate if hierarchical visual +representations truly capture the human perceived hierarchy better than +standard learned representations. To this end, we create HierNet, a suite of 12 +datasets spanning 3 kinds of hierarchy from the BREEDs subset of ImageNet. +After extensive evaluation of Hyperbolic and Matryoshka Representations across +training setups, we conclude that they do not capture hierarchy any better than +the standard representations but can assist in other aspects like search +efficiency and interpretability. Our benchmark and the datasets are +open-sourced at https://github.com/ethanlshen/HierNet. + +
+
+
+
+
+ + ♻ ☆ A Recent Survey of the Advancements in Deep Learning Techniques for + Monkeypox Disease Detection + + +
+ Monkeypox (MPox) is a zoonotic infectious disease caused by the MPox virus, +part of the poxviridae orthopoxvirus group; it was initially discovered in Africa and +gained global attention in mid-2022 when cases were reported outside endemic areas. +Symptoms include headaches, chills, fever, and smallpox-, measles-, and +chickenpox-like skin manifestations, and the WHO officially declared MPox a +global public health emergency in July 2022. Traditionally, PCR testing of skin +lesions is considered a benchmark for the primary diagnosis by WHO, with +symptom management as the primary treatment and antiviral drugs like +tecovirimat for severe cases. However, manual analysis within hospitals poses +substantial challenges, including the burden on healthcare +professionals, limited facilities and availability, fatigue among doctors, and +human error during public health emergencies. Therefore, this survey paper +provides an extensive and efficient analysis of deep learning (DL) methods for +the automatic detection of MPox in skin lesion images. These DL techniques are +broadly grouped into categories including deep CNNs, deep CNN ensembles, deep +hybrid learning, newly developed approaches, and Vision Transformers for diagnosing +MPox. Moreover, this study offers a systematic exploration of the evolutionary +progression of DL techniques and identifies and addresses limitations of +previous methods while highlighting their valuable contributions and innovations. +Additionally, the paper addresses benchmark datasets and their collection from +various authentic sources, pre-processing techniques, and evaluation metrics. +The survey also briefly delves into emerging concepts, identifies research +gaps, limitations, and applications, and outlines challenges in the diagnosis +process. This survey furnishes valuable insights into prospective areas for +innovative DL ideas and is anticipated to serve as a guide for researchers. + +
+
+ comment: 53 pages, 16 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ Transfer Learning-based Real-time Handgun Detection + + +
+ Traditional surveillance systems rely on human attention, limiting their +effectiveness. This study employs convolutional neural networks and transfer +learning to develop a real-time computer vision system for automatic handgun +detection. Comprehensive analysis of online handgun detection methods is +conducted, emphasizing reducing false positives and learning time. Transfer +learning is demonstrated as an effective approach. Despite technical +challenges, the proposed system achieves a precision rate of 84.74%, +demonstrating promising performance comparable to related works, enabling +faster learning and accurate automatic handgun detection for enhanced security. +This research advances security measures by reducing human monitoring +dependence, showcasing the potential of transfer learning-based approaches for +efficient and reliable handgun detection. + +
+
+ comment: 16 pages, 9 figures, and 3 tables. Accepted at The Iraqi Journal of + Science, issued by College of Science at University of Baghdad +
+
+
+
+
+ + ♻ ☆ DRIFu: Differentiable Rendering and Implicit Function-based Single-View + 3D Reconstruction + + +
+ The Differentiable Rendering and Implicit Function-based model (DRIFu) draws +its roots from the Pixel-aligned Implicit Function (PIFU), a pioneering 3D +digitization technique initially designed for clothed human bodies. PIFU excels +in capturing nuanced body shape variations within a low-dimensional space and +has been extensively trained on human 3D scans. However, the application of +PIFU to live animals poses significant challenges, primarily due to the +inherent difficulty in obtaining the cooperation of animals for 3D scanning. In +response to this challenge, we introduce the DRIFu model, specifically tailored +for animal digitization. To train DRIFu, we employ a curated set of synthetic +3D animal models, encompassing diverse shapes, sizes, and even accounting for +variations such as baby birds. Our innovative alignment tools play a pivotal +role in mapping these diverse synthetic animal models onto a unified template, +facilitating precise predictions of animal shape and texture. Crucially, our +template alignment strategy establishes a shared shape space, allowing for the +seamless sampling of new animal shapes, posing them realistically, animating +them, and aligning them with real-world data. This groundbreaking approach +revolutionizes our capacity to comprehensively understand and represent avian +forms. For further details and access to the project, the project website can +be found at https://github.com/kuangzijian/drifu-for-animals + +
+
+ comment: arXiv admin note: text overlap with arXiv:1905.05172 by other authors +
+
+
+
+
+ + ♻ ☆ PF-LRM: Pose-Free Large Reconstruction Model for Joint Pose and Shape + Prediction + + +
+ We propose a Pose-Free Large Reconstruction Model (PF-LRM) for reconstructing +a 3D object from a few unposed images even with little visual overlap, while +simultaneously estimating the relative camera poses in ~1.3 seconds on a single +A100 GPU. PF-LRM is a highly scalable method utilizing the self-attention +blocks to exchange information between 3D object tokens and 2D image tokens; we +predict a coarse point cloud for each view, and then use a differentiable +Perspective-n-Point (PnP) solver to obtain camera poses. When trained on a huge +amount of multi-view posed data of ~1M objects, PF-LRM shows strong +cross-dataset generalization ability, and outperforms baseline methods by a +large margin in terms of pose prediction accuracy and 3D reconstruction quality +on various unseen evaluation datasets. We also demonstrate our model's +applicability in downstream text/image-to-3D task with fast feed-forward +inference. Our project website is at: https://totoro97.github.io/pf-lrm . + +
+
+ comment: Project website: https://totoro97.github.io/pf-lrm ; add more + experiments +
+
+
+
+
+ + ♻ ☆ Pick-a-Pic: An Open Dataset of User Preferences for Text-to-Image + Generation + + +
+ The ability to collect a large dataset of human preferences from +text-to-image users is usually limited to companies, making such datasets +inaccessible to the public. To address this issue, we create a web app that +enables text-to-image users to generate images and specify their preferences. +Using this web app we build Pick-a-Pic, a large, open dataset of text-to-image +prompts and real users' preferences over generated images. We leverage this +dataset to train a CLIP-based scoring function, PickScore, which exhibits +superhuman performance on the task of predicting human preferences. Then, we +test PickScore's ability to perform model evaluation and observe that it +correlates better with human rankings than other automatic evaluation metrics. +Therefore, we recommend using PickScore for evaluating future text-to-image +generation models, and using Pick-a-Pic prompts as a more relevant dataset than +MS-COCO. Finally, we demonstrate how PickScore can enhance existing +text-to-image models via ranking. + +
+
+
+
+
+ + ♻ ☆ Evaluating Object (mis)Detection from a Safety and Reliability + Perspective: Discussion and Measures + + +
+ We argue that object detectors in the safety critical domain should +prioritize detection of objects that are most likely to interfere with the +actions of the autonomous actor. Especially, this applies to objects that can +impact the actor's safety and reliability. To quantify the impact of object +(mis)detection on safety and reliability in the context of autonomous driving, +we propose new object detection measures that reward the correct identification +of objects that are most dangerous and most likely to affect driving decisions. +To achieve this, we build an object criticality model to reward the detection +of the objects based on proximity, orientation, and relative velocity with +respect to the subject vehicle. Then, we apply our model on the recent +autonomous driving dataset nuScenes, and we compare nine object detectors. +Results show that, in several settings, object detectors that perform best +according to the nuScenes ranking are not the preferable ones when the focus is +shifted on safety and reliability. + +
+
+ comment: journal version, open access +
+
+
+
+
+ + ♻ ☆ Adaptive Self-Training for Object Detection + + +
+ Deep learning has emerged as an effective solution for solving the task of +object detection in images but at the cost of requiring large labeled datasets. +To mitigate this cost, semi-supervised object detection methods, which consist +of leveraging abundant unlabeled data, have been proposed and have already +shown impressive results. However, most of these methods require linking a +pseudo-label to a ground-truth object by thresholding. In previous works, this +threshold value is usually determined empirically, which is time-consuming, and +only done for a single data distribution. When the domain, and thus the data +distribution, changes, a new and costly parameter search is necessary. In this +work, we introduce our method Adaptive Self-Training for Object Detection +(ASTOD), which is a simple yet effective teacher-student method. ASTOD +determines, at no cost, a threshold value based directly on the ground value of +the score histogram. To improve the quality of the teacher predictions, we also +propose a novel pseudo-labeling procedure. We use different views of the +unlabeled images during the pseudo-labeling step to reduce the number of missed +predictions and thus obtain better candidate labels. Our teacher and our +student are trained separately, and our method can be used in an iterative +fashion by replacing the teacher by the student. On the MS-COCO dataset, our +method consistently performs favorably against state-of-the-art methods that do +not require a threshold parameter, and shows competitive results with methods +that require a parameter sweep search. Additional experiments with respect to a +supervised baseline on the DIOR dataset containing satellite images lead to +similar conclusions, and prove that it is possible to adapt the score threshold +automatically in self-training, regardless of the data distribution. The code +is available at https://github.com/rvandeghen/ASTOD + +
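As a hedged illustration of how a threshold could be read directly off a confidence-score histogram (one possible reading of the abstract's "ground value"), consider the small NumPy sketch below; the bin count and search range are arbitrary choices, not the paper's.

```python
import numpy as np

def histogram_threshold(scores, bins=100, lo=0.1, hi=0.9):
    """Pick the pseudo-label threshold at the valley of the score histogram."""
    counts, edges = np.histogram(scores, bins=bins, range=(0.0, 1.0))
    centers = 0.5 * (edges[:-1] + edges[1:])
    valid = (centers >= lo) & (centers <= hi)      # ignore the extreme bins
    return centers[valid][np.argmin(counts[valid])]
```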
+
+ comment: 10 pages, 4 figures, 5 tables, 1 page of supplementary material +
+
+
+
+
+ + ♻ ☆ Mobile-Seed: Joint Semantic Segmentation and Boundary Detection for + Mobile Robots 3DV + + +
+ Precise and rapid delineation of sharp boundaries and robust semantics is +essential for numerous downstream robotic tasks, such as robot grasping and +manipulation, real-time semantic mapping, and online sensor calibration +performed on edge computing units. Although boundary detection and semantic +segmentation are complementary tasks, most studies focus on lightweight models +for semantic segmentation but overlook the critical role of boundary detection. +In this work, we introduce Mobile-Seed, a lightweight, dual-task framework +tailored for simultaneous semantic segmentation and boundary detection. Our +framework features a two-stream encoder, an active fusion decoder (AFD) and a +dual-task regularization approach. The encoder is divided into two pathways: +one captures category-aware semantic information, while the other discerns +boundaries from multi-scale features. The AFD module dynamically adapts the +fusion of semantic and boundary information by learning channel-wise +relationships, allowing for precise weight assignment of each channel. +Furthermore, we introduce a regularization loss to mitigate the conflicts in +dual-task learning and deep diversity supervision. Compared to existing +methods, the proposed Mobile-Seed offers a lightweight framework to +simultaneously improve semantic segmentation performance and accurately locate +object boundaries. Experiments on the Cityscapes dataset have shown that +Mobile-Seed achieves notable improvement over the state-of-the-art (SOTA) +baseline by 2.2 percentage points (pp) in mIoU and 4.2 pp in mF-score, while +maintaining an online inference speed of 23.9 frames-per-second (FPS) with +1024x2048 resolution input on an RTX 2080 Ti GPU. Additional experiments on +CamVid and PASCAL Context datasets confirm our method's generalizability. Code +and additional results are publicly available at +https://whu-usi3dv.github.io/Mobile-Seed/. + +
+
+ comment: 8 pages, IEEE conference/letter under review. Code and additional + results are available at: https://github.com/WHU-USI3DV/Mobile-Seed +
+
+
+
+
+ + ♻ ☆ XTransCT: Ultra-Fast Volumetric CT Reconstruction using Two Orthogonal + X-Ray Projections for Image-guided Radiation Therapy via a Transformer + Network + + +
+ Computed tomography (CT) scans offer a detailed, three-dimensional +representation of patients' internal organs. However, conventional CT +reconstruction techniques necessitate acquiring hundreds or thousands of x-ray +projections through a complete rotational scan of the body, making navigation +or positioning during surgery infeasible. In image-guided radiation therapy, a +method that reconstructs ultra-sparse X-ray projections into CT images, we can +exploit the substantially reduced radiation dose and minimize equipment burden +for localization and navigation. In this study, we introduce a novel +Transformer architecture, termed XTransCT, devised to facilitate real-time +reconstruction of CT images from two-dimensional X-ray images. We assess our +approach regarding image quality and structural reliability using a dataset of +fifty patients, supplied by a hospital, as well as the larger public dataset +LIDC-IDRI, which encompasses thousands of patients. Additionally, we validated +our algorithm's generalizability on the LNDb dataset. Our findings indicate +that our algorithm surpasses other methods in image quality, structural +precision, and generalizability. Moreover, in comparison to previous 3D +convolution-based approaches, we note a substantial speed increase of +approximately 300 %, achieving 44 ms per 3D image reconstruction. + +
+
+
+
+
+ + ♻ ☆ EDDense-Net: Fully Dense Encoder Decoder Network for Joint Segmentation + of Optic Cup and Disc + + +
+ Glaucoma is an eye disease that causes damage to the optic nerve, which can +lead to visual loss and permanent blindness. Early glaucoma detection is +therefore critical in order to avoid permanent blindness. The estimation of the +cup-to-disc ratio (CDR) during an examination of the optic disc (OD) is used +for the diagnosis of glaucoma. In this paper, we present the EDDense-Net +segmentation network for the joint segmentation of the optic cup (OC) and OD. The encoder and +decoder in this network are made up of dense blocks with a grouped +convolutional layer in each block, allowing the network to acquire and convey +spatial information from the image while simultaneously reducing the network's +complexity. To reduce spatial information loss, the optimal number of filters +in all convolution layers was utilised. In semantic segmentation, dice pixel +classification is employed in the decoder to alleviate the problem of class +imbalance. The proposed network was evaluated on two publicly available +datasets where it outperformed existing state-of-the-art methods in terms of +accuracy and efficiency. For the diagnosis and analysis of glaucoma, this +method can be used as a second opinion system to assist medical +ophthalmologists. + +
+
+
+
+
+ + ♻ ☆ Zero-shot Visual Relation Detection via Composite Visual Cues from Large + Language Models + + +
+ Pretrained vision-language models, such as CLIP, have demonstrated strong +generalization capabilities, making them promising tools in the realm of +zero-shot visual recognition. Visual relation detection (VRD) is a typical task +that identifies relationship (or interaction) types between object pairs within +an image. However, naively utilizing CLIP with prevalent class-based prompts +for zero-shot VRD has several weaknesses, e.g., it struggles to distinguish +between different fine-grained relation types and it neglects essential spatial +information of two objects. To this end, we propose a novel method for +zero-shot VRD: RECODE, which solves RElation detection via COmposite +DEscription prompts. Specifically, RECODE first decomposes each predicate +category into subject, object, and spatial components. Then, it leverages large +language models (LLMs) to generate description-based prompts (or visual cues) +for each component. Different visual cues enhance the discriminability of +similar relation categories from different perspectives, which significantly +boosts performance in VRD. To dynamically fuse different cues, we further +introduce a chain-of-thought method that prompts LLMs to generate reasonable +weights for different visual cues. Extensive experiments on four VRD benchmarks +have demonstrated the effectiveness and interpretability of RECODE. + +
+
+
+
+
+ + ♻ ☆ 3D helical CT Reconstruction with a Memory Efficient Learned Primal-Dual + Architecture + + +
+ Deep learning based computed tomography (CT) reconstruction has demonstrated +outstanding performance on simulated 2D low-dose CT data. This applies in +particular to domain adapted neural networks, which incorporate a handcrafted +physics model for CT imaging. Empirical evidence shows that employing such +architectures reduces the demand for training data and improves upon +generalisation. However, their training requires large computational resources +that quickly become prohibitive in 3D helical CT, which is the most common +acquisition geometry used for medical imaging. Furthermore, clinical data also +comes with other challenges not accounted for in simulations, like errors in +flux measurement, resolution mismatch and, most importantly, the absence of the +real ground truth. The necessity to have a computationally feasible training +combined with the need to address these issues has made it difficult to +evaluate deep learning based reconstruction on clinical 3D helical CT. This +paper modifies a domain adapted neural network architecture, the Learned +Primal-Dual (LPD), so that it can be trained and applied to reconstruction in +this setting. We achieve this by splitting the helical trajectory into sections +and applying the unrolled LPD iterations to those sections sequentially. To the +best of our knowledge, this work is the first to apply an unrolled deep +learning architecture for reconstruction on full-sized clinical data, like +those in the Low dose CT image and projection data set (LDCT). Moreover, +training and testing is done on a single GPU card with 24GB of memory. + +
+
+
+
+
+ + ♻ ☆ Divide and Conquer: 3D Point Cloud Instance Segmentation With Point-Wise + Binarization + + +
+ Instance segmentation on point clouds is crucially important for 3D scene +understanding. Most SOTAs adopt distance clustering, which is typically +effective but does not perform well in segmenting adjacent objects with the +same semantic label (especially when they share neighboring points). Due to the +uneven distribution of offset points, these existing methods can hardly cluster +all instance points. To this end, we design a novel divide-and-conquer strategy +named PBNet that binarizes each point and clusters them separately to segment +instances. Our binary clustering divides offset instance points into two +categories: high and low density points (HPs vs. LPs). Adjacent objects can be +clearly separated by removing LPs, and then be completed and refined by +assigning LPs via a neighbor voting method. To suppress potential +over-segmentation, we propose to construct local scenes with the weight mask +for each instance. As a plug-in, the proposed binary clustering can replace +traditional distance clustering and lead to consistent performance gains on +many mainstream baselines. A series of experiments on ScanNetV2 and S3DIS +datasets indicate the superiority of our model. In particular, PBNet ranks +first on the ScanNetV2 official benchmark challenge, achieving the highest mAP. +Code will be available publicly at https://github.com/weiguangzhao/PBNet. + +
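A simplified, scikit-learn-based sketch of the binary-clustering idea (split points by local density, cluster the high-density points, then assign low-density points by neighbour voting) is given below; PBNet's actual network-predicted offsets, thresholds, and local-scene refinement are not reproduced, and all names and parameters here are illustrative.

```python
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

def binary_clustering(points, k=8, eps=0.3, q=0.5):
    """points: (N, 3) offset-shifted instance points (assumed non-degenerate)."""
    nn_all = NearestNeighbors(n_neighbors=k + 1).fit(points)
    dists, _ = nn_all.kneighbors(points)
    density = 1.0 / (dists[:, 1:].mean(axis=1) + 1e-8)
    hp = density >= np.quantile(density, q)          # high-density points (HPs)
    labels = np.full(len(points), -1, dtype=int)
    labels[hp] = DBSCAN(eps=eps, min_samples=3).fit_predict(points[hp])
    # Assign each low-density point (LP) by voting among its HP neighbours.
    nn_hp = NearestNeighbors(n_neighbors=min(k, int(hp.sum()))).fit(points[hp])
    _, neigh = nn_hp.kneighbors(points[~hp])
    hp_idx = np.where(hp)[0]
    for i, nbrs in zip(np.where(~hp)[0], neigh):
        votes = labels[hp_idx[nbrs]]
        votes = votes[votes >= 0]
        if votes.size:
            labels[i] = np.bincount(votes).argmax()
    return labels
```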
+
+
+
+
+ + ♻ ☆ Using Human Feedback to Fine-tune Diffusion Models without Any Reward + Model + + +
+ Using reinforcement learning with human feedback (RLHF) has shown significant +promise in fine-tuning diffusion models. Previous methods start by training a +reward model that aligns with human preferences, then leverage RL techniques to +fine-tune the underlying models. However, crafting an efficient reward model +demands extensive datasets, optimal architecture, and manual hyperparameter +tuning, making the process both time and cost-intensive. The direct preference +optimization (DPO) method, effective in fine-tuning large language models, +eliminates the necessity for a reward model. However, the extensive GPU memory +requirement of the diffusion model's denoising process hinders the direct +application of the DPO method. To address this issue, we introduce the Direct +Preference for Denoising Diffusion Policy Optimization (D3PO) method to +directly fine-tune diffusion models. The theoretical analysis demonstrates that +although D3PO omits training a reward model, it effectively functions as the +optimal reward model trained using human feedback data to guide the learning +process. This approach requires no training of a reward model, proving to be +more direct, cost-effective, and minimizing computational overhead. In +experiments, our method uses the relative scale of objectives as a proxy for +human preference, delivering comparable results to methods using ground-truth +rewards. Moreover, D3PO demonstrates the ability to reduce image distortion +rates and generate safer images, overcoming challenges lacking robust reward +models. Our code is publicly available in +https://github.com/yk7333/D3PO/tree/main. + +
+
+
+
+
+ + ♻ ☆ Prompt-based test-time real image dehazing: a novel pipeline + + +
+ Existing methods attempt to improve models' generalization ability on +real-world hazy images by exploring well-designed training schemes (e.g., +CycleGAN, prior loss). However, most of them need very complicated training +procedures to achieve satisfactory results. In this work, we present a totally +novel testing pipeline called Prompt-based Test-Time Dehazing (PTTD) to help +generate visually pleasing results of real-captured hazy images during the +inference phase. We experimentally find that given a dehazing model trained on +synthetic data, by fine-tuning the statistics (i.e., mean and standard +deviation) of encoding features, PTTD is able to narrow the domain gap, +boosting the performance of real image dehazing. Accordingly, we first apply a +prompt generation module (PGM) to generate a visual prompt, which is the source +of appropriate statistical perturbations for mean and standard deviation. And +then, we employ the feature adaptation module (FAM) into the existing dehazing +models for adjusting the original statistics with the guidance of the generated +prompt. Note that, PTTD is model-agnostic and can be equipped with various +state-of-the-art dehazing models trained on synthetic hazy-clean pairs. +Extensive experimental results demonstrate that our PTTD is flexible meanwhile +achieves superior performance against state-of-the-art dehazing methods in +real-world scenarios. The source code of our PTTD will be made available at +https://github.com/cecret3350/PTTD-Dehazing. + +
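The statistics fine-tuning can be pictured as an AdaIN-style re-normalization of encoder features with prompt-derived mean and standard deviation, as in the hedged PyTorch sketch below; the PGM and FAM internals are not modeled, and the function is only a conceptual stand-in.

```python
import torch

def adapt_statistics(feat, prompt_feat, eps=1e-5):
    """Re-normalise (B, C, H, W) features with channel statistics of the prompt features."""
    mu_f = feat.mean(dim=(2, 3), keepdim=True)
    std_f = feat.std(dim=(2, 3), keepdim=True) + eps
    mu_p = prompt_feat.mean(dim=(2, 3), keepdim=True)
    std_p = prompt_feat.std(dim=(2, 3), keepdim=True) + eps
    return (feat - mu_f) / std_f * std_p + mu_p
```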
+
+ comment: update github link (https://github.com/cecret3350/PTTD-Dehazing) +
+
+
+
+
+ + ♻ ☆ HRS-Bench: Holistic, Reliable and Scalable Benchmark for Text-to-Image + Models ICCV 2023 + + +
+ In recent years, Text-to-Image (T2I) models have been extensively studied, +especially with the emergence of diffusion models that achieve state-of-the-art +results on T2I synthesis tasks. However, existing benchmarks heavily rely on +subjective human evaluation, limiting their ability to holistically assess the +model's capabilities. Furthermore, there is a significant gap between efforts +in developing new T2I architectures and those in evaluation. To address this, +we introduce HRS-Bench, a concrete evaluation benchmark for T2I models that is +Holistic, Reliable, and Scalable. Unlike existing benchmarks that focus on +limited aspects, HRS-Bench measures 13 skills that can be categorized into five +major categories: accuracy, robustness, generalization, fairness, and bias. In +addition, HRS-Bench covers 50 scenarios, including fashion, animals, +transportation, food, and clothes. We evaluate nine recent large-scale T2I +models using metrics that cover a wide range of skills. A human evaluation, +which aligned with 95% of our automatic evaluations on average, was conducted to probe the +effectiveness of HRS-Bench. Our experiments demonstrate that existing models +often struggle to generate images with the desired count of objects, visual +text, or grounded emotions. We hope that our benchmark helps ease future +text-to-image generation research. The code and data are available at +https://eslambakr.github.io/hrsbench.github.io + +
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ LucidDreamer: Domain-free Generation of 3D Gaussian Splatting Scenes + + +
+ With the widespread usage of VR devices and content, the demand for 3D scene +generation techniques is growing. Existing 3D scene generation models, +however, limit the target scene to a specific domain, primarily because their +training strategies use 3D scan datasets that are far from the real world. To +address such limitation, we propose LucidDreamer, a domain-free scene +generation pipeline that fully leverages the power of existing large-scale +diffusion-based generative models. Our LucidDreamer has two alternate steps: +Dreaming and Alignment. First, to generate multi-view consistent images from +inputs, we set the point cloud as a geometrical guideline for each image +generation. Specifically, we project a portion of the point cloud to the desired +view and provide the projection as guidance for inpainting using the +generative model. The inpainted images are lifted to 3D space with estimated +depth maps, composing new points. Second, to aggregate the new points into +the 3D scene, we propose an aligning algorithm which harmoniously integrates +the portions of newly generated 3D scenes. The resulting 3D scene serves +as the initial points for optimizing Gaussian splats. LucidDreamer produces +Gaussian splats that are highly detailed compared to previous 3D scene +generation methods, with no constraint on the domain of the target scene. Project +page: https://luciddreamer-cvlab.github.io/ + +
+
+ comment: Project page: https://luciddreamer-cvlab.github.io/ +
+
+
+
+
+ + ♻ ☆ CoT3DRef: Chain-of-Thoughts Data-Efficient 3D Visual Grounding + + +
+ 3D visual grounding is the ability to localize objects in 3D scenes +conditioned on utterances. Most existing methods rely on the referring head to +localize the referred object directly, causing failure in complex scenarios. In +addition, they do not illustrate how and why the network reaches its final +decision. In this paper, we address the following question: can we design an +interpretable 3D visual grounding framework that has the potential to mimic the +human perception system? To this end, we formulate the 3D visual grounding +problem as a sequence-to-sequence task by first predicting a chain of anchors +and then the final target. Interpretability not only improves the overall +performance but also helps us identify failure cases. Following the chain-of-thought +approach enables us to decompose the referring task into interpretable +intermediate steps, boosting the performance and making our framework extremely +data-efficient. Moreover, our proposed framework can be easily integrated into +any existing architecture. We validate our approach through comprehensive +experiments on the Nr3D, Sr3D, and Scanrefer benchmarks and show consistent +performance gains compared to existing methods without requiring manually +annotated data. Furthermore, our proposed framework, dubbed CoT3DRef, is +significantly data-efficient: on the Sr3D dataset, when trained on only +10% of the data, we match the SOTA performance obtained with training on the entire dataset. + +
+
+
+
+
+ + ♻ ☆ Large Scale Time-Series Representation Learning via Simultaneous Low and + High Frequency Feature Bootstrapping + + +
+ Learning representations from unlabeled time series data is a challenging +problem. Most existing self-supervised and unsupervised approaches in the +time-series domain do not capture low- and high-frequency features at the same +time. Further, some of these methods employ large scale models like +transformers or rely on computationally expensive techniques such as +contrastive learning. To tackle these problems, we propose a non-contrastive +self-supervised learning approach that efficiently captures low- and high-frequency +time-varying features in a cost-effective manner. Our method takes raw time +series data as input and creates two different augmented views for two branches +of the model, by randomly sampling the augmentations from the same family. +Following the terminology of BYOL, the two branches are called the online and +target networks, which allows bootstrapping of the latent representation. In +contrast to BYOL, where a backbone encoder is followed by multilayer perceptron +(MLP) heads, the proposed model contains additional temporal convolutional +network (TCN) heads. As the augmented views are passed through large kernel +convolution blocks of the encoder, the subsequent combination of MLP and TCN +enables an effective representation of low as well as high-frequency +time-varying features due to the varying receptive fields. The two modules (MLP +and TCN) act in a complementary manner. We train an online network where each +module learns to predict the outcome of the respective module of the target network +branch. To demonstrate the robustness of our model, we performed extensive +experiments and ablation studies on five real-world time-series datasets. Our +method achieved state-of-the-art performance on all five real-world datasets. + +
+
+
+
+
+ + ♻ ☆ "HoVer-UNet": Accelerating HoVerNet with UNet-based multi-class nuclei + segmentation via knowledge distillation + + +
+ We present "HoVer-UNet", an approach to distill the knowledge of the
+multi-branch HoVerNet framework for nuclei instance segmentation and
+classification in histopathology. We propose a compact, streamlined
+single-UNet network with a Mix Vision Transformer backbone, and equip it with
+a custom loss function to optimally encode the distilled knowledge of
+HoVerNet, reducing computational requirements without compromising
+performance. We show that our model achieves results comparable to HoVerNet
+on the public PanNuke and Consep datasets with a three-fold reduction in
+inference time. We make the code of our model publicly available at
+https://github.com/DIAGNijmegen/HoVer-UNet.
+
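+ The distillation objective can be sketched generically as below (PyTorch);
+the temperature, the weighting, and the use of plain cross-entropy are
+assumptions standing in for the custom loss described in the abstract.
+
+import torch.nn.functional as F
+
+def distillation_loss(student_logits, teacher_logits, target, T=2.0, alpha=0.5):
+    # soft targets from the frozen HoVerNet-style teacher, matched via KL
+    soft = F.kl_div(F.log_softmax(student_logits / T, dim=1),
+                    F.softmax(teacher_logits / T, dim=1),
+                    reduction="batchmean") * (T * T)
+    # supervised loss against the ground-truth nuclei class map
+    hard = F.cross_entropy(student_logits, target)
+    return alpha * soft + (1 - alpha) * hard
+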
+
+ comment: 4 pages, 2 figures, submitted to ISBI 2024 +
+
+
+
+
+ + ♻ ☆ Instant3D: Fast Text-to-3D with Sparse-View Generation and Large + Reconstruction Model + + +
+ Text-to-3D with diffusion models has achieved remarkable progress in recent
+years. However, existing methods either rely on score-distillation-based
+optimization, which suffers from slow inference, low diversity, and Janus
+problems, or are feed-forward methods that generate low-quality results due
+to the scarcity of 3D training data. In this paper, we propose Instant3D, a
+novel method that generates high-quality and diverse 3D assets from text
+prompts in a feed-forward manner. We adopt a two-stage paradigm, which first
+generates a sparse set of four structured and consistent views from text in
+one shot with a fine-tuned 2D text-to-image diffusion model, and then
+directly regresses the NeRF from the generated images with a novel
+transformer-based sparse-view reconstructor. Through extensive experiments,
+we demonstrate that our method can generate diverse 3D assets of high visual
+quality within 20 seconds, which is two orders of magnitude faster than
+previous optimization-based methods that can take 1 to 10 hours. Our project
+webpage: https://jiahao.ai/instant3d/.
+
+
+ comment: Project webpage: https://jiahao.ai/instant3d/ +
+
+
+
+
+ + ♻ ☆ Feature Perturbation Augmentation for Reliable Evaluation of Importance + Estimators in Neural Networks + + +
+ Post-hoc explanation methods attempt to make the inner workings of deep
+neural networks more interpretable. However, since a ground truth is
+generally lacking, local post-hoc interpretability methods, which assign
+importance scores to input features, are challenging to evaluate. One of the
+most popular evaluation frameworks is to perturb features deemed important by
+an interpretability method and to measure the change in prediction accuracy.
+Intuitively, a large decrease in prediction accuracy would indicate that the
+explanation has correctly quantified the importance of features with respect
+to the prediction outcome (e.g., logits). However, the change in the
+prediction outcome may stem from perturbation artifacts, since perturbed
+samples in the test dataset are out of distribution (OOD) compared to the
+training dataset and can therefore potentially disturb the model in an
+unexpected manner. To overcome this challenge, we propose feature
+perturbation augmentation (FPA), which creates and adds perturbed images
+during model training. Through extensive computational experiments, we
+demonstrate that FPA makes deep neural networks (DNNs) more robust against
+perturbations. Furthermore, training DNNs with FPA demonstrates that the sign
+of importance scores may explain the model more meaningfully than has
+previously been assumed. Overall, FPA is an intuitive data augmentation
+technique that improves the evaluation of post-hoc interpretability methods.
+
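+ A minimal sketch of the augmentation step, assuming a simple random
+pixel-masking perturbation and a fixed fraction of perturbed copies per
+batch; the perturbation scheme used in the paper may differ.
+
+import torch
+
+def fpa_batch(images, labels, frac=0.5, p=0.3):
+    # perturbed copies are appended to the batch so that perturbed inputs are
+    # no longer out-of-distribution when explanations are evaluated later
+    n = int(frac * images.size(0))
+    mask = (torch.rand_like(images[:n]) > p).float()
+    perturbed = images[:n] * mask
+    return torch.cat([images, perturbed]), torch.cat([labels, labels[:n]])
+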
+
+
+
+
+ + ♻ ☆ Multi-modal In-Context Learning Makes an Ego-evolving Scene Text + Recognizer + + +
+ Scene text recognition (STR) in the wild frequently encounters challenges +when coping with domain variations, font diversity, shape deformations, etc. A +straightforward solution is performing model fine-tuning tailored to a specific +scenario, but it is computationally intensive and requires multiple model +copies for various scenarios. Recent studies indicate that large language +models (LLMs) can learn from a few demonstration examples in a training-free +manner, termed "In-Context Learning" (ICL). Nevertheless, applying LLMs as a +text recognizer is unacceptably resource-consuming. Moreover, our pilot +experiments on LLMs show that ICL fails in STR, mainly attributed to the +insufficient incorporation of contextual information from diverse samples in +the training stage. To this end, we introduce E$^2$STR, a STR model trained +with context-rich scene text sequences, where the sequences are generated via +our proposed in-context training strategy. E$^2$STR demonstrates that a +regular-sized model is sufficient to achieve effective ICL capabilities in STR. +Extensive experiments show that E$^2$STR exhibits remarkable training-free +adaptation in various scenarios and outperforms even the fine-tuned +state-of-the-art approaches on public benchmarks. + +
+
+
+
+
+ + ♻ ☆ Associative Transformer Is A Sparse Representation Learner + + +
+ Emerging from the monolithic pairwise attention mechanism in conventional +Transformer models, there is a growing interest in leveraging sparse +interactions that align more closely with biological principles. Approaches +including the Set Transformer and the Perceiver employ cross-attention +consolidated with a latent space that forms an attention bottleneck with +limited capacity. Building upon recent neuroscience studies of Global Workspace +Theory and associative memory, we propose the Associative Transformer (AiT). +AiT induces low-rank explicit memory that serves as both priors to guide +bottleneck attention in the shared workspace and attractors within associative +memory of a Hopfield network. Through joint end-to-end training, these priors +naturally develop module specialization, each contributing a distinct inductive +bias to form attention bottlenecks. A bottleneck can foster competition among +inputs for writing information into the memory. We show that AiT is a sparse +representation learner, learning distinct priors through the bottlenecks that +are complexity-invariant to input quantities and dimensions. AiT demonstrates +its superiority over methods such as the Set Transformer, Vision Transformer, +and Coordination in various vision tasks. + +
+
+
+
+
+
+
+
+ + Information Retrieval 6 + +
+
+
+ + ☆ AI-Generated Images Introduce Invisible Relevance Bias to Text-Image + Retrieval + + +
+ With the advancement of generation models, AI-generated content (AIGC) is +becoming more realistic, flooding the Internet. A recent study suggests that +this phenomenon has elevated the issue of source bias in text retrieval for web +searches. Specifically, neural retrieval models tend to rank generated texts +higher than human-written texts. In this paper, we extend the study of this +bias to cross-modal retrieval. Firstly, we successfully construct a suitable +benchmark to explore the existence of the bias. Subsequent extensive +experiments on this benchmark reveal that AI-generated images introduce an +invisible relevance bias to text-image retrieval models. Specifically, our +experiments show that text-image retrieval models tend to rank the AI-generated +images higher than the real images, even though the AI-generated images do not +exhibit more visually relevant features to the query than real images. This +invisible relevance bias is prevalent across retrieval models with varying +training data and architectures. Furthermore, our subsequent exploration +reveals that the inclusion of AI-generated images in the training data of the +retrieval models exacerbates the invisible relevance bias. The above phenomenon +triggers a vicious cycle, which makes the invisible relevance bias become more +and more serious. To elucidate the potential causes of invisible relevance and +address the aforementioned issues, we introduce an effective training method +aimed at alleviating the invisible relevance bias. Subsequently, we apply our +proposed debiasing method to retroactively identify the causes of invisible +relevance, revealing that the AI-generated images induce the image encoder to +embed additional information into their representation. This information +exhibits a certain consistency across generated images with different semantics +and can make the retriever estimate a higher relevance score. + +
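+ One way to quantify such a bias, sketched under the assumption of paired
+real and AI-generated images of equal semantic relevance per query;
+score_fn is a placeholder for the retrieval model's similarity score.
+
+def generated_preference_rate(score_fn, queries, real_imgs, gen_imgs):
+    # fraction of queries where the generated image outscores the real one;
+    # values well above 0.5 indicate the invisible relevance bias
+    wins = sum(int(score_fn(q, g) > score_fn(q, r))
+               for q, r, g in zip(queries, real_imgs, gen_imgs))
+    return wins / len(queries)
+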
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Some Like It Small: Czech Semantic Embedding Models for Industry + Applications + + +
+ This article focuses on the development and evaluation of small-sized Czech
+sentence embedding models. Small models are important components for
+real-time industry applications in resource-constrained environments. Given
+the limited availability of labeled Czech data, alternative approaches,
+including pre-training, knowledge distillation, and unsupervised contrastive
+fine-tuning, are investigated. Comprehensive intrinsic and extrinsic analyses
+are conducted, showcasing the competitive performance of our models compared
+to significantly larger counterparts, with approximately 8 times smaller size
+and 5 times faster speed than conventional Base-sized models. To promote
+cooperation and reproducibility, both the models and the evaluation pipeline
+are made publicly accessible. Finally, this article presents practical
+applications of the developed sentence embedding models in Seznam.cz, the
+Czech search engine. These models have effectively replaced previous
+counterparts, enhancing the overall search experience, for instance in
+organic search, featured snippets, and image search, and this transition has
+yielded improved performance.
+
+
+ comment: Accepted at the Thirty-Sixth Annual Conference on Innovative + Applications of Artificial Intelligence (IAAI-24). IAAI Innovative + Application Award. 9 pages +
+
+
+
+
+ + ♻ ☆ Semantic Modelling of Organizational Knowledge as a Basis for Enterprise + Data Governance 4.0 -- Application to a Unified Clinical Data Model + + +
+ Individuals and organizations cope with an always-growing amount of data, +which is heterogeneous in its contents and formats. An adequate data management +process yielding data quality and control over its lifecycle is a prerequisite +to getting value out of this data and minimizing inherent risks related to +multiple usages. Common data governance frameworks rely on people, policies, +and processes that fall short of the overwhelming complexity of data. Yet, +harnessing this complexity is necessary to achieve high-quality standards. The +latter will condition any downstream data usage outcome, including generative +artificial intelligence trained on this data. In this paper, we report our +concrete experience establishing a simple, cost-efficient framework that +enables metadata-driven, agile and (semi-)automated data governance (i.e. Data +Governance 4.0). We explain how we implement and use this framework to +integrate 25 years of clinical study data at an enterprise scale in a fully +productive environment. The framework encompasses both methodologies and +technologies leveraging semantic web principles. We built a knowledge graph +describing avatars of data assets in their business context, including +governance principles. Multiple ontologies articulated by an enterprise upper +ontology enable key governance actions such as FAIRification, lifecycle +management, definition of roles and responsibilities, lineage across +transformations and provenance from source systems. This metadata model is the +keystone to data governance 4.0: a semi-automatised data management process +that considers the business context in an agile manner to adapt governance +constraints to each use case and dynamically tune it based on business changes. + +
+
+
+
+
+ + ♻ ☆ A Privacy Preserving System for Movie Recommendations Using Federated + Learning + + +
+ Recommender systems have become ubiquitous in the past years. They solve the +tyranny of choice problem faced by many users, and are utilized by many online +businesses to drive engagement and sales. Besides other criticisms, like +creating filter bubbles within social networks, recommender systems are often +reproved for collecting considerable amounts of personal data. However, to +personalize recommendations, personal information is fundamentally required. A +recent distributed learning scheme called federated learning has made it +possible to learn from personal user data without its central collection. +Consequently, we present a recommender system for movie recommendations, which +provides privacy and thus trustworthiness on multiple levels: First and +foremost, it is trained using federated learning and thus, by its very nature, +privacy-preserving, while still enabling users to benefit from global insights. +Furthermore, a novel federated learning scheme, called FedQ, is employed, which +not only addresses the problem of non-i.i.d.-ness and small local datasets, but +also prevents input data reconstruction attacks by aggregating client updates +early. Finally, to reduce the communication overhead, compression is applied, +which significantly compresses the exchanged neural network parametrizations to +a fraction of their original size. We conjecture that this may also improve +data privacy through its lossy quantization stage. + +
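+ The compression step can be illustrated with plain uniform quantization of
+a client update (NumPy); the 8-bit codebook and min-max scaling are
+assumptions and not FedQ's exact scheme.
+
+import numpy as np
+
+def quantize(update, bits=8):
+    # map float32 update values onto a small integer codebook (~4x smaller)
+    lo, hi = float(update.min()), float(update.max())
+    levels = 2 ** bits - 1
+    q = np.round((update - lo) / (hi - lo + 1e-12) * levels).astype(np.uint8)
+    return q, lo, hi
+
+def dequantize(q, lo, hi, bits=8):
+    return lo + q.astype(np.float32) / (2 ** bits - 1) * (hi - lo)
+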
+
+ comment: Accepted for publication in the ACM Transactions on Recommender + Systems (TORS) Special Issue on Trustworthy Recommender Systems +
+
+
+
+
+ + ♻ ☆ Towards Open-world Cross-Domain Sequential Recommendation: A + Model-Agnostic Contrastive Denoising Approach + + +
+ Cross-domain sequential recommendation (CDSR) aims to address the data +sparsity problems that exist in traditional sequential recommendation (SR) +systems. + The existing approaches aim to design a specific cross-domain unit that can +transfer and propagate information across multiple domains by relying on +overlapping users with abundant behaviors. However, in real-world recommender +systems, CDSR scenarios usually consist of a majority of long-tailed users with +sparse behaviors and cold-start users who only exist in one domain. This leads +to a drop in the performance of existing CDSR methods in the real-world +industry platform. Therefore, improving the consistency and effectiveness of +models in open-world CDSR scenarios is crucial for constructing CDSR models +(\textit{1st} CH). Recently, some SR approaches have utilized auxiliary +behaviors to complement the information for long-tailed users. However, these +multi-behavior SR methods cannot deliver promising performance in CDSR, as they +overlook the semantic gap between target and auxiliary behaviors, as well as +user interest deviation across domains (\textit{2nd} CH). + +
+
+
+
+
+ + ♻ ☆ Generating Natural Language Queries for More Effective Systematic Review + Screening Prioritisation SIGIR + + +
+ Screening prioritisation in medical systematic reviews aims to rank the set +of documents retrieved by complex Boolean queries. Prioritising the most +important documents ensures that subsequent review steps can be carried out +more efficiently and effectively. The current state of the art uses the final +title of the review as a query to rank the documents using BERT-based neural +rankers. However, the final title is only formulated at the end of the review +process, which makes this approach impractical as it relies on ex post facto +information. At the time of screening, only a rough working title is available, +with which the BERT-based ranker performs significantly worse than with the +final title. In this paper, we explore alternative sources of queries for +prioritising screening, such as the Boolean query used to retrieve the +documents to be screened and queries generated by instruction-based generative +large-scale language models such as ChatGPT and Alpaca. Our best approach is +not only viable based on the information available at the time of screening, +but also has similar effectiveness to the final title. + +
+
+ comment: Preprint of an accepted paper at SIGIR-AP-2023; note that this is
+ updated from the ACM-published version. The working title was wrong in the
+ ACM-published version due to a bug in data preprocessing; however, this does
+ not have any influence on the final conclusions/observations made in the
+ paper
+
+
+
+
+
+
+
+
+ + Machine Learning 43 + +
+
+
+ + ☆ Robust and Interpretable COVID-19 Diagnosis on Chest X-ray Images using + Adversarial Training + + +
+ The novel 2019 Coronavirus disease (COVID-19) global pandemic is a defining +health crisis. Recent efforts have been increasingly directed towards achieving +quick and accurate detection of COVID-19 across symptomatic patients to +mitigate the intensity and spread of the disease. Artificial intelligence (AI) +algorithms applied to chest X-ray (CXR) images have emerged as promising +diagnostic tools, and previous work has demonstrated impressive classification +performances. However, such methods have faced criticisms from physicians due +to their black-box reasoning process and unpredictable nature. In contrast to +professional radiologist diagnosis, AI systems often lack generalizability, +explainability, and robustness in the clinical decision making process. In our +work, we address these issues by first proposing an extensive baseline study, +training and evaluating 21 convolutional neural network (CNN) models on a +diverse set of 33,000+ CXR images to classify between healthy, COVID-19, and +non-COVID-19 pneumonia CXRs. Our resulting models achieved a 3-way +classification accuracy, recall, and precision of up to 97.03\%, 97.97\%, and +99.95\%, respectively. Next, we investigate the effectiveness of adversarial +training on model robustness and explainability via Gradient-weighted Class +Activation Mapping (Grad-CAM) heatmaps. We find that adversarially trained +models not only significantly outperform their standard counterparts on +classifying perturbed images, but also yield saliency maps that 1) better +specify clinically relevant features, 2) are robust against extraneous +artifacts, and 3) agree considerably more with expert radiologist findings. + +
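+ A sketch of the adversarial-training loop in PyTorch using single-step FGSM
+perturbations; the epsilon and the equal clean/adversarial weighting are
+illustrative assumptions.
+
+import torch
+import torch.nn.functional as F
+
+def fgsm(model, x, y, eps=2 / 255):
+    x = x.clone().requires_grad_(True)
+    loss = F.cross_entropy(model(x), y)
+    grad, = torch.autograd.grad(loss, x)
+    return (x + eps * grad.sign()).clamp(0, 1).detach()
+
+def adversarial_step(model, optimizer, x, y):
+    x_adv = fgsm(model, x, y)
+    optimizer.zero_grad()
+    loss = 0.5 * F.cross_entropy(model(x), y) \
+         + 0.5 * F.cross_entropy(model(x_adv), y)
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+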
+
+
+
+
+ + ☆ Risk Bounds of Accelerated SGD for Overparameterized Linear Regression + + +
+ Accelerated stochastic gradient descent (ASGD) is a workhorse in deep +learning and often achieves better generalization performance than SGD. +However, existing optimization theory can only explain the faster convergence +of ASGD, but cannot explain its better generalization. In this paper, we study +the generalization of ASGD for overparameterized linear regression, which is +possibly the simplest setting of learning with overparameterization. We +establish an instance-dependent excess risk bound for ASGD within each +eigen-subspace of the data covariance matrix. Our analysis shows that (i) ASGD +outperforms SGD in the subspace of small eigenvalues, exhibiting a faster rate +of exponential decay for bias error, while in the subspace of large +eigenvalues, its bias error decays slower than SGD; and (ii) the variance error +of ASGD is always larger than that of SGD. Our result suggests that ASGD can +outperform SGD when the difference between the initialization and the true +weight vector is mostly confined to the subspace of small eigenvalues. +Additionally, when our analysis is specialized to linear regression in the +strongly convex setting, it yields a tighter bound for bias error than the +best-known result. + +
+
+ comment: 85 pages, 5 figures +
+
+
+
+
+ + ☆ Assumption-lean and Data-adaptive Post-Prediction Inference + + +
+ A primary challenge facing modern scientific research is the limited
+availability of gold-standard data, which can be both costly and
+labor-intensive to obtain. With the rapid development of machine learning
+(ML), scientists have relied on ML algorithms to predict these gold-standard
+outcomes with easily obtained covariates. However, these predicted outcomes
+are often used directly in subsequent statistical analyses, ignoring
+imprecision and heterogeneity introduced by the prediction procedure. This
+will likely result in false positive findings and invalid scientific
+conclusions. In this work, we introduce an assumption-lean and data-adaptive
+Post-Prediction Inference (POP-Inf) procedure that allows valid and powerful
+inference based on ML-predicted outcomes. Its "assumption-lean" property
+guarantees reliable statistical inference without assumptions on the
+ML-prediction, for a wide range of statistical quantities. Its
+"data-adaptive" feature guarantees an efficiency gain over existing
+post-prediction inference methods, regardless of the accuracy of
+ML-prediction. We demonstrate the superiority and applicability of our method
+through simulations and large-scale genomic data.
+
+
+
+
+
+ + ☆ Extending Variability-Aware Model Selection with Bias Detection in + Machine Learning Projects + + +
+ Data science projects often involve various machine learning (ML) methods
+that depend on data, code, and models. One of the key activities in these
+projects is the selection of a model or algorithm that is appropriate for the
+data analysis at hand. ML model selection depends on several factors, which
+include data-related attributes such as sample size, functional requirements
+such as the prediction algorithm type, and non-functional requirements such
+as performance and bias. However, the factors that influence such selection
+are often not well understood and explicitly represented. This paper
+describes ongoing work on extending an adaptive variability-aware model
+selection method with bias detection in ML projects. The method involves: (i)
+modeling the variability of the factors that affect model selection using
+feature models based on heuristics proposed in the literature; (ii)
+instantiating our variability model with added features related to bias
+(e.g., bias-related metrics); and (iii) conducting experiments that
+illustrate our approach in a specific case study based on a heart failure
+prediction project. The proposed approach aims to advance the state of the
+art by making explicit the factors that influence model selection,
+particularly those related to bias, as well as their interactions. The
+provided representations can transform model selection in ML projects into a
+non-ad-hoc, adaptive, and explainable process.
+
+
+ comment: IEEE BigData 2023 +
+
+
+
+
+ + ☆ Annotation Sensitivity: Training Data Collection Methods Affect Model + Performance EMNLP 2023 + + +
+ When training data are collected from human annotators, the design of the +annotation instrument, the instructions given to annotators, the +characteristics of the annotators, and their interactions can impact training +data. This study demonstrates that design choices made when creating an +annotation instrument also impact the models trained on the resulting +annotations. + We introduce the term annotation sensitivity to refer to the impact of +annotation data collection methods on the annotations themselves and on +downstream model performance and predictions. + We collect annotations of hate speech and offensive language in five +experimental conditions of an annotation instrument, randomly assigning +annotators to conditions. We then fine-tune BERT models on each of the five +resulting datasets and evaluate model performance on a holdout portion of each +condition. We find considerable differences between the conditions for 1) the +share of hate speech/offensive language annotations, 2) model performance, 3) +model predictions, and 4) model learning curves. + Our results emphasize the crucial role played by the annotation instrument +which has received little attention in the machine learning literature. We call +for additional research into how and why the instrument impacts the annotations +to inform the development of best practices in instrument design. + +
+
+ comment: EMNLP 2023 Findings +
+
+
+
+
+ + ☆ Enhancing mTBI Diagnosis with Residual Triplet Convolutional Neural + Network Using 3D CT + + +
+ Mild Traumatic Brain Injury (mTBI) is a common and challenging condition to +diagnose accurately. Timely and precise diagnosis is essential for effective +treatment and improved patient outcomes. Traditional diagnostic methods for +mTBI often have limitations in terms of accuracy and sensitivity. In this +study, we introduce an innovative approach to enhance mTBI diagnosis using 3D +Computed Tomography (CT) images and a metric learning technique trained with +triplet loss. To address these challenges, we propose a Residual Triplet +Convolutional Neural Network (RTCNN) model to distinguish between mTBI cases +and healthy ones by embedding 3D CT scans into a feature space. The triplet +loss function maximizes the margin between similar and dissimilar image pairs, +optimizing feature representations. This facilitates better context placement +of individual cases, aids informed decision-making, and has the potential to +improve patient outcomes. Our RTCNN model shows promising performance in mTBI +diagnosis, achieving an average accuracy of 94.3%, a sensitivity of 94.1%, and +a specificity of 95.2%, as confirmed through a five-fold cross-validation. +Importantly, when compared to the conventional Residual Convolutional Neural +Network (RCNN) model, the RTCNN exhibits a significant improvement, showcasing +a remarkable 22.5% increase in specificity, a notable 16.2% boost in accuracy, +and an 11.3% enhancement in sensitivity. Moreover, RTCNN requires lower memory +resources, making it not only highly effective but also resource-efficient in +minimizing false positives while maximizing its diagnostic accuracy in +distinguishing normal CT scans from mTBI cases. The quantitative performance +metrics provided and utilization of occlusion sensitivity maps to visually +explain the model's decision-making process further enhance the +interpretability and transparency of our approach. + +
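+ The metric-learning component can be sketched as follows (PyTorch); the
+tiny 3D encoder is only a placeholder for the residual CNN described above.
+
+import torch
+import torch.nn as nn
+
+embed = nn.Sequential(                      # stand-in for the 3D CT encoder
+    nn.Conv3d(1, 8, kernel_size=3, padding=1), nn.ReLU(),
+    nn.AdaptiveAvgPool3d(1), nn.Flatten(), nn.Linear(8, 32))
+triplet = nn.TripletMarginLoss(margin=1.0)
+
+def triplet_step(anchor, positive, negative):
+    # anchor/positive share a label (e.g., mTBI); negative is from the other
+    # class, so the margin separates the two groups in embedding space
+    return triplet(embed(anchor), embed(positive), embed(negative))
+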
+
+
+
+
+ + ☆ Touch Analysis: An Empirical Evaluation of Machine Learning + Classification Algorithms on Touch Data + + +
+ Our research aims at classifying individuals based on their unique +interactions on touchscreen-based smartphones. In this research, we use +Touch-Analytics datasets, which include 41 subjects and 30 different behavioral +features. Furthermore, we derived new features from the raw data to improve the +overall authentication performance. Previous research has already been done on +the Touch-Analytics datasets with the state-of-the-art classifiers, including +Support Vector Machine (SVM) and k-nearest neighbor (kNN), and achieved equal +error rates (EERs) between 0% to 4%. Here, we propose a novel Deep Neural Net +(DNN) architecture to classify the individuals correctly. The proposed DNN +architecture has three dense layers and uses many-to-many mapping techniques. +When we combine the new features with the existing ones, SVM and kNN achieved +the classification accuracy of 94.7% and 94.6%, respectively. This research +explored seven other classifiers and out of them, the decision tree and our +proposed DNN classifiers resulted in the highest accuracy of 100%. The others +included: Logistic Regression (LR), Linear Discriminant Analysis (LDA), +Gaussian Naive Bayes (NB), Neural Network, and VGGNet with the following +accuracy scores of 94.7%, 95.9%, 31.9%, 88.8%, and 96.1%, respectively. + +
+
+
+
+
+ + ☆ Gradient-based bilevel optimization for multi-penalty Ridge regression + through matrix differential calculus + + +
+ Common regularization algorithms for linear regression, such as LASSO and +Ridge regression, rely on a regularization hyperparameter that balances the +tradeoff between minimizing the fitting error and the norm of the learned model +coefficients. As this hyperparameter is scalar, it can be easily selected via +random or grid search optimizing a cross-validation criterion. However, using a +scalar hyperparameter limits the algorithm's flexibility and potential for +better generalization. In this paper, we address the problem of linear +regression with l2-regularization, where a different regularization +hyperparameter is associated with each input variable. We optimize these +hyperparameters using a gradient-based approach, wherein the gradient of a +cross-validation criterion with respect to the regularization hyperparameters +is computed analytically through matrix differential calculus. Additionally, we +introduce two strategies tailored for sparse model learning problems aiming at +reducing the risk of overfitting to the validation data. Numerical examples +demonstrate that our multi-hyperparameter regularization approach outperforms +LASSO, Ridge, and Elastic Net regression. Moreover, the analytical computation +of the gradient proves to be more efficient in terms of computational time +compared to automatic differentiation, especially when handling a large number +of input variables. Application to the identification of over-parameterized +Linear Parameter-Varying models is also presented. + +
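+ A compact sketch of the bilevel idea: the inner ridge problem with one
+penalty per input variable has the closed form w = (X'X + diag(lambda))^-1 X'y,
+and the validation loss is differentiated with respect to the log
+hyperparameters. Automatic differentiation is used here only as a stand-in
+for the paper's analytic matrix-calculus gradient.
+
+import torch
+
+def val_loss(log_lam, X_tr, y_tr, X_val, y_val):
+    lam = torch.exp(log_lam)                    # one lambda per input variable
+    A = X_tr.T @ X_tr + torch.diag(lam)
+    w = torch.linalg.solve(A, X_tr.T @ y_tr)    # inner ridge solution
+    return ((X_val @ w - y_val) ** 2).mean()    # outer (validation) criterion
+
+def tune(X_tr, y_tr, X_val, y_val, steps=200, lr=0.05):
+    log_lam = torch.zeros(X_tr.shape[1], requires_grad=True)
+    opt = torch.optim.Adam([log_lam], lr=lr)
+    for _ in range(steps):
+        opt.zero_grad()
+        val_loss(log_lam, X_tr, y_tr, X_val, y_val).backward()
+        opt.step()
+    return torch.exp(log_lam).detach()
+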
+
+
+
+
+ + ☆ TCuPGAN: A novel framework developed for optimizing human-machine + interactions in citizen science ECML + + +
+ In the era of big data in scientific research, there is a necessity to +leverage techniques which reduce human effort in labeling and categorizing +large datasets by involving sophisticated machine tools. To combat this +problem, we present a novel, general purpose model for 3D segmentation that +leverages patch-wise adversariality and Long Short-Term Memory to encode +sequential information. Using this model alongside citizen science projects +which use 3D datasets (image cubes) on the Zooniverse platforms, we propose an +iterative human-machine optimization framework where only a fraction of the 2D +slices from these cubes are seen by the volunteers. We leverage the patch-wise +discriminator in our model to provide an estimate of which slices within these +image cubes have poorly generalized feature representations, and +correspondingly poor machine performance. These images with corresponding +machine proposals would be presented to volunteers on Zooniverse for +correction, leading to a drastic reduction in the volunteer effort on citizen +science projects. We trained our model on ~2300 liver tissue 3D electron +micrographs. Lipid droplets were segmented within these images through human +annotation via the `Etch A Cell - Fat Checker' citizen science project, hosted +on the Zooniverse platform. In this work, we demonstrate this framework and the +selection methodology which resulted in a measured reduction in volunteer +effort by more than 60%. We envision this type of joint human-machine +partnership will be of great use on future Zooniverse projects. + +
+
+ comment: 5 pages, 1 figure, accepted for publication at HLDM '23 (ECML PKDD + 2023 workshop) +
+
+
+
+
+ + ☆ Evaluating GPT-4's Vision Capabilities on Brazilian University Admission + Exams + + +
+ Recent advancements in language models have showcased human-comparable +performance in academic entrance exams. However, existing studies often +overlook questions that require the integration of visual comprehension, thus +compromising the full spectrum and complexity inherent in real-world scenarios. +To address this gap, we present a comprehensive framework to evaluate language +models on entrance exams, which incorporates both textual and visual elements. +We evaluate the two most recent editions of Exame Nacional do Ensino M\'edio +(ENEM), the main standardized entrance examination adopted by Brazilian +universities. Our study not only reaffirms the capabilities of GPT-4 as the +state of the art for handling complex multidisciplinary questions, but also +pioneers in offering a realistic assessment of multimodal language models on +Portuguese examinations. One of the highlights is that text captions +transcribing visual content outperform the direct use of images, suggesting +that the vision model has room for improvement. Yet, despite improvements +afforded by images or captions, mathematical questions remain a challenge for +these state-of-the-art models. The code and data used on experiments are +available at https://github.com/piresramon/gpt-4-enem. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.17003 +
+
+
+
+
+ + ☆ Fast Policy Learning for Linear Quadratic Regulator with Entropy + Regularization + + +
+ This paper proposes and analyzes two new policy learning methods: regularized +policy gradient (RPG) and iterative policy optimization (IPO), for a class of +discounted linear-quadratic regulator (LQR) problems over an infinite time +horizon with entropy regularization. Assuming access to the exact policy +evaluation, both proposed approaches are proved to converge linearly in finding +optimal policies of the regularized LQR. Moreover, the IPO method can achieve a +super-linear convergence rate once it enters a local region around the optimal +policy. Finally, when the optimal policy from a well-understood environment in +an RL problem is appropriately transferred as the initial policy to an RL +problem with an unknown environment, the IPO method is shown to enable a +super-linear convergence rate if the latter is sufficiently close to the +former. The performances of these proposed algorithms are supported by +numerical examples. + +
+
+ comment: 33 pages, 3 figures +
+
+
+
+
+ + ☆ Efficient and Robust Jet Tagging at the LHC with Knowledge Distillation NeurIPS 2023 + + +
+ The challenging environment of real-time data processing systems at the Large +Hadron Collider (LHC) strictly limits the computational complexity of +algorithms that can be deployed. For deep learning models, this implies that +only models with low computational complexity that have weak inductive bias are +feasible. To address this issue, we utilize knowledge distillation to leverage +both the performance of large models and the reduced computational complexity +of small ones. In this paper, we present an implementation of knowledge +distillation, demonstrating an overall boost in the student models' performance +for the task of classifying jets at the LHC. Furthermore, by using a teacher +model with a strong inductive bias of Lorentz symmetry, we show that we can +induce the same inductive bias in the student model which leads to better +robustness against arbitrary Lorentz boost. + +
+
+ comment: 7 pages, 3 figures, accepted at the Machine Learning and the Physical + Sciences Workshop, NeurIPS 2023 +
+
+
+
+
+ + ☆ Variational Annealing on Graphs for Combinatorial Optimization NeurIPS 2023 + + +
+ Several recent unsupervised learning methods use probabilistic approaches to +solve combinatorial optimization (CO) problems based on the assumption of +statistically independent solution variables. We demonstrate that this +assumption imposes performance limitations in particular on difficult problem +instances. Our results corroborate that an autoregressive approach which +captures statistical dependencies among solution variables yields superior +performance on many popular CO problems. We introduce subgraph tokenization in +which the configuration of a set of solution variables is represented by a +single token. This tokenization technique alleviates the drawback of the long +sequential sampling procedure which is inherent to autoregressive methods +without sacrificing expressivity. Importantly, we theoretically motivate an +annealed entropy regularization and show empirically that it is essential for +efficient and stable learning. + +
+
+ comment: Accepted at NeurIPS 2023 +
+
+
+
+
+ + ☆ Tube-NeRF: Efficient Imitation Learning of Visuomotor Policies from MPC + using Tube-Guided Data Augmentation and NeRFs + + +
+ Imitation learning (IL) can train computationally-efficient sensorimotor +policies from a resource-intensive Model Predictive Controller (MPC), but it +often requires many samples, leading to long training times or limited +robustness. To address these issues, we combine IL with a variant of robust MPC +that accounts for process and sensing uncertainties, and we design a data +augmentation (DA) strategy that enables efficient learning of vision-based +policies. The proposed DA method, named Tube-NeRF, leverages Neural Radiance +Fields (NeRFs) to generate novel synthetic images, and uses properties of the +robust MPC (the tube) to select relevant views and to efficiently compute the +corresponding actions. We tailor our approach to the task of localization and +trajectory tracking on a multirotor, by learning a visuomotor policy that +generates control actions using images from the onboard camera as only source +of horizontal position. Our evaluations numerically demonstrate learning of a +robust visuomotor policy with an 80-fold increase in demonstration efficiency +and a 50% reduction in training time over current IL methods. Additionally, our +policies successfully transfer to a real multirotor, achieving accurate +localization and low tracking errors despite large disturbances, with an +onboard inference time of only 1.5 ms. + +
+
+ comment: Video: https://youtu.be/_W5z33ZK1m4. Evolved paper from our previous + work: arXiv:2210.10127 +
+
+
+
+
+ + ☆ Automated 3D Tumor Segmentation using Temporal Cubic PatchGAN (TCuP-GAN) + + +
+ Development of robust general purpose 3D segmentation frameworks using the +latest deep learning techniques is one of the active topics in various +bio-medical domains. In this work, we introduce Temporal Cubic PatchGAN +(TCuP-GAN), a volume-to-volume translational model that marries the concepts of +a generative feature learning framework with Convolutional Long Short-Term +Memory Networks (LSTMs), for the task of 3D segmentation. We demonstrate the +capabilities of our TCuP-GAN on the data from four segmentation challenges +(Adult Glioma, Meningioma, Pediatric Tumors, and Sub-Saharan Africa subset) +featured within the 2023 Brain Tumor Segmentation (BraTS) Challenge and +quantify its performance using LesionWise Dice similarity and $95\%$ Hausdorff +Distance metrics. We demonstrate the successful learning of our framework to +predict robust multi-class segmentation masks across all the challenges. This +benchmarking work serves as a stepping stone for future efforts towards +applying TCuP-GAN on other multi-class tasks such as multi-organelle +segmentation in electron microscopy imaging. + +
+
+ comment: Submitted as a short paper to the proceedings of the 2023 Brain Tumor + Segmentation (BraTS) Challenge +
+
+
+
+
+ + ☆ Machine Learning For An Explainable Cost Prediction of Medical Insurance + + +
+ Predictive modeling in healthcare continues to be an active actuarial
+research topic as more insurance companies aim to maximize the potential of
+Machine Learning approaches to increase their productivity and efficiency. In
+this paper, the authors deployed three regression-based ensemble ML models
+that combine variations of decision trees (Extreme Gradient Boosting,
+Gradient-Boosting Machine, and Random Forest methods) for predicting medical
+insurance costs. Explainable Artificial Intelligence methods, SHapley
+Additive exPlanations (SHAP) and Individual Conditional Expectation (ICE)
+plots, were deployed to discover and explain the key determinant factors that
+influence medical insurance premium prices in the dataset. The dataset used
+comprised 986 records and is publicly available in the KAGGLE repository. The
+models were evaluated using four performance evaluation metrics: R-squared,
+Mean Absolute Error, Root Mean Squared Error, and Mean Absolute Percentage
+Error. The results show that all models produced impressive outcomes;
+however, the XGBoost model achieved a better overall performance although it
+also consumed more computational resources, while the RF model recorded a
+lower prediction error and consumed far fewer computing resources than the
+XGBoost model. Furthermore, we compared the outcomes of both XAI methods in
+identifying the key determinant features that influenced the PremiumPrices
+for each model. While both XAI methods produced similar outcomes, we found
+that the ICE plots showed the interactions between each variable in more
+detail than the SHAP analysis, which seemed to be more high-level. It is the
+aim of the authors that the contributions of this study will help
+policymakers, insurers, and potential medical insurance buyers in their
+decision-making process for selecting the right policies that meet their
+specific needs.
+
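+ A sketch of the modelling-and-explanation pipeline using the public XGBoost
+and SHAP libraries; the file name and column names are placeholders and the
+features are assumed to be numeric or already encoded.
+
+import pandas as pd
+import shap
+import xgboost as xgb
+from sklearn.model_selection import train_test_split
+
+df = pd.read_csv("insurance.csv")                       # hypothetical path
+X, y = df.drop(columns=["PremiumPrice"]), df["PremiumPrice"]
+X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
+
+model = xgb.XGBRegressor(n_estimators=300, max_depth=4)
+model.fit(X_tr, y_tr)
+
+explainer = shap.TreeExplainer(model)                   # per-feature attributions
+shap_values = explainer.shap_values(X_te)
+shap.summary_plot(shap_values, X_te)
+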
+
+ comment: 42 pages, 16 figures and 9 tables +
+
+
+
+
+ + ☆ Privacy-Preserving Algorithmic Recourse + + +
+ When individuals are subject to adverse outcomes from machine learning
+models, providing a recourse path to help achieve a positive outcome is
+desirable. Recent work has shown that counterfactual explanations - which can
+be used as a means of single-step recourse - are vulnerable to privacy
+issues, putting an individual's privacy at risk. Providing a sequential
+multi-step path for recourse can amplify this risk. Furthermore, simply
+adding noise to recourse paths found from existing methods can impact the
+realism and actionability of the path for an end-user. In this work, we
+address privacy issues when generating realistic recourse paths based on
+instance-based counterfactual explanations, and provide PrivRecourse: an
+end-to-end privacy-preserving pipeline that can provide realistic recourse
+paths. PrivRecourse uses differentially private (DP) clustering to represent
+non-overlapping subsets of the private dataset. These DP cluster centers are
+then used to generate recourse paths by forming a graph with cluster centers
+as the nodes, so that we can generate realistic - feasible and actionable -
+recourse paths. We empirically evaluate our approach on finance datasets and
+compare it to simply adding noise to data instances, and to using DP
+synthetic data, to generate the graph. We observe that PrivRecourse can
+provide paths that are private and realistic.
+
+
+ comment: Accepted at 3rd International Workshop on Explainable AI in Finance, + ICAIF 2023 +
+
+
+
+
+ + ☆ A Blockchain Solution for Collaborative Machine Learning over IoT + + +
+ The rapid growth of Internet of Things (IoT) devices and applications has led +to an increased demand for advanced analytics and machine learning techniques +capable of handling the challenges associated with data privacy, security, and +scalability. Federated learning (FL) and blockchain technologies have emerged +as promising approaches to address these challenges by enabling decentralized, +secure, and privacy-preserving model training on distributed data sources. In +this paper, we present a novel IoT solution that combines the incremental +learning vector quantization algorithm (XuILVQ) with Ethereum blockchain +technology to facilitate secure and efficient data sharing, model training, and +prototype storage in a distributed environment. Our proposed architecture +addresses the shortcomings of existing blockchain-based FL solutions by +reducing computational and communication overheads while maintaining data +privacy and security. We assess the performance of our system through a series +of experiments, showcasing its potential to enhance the accuracy and efficiency +of machine learning tasks in IoT settings. + +
+
+
+
+
+ + ☆ Exactly conservative physics-informed neural networks and deep operator + networks for dynamical systems + + +
+ We introduce a method for training exactly conservative physics-informed +neural networks and physics-informed deep operator networks for dynamical +systems. The method employs a projection-based technique that maps a candidate +solution learned by the neural network solver for any given dynamical system +possessing at least one first integral onto an invariant manifold. We +illustrate that exactly conservative physics-informed neural network solvers +and physics-informed deep operator networks for dynamical systems vastly +outperform their non-conservative counterparts for several real-world problems +from the mathematical sciences. + +
+
+ comment: 12 pages, 6 figures, 1 algorithm +
+
+
+
+
+ + ☆ Byzantine Robustness and Partial Participation Can Be Achieved + Simultaneously: Just Clip Gradient Differences + + +
+ Distributed learning has emerged as a leading paradigm for training large +machine learning models. However, in real-world scenarios, participants may be +unreliable or malicious, posing a significant challenge to the integrity and +accuracy of the trained models. Byzantine fault tolerance mechanisms have been +proposed to address these issues, but they often assume full participation from +all clients, which is not always practical due to the unavailability of some +clients or communication constraints. In our work, we propose the first +distributed method with client sampling and provable tolerance to Byzantine +workers. The key idea behind the developed method is the use of gradient +clipping to control stochastic gradient differences in recursive variance +reduction. This allows us to bound the potential harm caused by Byzantine +workers, even during iterations when all sampled clients are Byzantine. +Furthermore, we incorporate communication compression into the method to +enhance communication efficiency. Under quite general assumptions, we prove +convergence rates for the proposed method that match the existing +state-of-the-art (SOTA) theoretical results. + +
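+ The clipping of gradient differences at the heart of the method can be
+sketched as follows (NumPy); the clipping radius tau and the plain averaging
+of clipped differences are assumptions.
+
+import numpy as np
+
+def clip(v, tau):
+    norm = np.linalg.norm(v)
+    return v if norm <= tau else v * (tau / norm)
+
+def aggregate(prev_estimate, client_grads, prev_client_grads, tau):
+    # each sampled client reports g_i - g_i_prev; clipping bounds the damage a
+    # Byzantine worker can do even if every sampled client is malicious
+    diffs = [clip(g - gp, tau) for g, gp in zip(client_grads, prev_client_grads)]
+    return prev_estimate + np.mean(diffs, axis=0)
+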
+
+ comment: 50 pages; 1 figure +
+
+
+
+
+ + ☆ Towards Auditing Large Language Models: Improving Text-based Stereotype + Detection NeurIPS + + +
+ Large Language Models (LLM) have made significant advances in the recent past +becoming more mainstream in Artificial Intelligence (AI) enabled human-facing +applications. However, LLMs often generate stereotypical output inherited from +historical data, amplifying societal biases and raising ethical concerns. This +work introduces i) the Multi-Grain Stereotype Dataset, which includes 52,751 +instances of gender, race, profession and religion stereotypic text and ii) a +novel stereotype classifier for English text. We design several experiments to +rigorously test the proposed model trained on the novel dataset. Our +experiments show that training the model in a multi-class setting can +outperform the one-vs-all binary counterpart. Consistent feature importance +signals from different eXplainable AI tools demonstrate that the new model +exploits relevant text features. We utilise the newly created model to assess +the stereotypic behaviour of the popular GPT family of models and observe the +reduction of bias over time. In summary, our work establishes a robust and +practical framework for auditing and evaluating the stereotypic bias in LLM. + +
+
+ comment: 2023 NeurIPS SoLaR Workshop Accepted +
+
+
+
+
+ + ☆ Scalable AI Safety via Doubly-Efficient Debate + + +
+ The emergence of pre-trained AI systems with powerful capabilities across a +diverse and ever-increasing set of complex domains has raised a critical +challenge for AI safety as tasks can become too complicated for humans to judge +directly. Irving et al. [2018] proposed a debate method in this direction with +the goal of pitting the power of such AI models against each other until the +problem of identifying (mis)-alignment is broken down into a manageable +subtask. While the promise of this approach is clear, the original framework +was based on the assumption that the honest strategy is able to simulate +deterministic AI systems for an exponential number of steps, limiting its +applicability. In this paper, we show how to address these challenges by +designing a new set of debate protocols where the honest strategy can always +succeed using a simulation of a polynomial number of steps, whilst being able +to verify the alignment of stochastic AI systems, even when the dishonest +strategy is allowed to use exponentially many simulation steps. + +
+
+
+
+
+ + ☆ Weight fluctuations in (deep) linear neural networks and a derivation of + the inverse-variance flatness relation + + +
+ We investigate the stationary (late-time) training regime of single- and +two-layer linear neural networks within the continuum limit of stochastic +gradient descent (SGD) for synthetic Gaussian data. In the case of a +single-layer network in the weakly oversampled regime, the spectrum of the +noise covariance matrix deviates notably from the Hessian, which can be +attributed to the broken detailed balance of SGD dynamics. The weight +fluctuations are in this case generally anisotropic, but experience an +isotropic loss. For a two-layer network, we obtain the stochastic dynamics of +the weights in each layer and analyze the associated stationary covariances. We +identify the inter-layer coupling as a new source of anisotropy for the weight +fluctuations. In contrast to the single-layer case, the weight fluctuations +experience an anisotropic loss, the flatness of which is inversely related to +the fluctuation variance. We thereby provide an analytical derivation of the +recently observed inverse variance-flatness relation in a deep linear network +model. + +
+
+ comment: 25 pages, 7 figures +
+
+
+
+
+ + ☆ A density estimation perspective on learning from pairwise human + preferences + + +
+ Learning from human feedback (LHF) -- and in particular learning from +pairwise preferences -- has recently become a crucial ingredient in training +large language models (LLMs), and has been the subject of much research. Most +recent works frame it as a reinforcement learning problem, where a reward +function is learned from pairwise preference data and the LLM is treated as a +policy which is adapted to maximize the rewards, often under additional +regularization constraints. We propose an alternative interpretation which +centers on the generative process for pairwise preferences and treats LHF as a +density estimation problem. We provide theoretical and empirical results +showing that for a family of generative processes defined via preference +behavior distribution equations, training a reward function on pairwise +preferences effectively models an annotator's implicit preference distribution. +Finally, we discuss and present findings on "annotator misspecification" -- +failure cases where wrong modeling assumptions are made about annotator +behavior, resulting in poorly-adapted models -- suggesting that approaches that +learn from pairwise human preferences could have trouble learning from a +population of annotators with diverse viewpoints. + +
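+ For reference, the standard reward-modelling objective on pairwise
+preferences under a Bradley-Terry generative process, which the paper
+reinterprets as density estimation; the feature dimension and reward network
+are illustrative.
+
+import torch.nn as nn
+import torch.nn.functional as F
+
+reward = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 1))
+
+def preference_loss(chosen_feats, rejected_feats):
+    # P(chosen preferred over rejected) = sigmoid(r_chosen - r_rejected)
+    margin = reward(chosen_feats) - reward(rejected_feats)
+    return -F.logsigmoid(margin).mean()
+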
+
+
+
+
+ + ☆ SySMOL: A Hardware-software Co-design Framework for Ultra-Low and + Fine-Grained Mixed-Precision Neural Networks + + +
+ Recent advancements in quantization and mixed-precision techniques offer
+significant promise for improving the run-time and energy efficiency of
+neural networks. In this work, we further show that neural networks, wherein
+individual parameters or activations can take on different precisions ranging
+between 1 and 4 bits, can achieve accuracies comparable to or exceeding their
+full-precision counterparts. However, the deployment of such networks poses
+numerous challenges, stemming from the necessity to manage and control the
+compute/communication/storage requirements associated with these extremely
+fine-grained mixed precisions for each piece of data. There is a lack of
+existing efficient hardware and system-level support tailored to these unique
+and challenging requirements. Our research introduces the first novel
+holistic hardware-software co-design approach for these networks, which
+enables a continuous feedback loop between hardware design, training, and
+inference to facilitate systematic design exploration. As a proof of concept,
+we illustrate this co-design approach by designing new, configurable CPU SIMD
+architectures tailored for these networks, tightly integrating the
+architecture with new system-aware training and inference techniques. We
+perform systematic design space exploration using this framework to analyze
+various tradeoffs. The design for mixed-precision networks that achieves
+optimized tradeoffs corresponds to an architecture that supports 1-, 2-, and
+4-bit fixed-point operations with four configurable precision patterns, when
+coupled with system-aware training and inference optimization. Networks
+trained for this design achieve accuracies that closely match full-precision
+accuracies, while drastically compressing the networks and improving their
+run-time efficiency by 10-20x compared to full-precision networks.
+
+
+
+
+
+ + ☆ When is Off-Policy Evaluation Useful? A Data-Centric Perspective + + +
+ Evaluating the value of a hypothetical target policy with only a logged +dataset is important but challenging. On the one hand, it brings opportunities +for safe policy improvement under high-stakes scenarios like clinical +guidelines. On the other hand, such opportunities raise a need for precise +off-policy evaluation (OPE). While previous work on OPE focused on improving +the algorithm in value estimation, in this work, we emphasize the importance of +the offline dataset, hence putting forward a data-centric framework for +evaluating OPE problems. We propose DataCOPE, a data-centric framework for +evaluating OPE, that answers the questions of whether and to what extent we can +evaluate a target policy given a dataset. DataCOPE (1) forecasts the overall +performance of OPE algorithms without access to the environment, which is +especially useful before real-world deployment where evaluating OPE is +impossible; (2) identifies the sub-group in the dataset where OPE can be +inaccurate; (3) permits evaluations of datasets or data-collection strategies +for OPE problems. Our empirical analysis of DataCOPE in the logged contextual +bandit settings using healthcare datasets confirms its ability to evaluate both +machine-learning and human expert policies like clinical guidelines. + +
+
+ comment: Off-Policy Evaluation, Data-Centric AI, Data-Centric Reinforcement + Learning, Reinforcement Learning +
+
+
+
+
+ + ☆ MINTY: Rule-based Models that Minimize the Need for Imputing Features + with Missing Values + + +
+ Rule models are often preferred in prediction tasks with tabular inputs as +they can be easily interpreted using natural language and provide predictive +performance on par with more complex models. However, most rule models' +predictions are undefined or ambiguous when some inputs are missing, forcing +users to rely on statistical imputation models or heuristics like zero +imputation, undermining the interpretability of the models. In this work, we +propose fitting concise yet precise rule models that learn to avoid relying on +features with missing values and, therefore, limit their reliance on imputation +at test time. We develop MINTY, a method that learns rules in the form of +disjunctions between variables that act as replacements for each other when one +or more is missing. This results in a sparse linear rule model, regularized to +have small dependence on features with missing values, that allows a trade-off +between goodness of fit, interpretability, and robustness to missing values at +test time. We demonstrate the value of MINTY in experiments using synthetic and +real-world data sets and find its predictive performance comparable or +favorable to baselines, with smaller reliance on features with missing values. + +
+
+
+
+
+ + ☆ Subnetwork Ensembles + + +
+ Neural network ensembles have been effectively used to improve generalization +by combining the predictions of multiple independently trained models. However, +the growing scale and complexity of deep neural networks have led to these +methods becoming prohibitively expensive and time consuming to implement. +Low-cost ensemble methods have become increasingly important as they can +alleviate the need to train multiple models from scratch while retaining the +generalization benefits that traditional ensemble learning methods afford. This +dissertation introduces and formalizes a low-cost framework for constructing +Subnetwork Ensembles, where a collection of child networks are formed by +sampling, perturbing, and optimizing subnetworks from a trained parent model. +We explore several distinct methodologies for generating child networks and we +evaluate their efficacy through a variety of ablation studies and established +benchmarks. Our findings reveal that this approach can greatly improve training +efficiency, parametric utilization, and generalization performance while +minimizing computational cost. Subnetwork Ensembles offer a compelling +framework for exploring how we can build better systems by leveraging the +unrealized potential of deep neural networks. + +
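+ Child-network sampling can be sketched as below (PyTorch): children are
+drawn from a trained parent by random weight masking and their predictions
+averaged; the masking rate is an assumption and the perturbation/optimization
+stages are omitted.
+
+import copy
+import torch
+
+def sample_child(parent, keep=0.8):
+    child = copy.deepcopy(parent)
+    with torch.no_grad():
+        for p in child.parameters():
+            p.mul_((torch.rand_like(p) < keep).float())   # zero ~20% of weights
+    return child
+
+def ensemble_predict(children, x):
+    with torch.no_grad():
+        return torch.stack([c(x).softmax(-1) for c in children]).mean(0)
+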
+
+ comment: 116 Pages, 21 figures, Accepted PhD Dissertation +
+
+
+
+
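A minimal sketch of the sample-and-perturb step described above, assuming a PyTorch parent model; the masking scheme, noise scale, and the brief re-optimization of children that would normally follow are illustrative choices, not the dissertation's exact procedure.

import copy
import torch

def sample_child(parent: torch.nn.Module, keep_prob: float = 0.5, noise_std: float = 0.01):
    """Form a child network by sampling a random subnetwork of a trained parent
    and lightly perturbing the surviving weights."""
    child = copy.deepcopy(parent)
    with torch.no_grad():
        for p in child.parameters():
            mask = (torch.rand_like(p) < keep_prob).float()
            p.mul_(mask).add_(noise_std * torch.randn_like(p) * mask)
    return child

parent = torch.nn.Sequential(torch.nn.Linear(8, 16), torch.nn.ReLU(), torch.nn.Linear(16, 2))
ensemble = [sample_child(parent) for _ in range(4)]
logits = torch.stack([m(torch.randn(5, 8)) for m in ensemble]).mean(dim=0)  # averaged ensemble prediction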
+ + ☆ Robust Decision Aggregation with Second-order Information + + +
+ We consider a decision aggregation problem with two experts who each make a +binary recommendation after observing a private signal about an unknown binary +world state. An agent, who does not know the joint information structure +between signals and states, sees the experts' recommendations and aims to match +the action with the true state. In this setting, we study whether additionally +supplying second-order information (each expert's forecast of the other's +recommendation) enables better aggregation. + We adopt a minimax regret framework to evaluate the aggregator's performance, +by comparing it to an omniscient benchmark that knows the joint information +structure. With general information structures, we show that second-order +information provides no benefit. No aggregator can improve over a trivial +aggregator, which always follows the first expert's recommendation. However, +positive results emerge when we assume experts' signals are conditionally +independent given the world state. First, when the aggregator is deterministic, +we present a robust aggregator that leverages second-order information, which +can significantly outperform counterparts without it. Second, when two experts +are homogeneous, by adding a non-degenerate assumption on the signals, we +demonstrate that random aggregators using second-order information can surpass +optimal ones without it. In the remaining settings, the second-order +information is not beneficial. We also extend the above results to the setting +in which the aggregator's utility function is more general. + 
+
+
+
+
+ + ☆ Class Uncertainty: A Measure to Mitigate Class Imbalance + + +
+ Class-wise characteristics of training examples affect the performance of +deep classifiers. A well-studied example is when the number of training +examples of classes follows a long-tailed distribution, a situation that is +likely to yield sub-optimal performance for under-represented classes. This +class imbalance problem is conventionally addressed by approaches relying on +the class-wise cardinality of training examples, such as data resampling. In +this paper, we demonstrate that considering solely the cardinality of classes +does not cover all issues causing class imbalance. To measure class imbalance, +we propose "Class Uncertainty" as the average predictive uncertainty of the +training examples, and we show that this novel measure captures the differences +across classes better than cardinality. We also curate SVCI-20 as a novel +dataset in which the classes have an equal number of training examples but +differ in their hardness, thereby causing a type of class imbalance which +cannot be addressed by the approaches relying on cardinality. We +incorporate our "Class Uncertainty" measure into a diverse set of ten class +imbalance mitigation methods to demonstrate its effectiveness on long-tailed +datasets as well as on our SVCI-20. Code and datasets will be made available. + 
+
+
+
+
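A small sketch of the measure as described in the abstract: the average predictive uncertainty of the training examples of each class. Using entropy as the per-example uncertainty score is our assumption.

import torch

def class_uncertainty(probs: torch.Tensor, labels: torch.Tensor, num_classes: int) -> torch.Tensor:
    """Average predictive entropy of the training examples of each class."""
    p = probs.clamp_min(1e-12)
    entropy = -(p * p.log()).sum(dim=1)          # per-example uncertainty, shape (N,)
    scores = torch.zeros(num_classes)
    for c in range(num_classes):
        scores[c] = entropy[labels == c].mean()  # higher = "harder", more uncertain class
    return scores

probs = torch.softmax(torch.randn(1000, 10), dim=1)  # stand-in for model predictions
labels = torch.randint(0, 10, (1000,))
print(class_uncertainty(probs, labels, 10))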
+ + ♻ ☆ Kernel-Based Tests for Likelihood-Free Hypothesis Testing + + +
+ Given $n$ observations from two balanced classes, consider the task of +labeling an additional $m$ inputs that are known to all belong to \emph{one} of +the two classes. Special cases of this problem are well-known: with complete +knowledge of class distributions ($n=\infty$) the problem is solved optimally +by the likelihood-ratio test; when $m=1$ it corresponds to binary +classification; and when $m\approx n$ it is equivalent to two-sample testing. +The intermediate settings occur in the field of likelihood-free inference, +where labeled samples are obtained by running forward simulations and the +unlabeled sample is collected experimentally. In recent work it was discovered +that there is a fundamental trade-off between $m$ and $n$: increasing the data +sample $m$ reduces the amount $n$ of training/simulation data needed. In this +work we (a) introduce a generalization where unlabeled samples come from a +mixture of the two classes -- a case often encountered in practice; (b) study +the minimax sample complexity for non-parametric classes of densities under +\textit{maximum mean discrepancy} (MMD) separation; and (c) investigate the +empirical performance of kernels parameterized by neural networks on two tasks: +detection of the Higgs boson and detection of planted DDPM generated images +amidst CIFAR-10 images. For both problems we confirm the existence of the +theoretically predicted asymmetric $m$ vs $n$ trade-off. + +
+
+ comment: 36 pages, 6 figures +
+
+
+
+
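For reference, a minimal estimator of the squared maximum mean discrepancy with an RBF kernel; the paper additionally studies kernels parameterized by neural networks, which are not shown here.

import torch

def rbf_kernel(x, y, sigma=1.0):
    d2 = torch.cdist(x, y).pow(2)
    return torch.exp(-d2 / (2 * sigma ** 2))

def mmd2(x, y, sigma=1.0):
    """Biased estimate of squared MMD between samples x and y."""
    kxx = rbf_kernel(x, x, sigma).mean()
    kyy = rbf_kernel(y, y, sigma).mean()
    kxy = rbf_kernel(x, y, sigma).mean()
    return kxx + kyy - 2 * kxy

x, y = torch.randn(200, 5), torch.randn(200, 5) + 0.5
print(mmd2(x, y).item())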
+ + ♻ ☆ Automatically Score Tissue Images Like a Pathologist by Transfer + Learning + + +
+ Cancer is the second leading cause of death in the world. Diagnosing cancer +early on can save many lives. Pathologists have to look at tissue microarray +(TMA) images manually to identify tumors, which can be time-consuming, +inconsistent and subjective. Existing automatic algorithms either have not +achieved the accuracy level of a pathologist or require substantial human +involvement. A major challenge is that TMA images with different shapes, +sizes, and locations can have the same score. Learning staining patterns in TMA +images requires a huge number of images, which are severely limited due to +privacy and regulation concerns in medical organizations. TMA images from +different cancer types may share certain common characteristics, but combining +them directly harms the accuracy due to heterogeneity in their staining +patterns. Transfer learning is an emerging learning paradigm that allows +borrowing strength from similar problems. However, existing approaches +typically require a large sample from similar learning problems, while TMA +images of different cancer types are often available only in small sample +sizes; furthermore, existing algorithms are limited to transferring from a +single similar problem. We propose a new transfer learning algorithm that could learn from +multiple related problems, where each problem has a small sample and can have a +substantially different distribution from the original one. The proposed +algorithm has made it possible to break the critical accuracy barrier (the 75% +accuracy level of pathologists), with a reported accuracy of 75.9% on breast +cancer TMA images from the Stanford Tissue Microarray Database. It is supported +by recent developments in transfer learning theory and empirical evidence in +clustering technology. This will allow pathologists to confidently adopt +automatic algorithms in recognizing tumors consistently with a higher accuracy +in real time. + 
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Equivariant flow matching + + +
+ Normalizing flows are a class of deep generative models that are especially +interesting for modeling probability distributions in physics, where the exact +likelihood of flows allows reweighting to known target energy functions and +computing unbiased observables. For instance, Boltzmann generators tackle the +long-standing sampling problem in statistical physics by training flows to +produce equilibrium samples of many-body systems such as small molecules and +proteins. To build effective models for such systems, it is crucial to +incorporate the symmetries of the target energy into the model, which can be +achieved by equivariant continuous normalizing flows (CNFs). However, CNFs can +be computationally expensive to train and generate samples from, which has +hampered their scalability and practical application. In this paper, we +introduce equivariant flow matching, a new training objective for equivariant +CNFs that is based on the recently proposed optimal transport flow matching. +Equivariant flow matching exploits the physical symmetries of the target energy +for efficient, simulation-free training of equivariant CNFs. We demonstrate the +effectiveness of flow matching on rotation and permutation invariant +many-particle systems and a small molecule, alanine dipeptide, where for the +first time we obtain a Boltzmann generator with significant sampling efficiency +without relying on tailored internal coordinate featurization. Our results show +that the equivariant flow matching objective yields flows with shorter +integration paths, improved sampling efficiency, and higher scalability +compared to existing methods. + +
+
+
+
+
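A simplified sketch of the underlying flow-matching objective with linear interpolation paths; the equivariance of the network, the optimal-transport pairing of samples, and the physical target energies used in the paper are all omitted here.

import torch

def flow_matching_loss(v_theta, x0, x1):
    """Simulation-free flow-matching loss on linear paths:
    x_t = (1 - t) * x0 + t * x1 with target velocity u_t = x1 - x0."""
    t = torch.rand(x0.shape[0], 1)
    xt = (1 - t) * x0 + t * x1
    target = x1 - x0
    return ((v_theta(xt, t) - target) ** 2).mean()

net = torch.nn.Sequential(torch.nn.Linear(3, 64), torch.nn.SiLU(), torch.nn.Linear(64, 2))
v_theta = lambda x, t: net(torch.cat([x, t], dim=1))  # time-conditioned velocity field
x0 = torch.randn(128, 2)             # prior samples
x1 = torch.randn(128, 2) * 0.3 + 1   # stand-in for target (e.g. Boltzmann) samples
loss = flow_matching_loss(v_theta, x0, x1)
loss.backward()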
+ + ♻ ☆ Are "Hierarchical" Visual Representations Hierarchical? + + +
+ Learned visual representations often capture large amounts of semantic +information for accurate downstream applications. Human understanding of the +world is fundamentally grounded in hierarchy. To mimic this and further improve +representation capabilities, the community has explored "hierarchical" visual +representations that aim at modeling the underlying hierarchy of the visual +world. In this work, we set out to investigate if hierarchical visual +representations truly capture the human perceived hierarchy better than +standard learned representations. To this end, we create HierNet, a suite of 12 +datasets spanning 3 kinds of hierarchy from the BREEDs subset of ImageNet. +After extensive evaluation of Hyperbolic and Matryoshka Representations across +training setups, we conclude that they do not capture hierarchy any better than +the standard representations but can assist in other aspects like search +efficiency and interpretability. Our benchmark and the datasets are +open-sourced at https://github.com/ethanlshen/HierNet. + +
+
+
+
+
+ + ♻ ☆ Designing and evaluating an online reinforcement learning agent for + physical exercise recommendations in N-of-1 trials + + +
+ Personalized adaptive interventions offer the opportunity to increase patient +benefits; however, there are challenges in their planning and implementation. +Once implemented, it is an important question whether personalized adaptive +interventions are indeed clinically more effective compared to a fixed gold +standard intervention. In this paper, we present an innovative N-of-1 trial +study design testing whether implementing a personalized intervention by an +online reinforcement learning agent is feasible and effective. Throughout, we +use a new study on physical exercise recommendations to reduce pain in +endometriosis for illustration. We describe the design of a contextual bandit +recommendation agent and evaluate the agent in simulation studies. The results +show that, first, implementing a personalized intervention by an online +reinforcement learning agent is feasible. Second, such adaptive interventions +have the potential to improve patients' benefits even if only a few +observations are available. As one challenge, they add complexity to the design and +implementation process. In order to quantify the expected benefit, data from +previous interventional studies is required. We expect our approach to be +transferable to other clinical interventions. + 
+
+
+
+
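A generic contextual-bandit recommendation agent (disjoint LinUCB) to make the setup concrete; the actual agent, contexts, and reward definition used in the study may differ.

import numpy as np

class LinUCB:
    """Minimal disjoint LinUCB for n_arms arms with d-dimensional contexts."""
    def __init__(self, n_arms, d, alpha=1.0):
        self.alpha = alpha
        self.A = [np.eye(d) for _ in range(n_arms)]   # per-arm ridge Gram matrices
        self.b = [np.zeros(d) for _ in range(n_arms)]

    def select(self, x):
        scores = []
        for A, b in zip(self.A, self.b):
            A_inv = np.linalg.inv(A)
            theta = A_inv @ b
            scores.append(theta @ x + self.alpha * np.sqrt(x @ A_inv @ x))  # mean + exploration bonus
        return int(np.argmax(scores))

    def update(self, arm, x, reward):
        self.A[arm] += np.outer(x, x)
        self.b[arm] += reward * x

agent = LinUCB(n_arms=3, d=4)
for _ in range(100):
    x = np.random.randn(4)                        # daily context (e.g. reported symptoms)
    a = agent.select(x)
    r = float(np.random.rand() < 0.3 + 0.1 * a)   # toy reward model
    agent.update(a, x, r)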
+ + ♻ ☆ Zero Coordinate Shift: Whetted Automatic Differentiation for + Physics-informed Operator Learning + + +
+ Automatic differentiation (AD) is a critical step in physics-informed machine +learning, required for computing the high-order derivatives of network output +w.r.t. coordinates of collocation points. In this paper, we present a novel and +lightweight algorithm to conduct AD for physics-informed operator learning, +which we call the trick of Zero Coordinate Shift (ZCS). Instead of treating all +sampled coordinates as leaf variables, ZCS introduces only one scalar-valued +leaf variable for each spatial or temporal dimension, simplifying the required +derivatives from "many-roots-many-leaves" to "one-root-many-leaves" whereby +reverse-mode AD becomes directly utilisable. It has led to an outstanding +performance leap by avoiding the duplication of the computational graph along +the dimension of functions (physical parameters). ZCS is easy to implement with +current deep learning libraries; our own implementation is achieved by +extending the DeepXDE package. We carry out a comprehensive benchmark analysis +and several case studies, training physics-informed DeepONets to solve partial +differential equations (PDEs) without data. The results show that ZCS has +persistently reduced GPU memory consumption and wall time for training by an +order of magnitude, and this reduction factor scales with the number of +functions. As a low-level optimisation technique, ZCS imposes no restrictions +on data, physics (PDE) or network architecture and does not compromise training +results from any aspect. + 
+
+ comment: 19 pages; this minor revision gives clearer explanation on the reason + of performance boost by ZCS +
+
+
+
+
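A minimal PyTorch illustration of the "one-root-many-leaves" idea: a single scalar leaf z is added to all coordinates, and the per-point derivatives du_i/dx_i = du_i/dz are recovered with a standard double-backward trick. This is only our reading of the mechanism on a toy 1-D network, not the authors' DeepXDE implementation.

import torch

def dudx_via_scalar_shift(model, x):
    """Per-point derivatives du_i/dx_i recovered from a single scalar leaf variable."""
    z = torch.zeros(1, requires_grad=True)        # the "zero coordinate shift"
    u = model(x + z)                              # u_i depends on x_i only through x_i + z
    a = torch.ones_like(u, requires_grad=True)    # dummy cotangent vector
    g = torch.autograd.grad((u * a).sum(), z, create_graph=True)[0]  # g = sum_i a_i * du_i/dz
    dudz = torch.autograd.grad(g.sum(), a)[0]     # dg/da_i = du_i/dz = du_i/dx_i
    return dudz

model = torch.nn.Sequential(torch.nn.Linear(1, 32), torch.nn.Tanh(), torch.nn.Linear(32, 1))
x = torch.linspace(0, 1, 64).unsqueeze(1)         # collocation points (not leaf variables)
print(dudx_via_scalar_shift(model, x).shape)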
+ + ♻ ☆ minimax: Efficient Baselines for Autocurricula in JAX + + +
+ Unsupervised environment design (UED) is a form of automatic curriculum +learning for training robust decision-making agents to zero-shot transfer into +unseen environments. Such autocurricula have received much interest from the RL +community. However, UED experiments, based on CPU rollouts and GPU model +updates, have often required several weeks of training. This compute +requirement is a major obstacle to rapid innovation for the field. This work +introduces the minimax library for UED training on accelerated hardware. Using +JAX to implement fully-tensorized environments and autocurriculum algorithms, +minimax allows the entire training loop to be compiled for hardware +acceleration. To provide a petri dish for rapid experimentation, minimax +includes a tensorized grid-world based on MiniGrid, in addition to reusable +abstractions for conducting autocurricula in procedurally-generated +environments. With these components, minimax provides strong UED baselines, +including new parallelized variants, which achieve over 120$\times$ speedups in +wall time compared to previous implementations when training with equal batch +sizes. The minimax library is available under the Apache 2.0 license at +https://github.com/facebookresearch/minimax. + +
+
+ comment: Presented at ALOE 2023 +
+
+
+
+
+ + ♻ ☆ The SocialAI School: Insights from Developmental Psychology Towards + Artificial Socio-Cultural Agents ICML 2023 + + +
+ Developmental psychologists have long established the importance of +socio-cognitive abilities in human intelligence. These abilities enable us to +enter, participate in, and benefit from human culture. AI research on social +interactive agents mostly concerns the emergence of culture in a multi-agent +setting (often without a strong grounding in developmental psychology). We +argue that AI research should be informed by psychology and should also study +the socio-cognitive abilities that enable entering a culture. We discuss the +theories of Michael Tomasello and Jerome Bruner to introduce some of their +concepts to AI and outline key concepts and socio-cognitive abilities. We +present The SocialAI school - a tool including a customizable parameterized +suite of procedurally generated environments, which simplifies conducting +experiments regarding those concepts. We show examples of such experiments with +RL agents and Large Language Models. The main motivation of this work is to +engage the AI community around the problem of social intelligence informed by +developmental psychology, and to provide a tool to simplify first steps in this +direction. Refer to the project website for code and additional information: +https://sites.google.com/view/socialai-school. + 
+
+ comment: Preprint, see v1 for a shorter version (accepted at the "Workshop on + Theory-of-Mind" at ICML 2023) See project website for demo and code: + https://sites.google.com/view/socialai-school +
+
+
+
+
+ + ♻ ☆ MUVO: A Multimodal Generative World Model for Autonomous Driving with + Geometric Representations + + +
+ Learning unsupervised world models for autonomous driving has the potential +to improve the reasoning capabilities of today's systems dramatically. However, +most work neglects the physical attributes of the world and focuses on sensor +data alone. We propose MUVO, a MUltimodal World Model with Geometric VOxel +Representations to address this challenge. We utilize raw camera and lidar data +to learn a sensor-agnostic geometric representation of the world, which can +directly be used by downstream tasks, such as planning. We demonstrate +multimodal future predictions and show that our geometric representation +improves the prediction quality of both camera images and lidar point clouds. + +
+
+ comment: Daniel Bogdoll and Yitian Yang contributed equally +
+
+
+
+
+ + ♻ ☆ Evaluating Object (mis)Detection from a Safety and Reliability + Perspective: Discussion and Measures + + +
+ We argue that object detectors in the safety-critical domain should +prioritize detection of objects that are most likely to interfere with the +actions of the autonomous actor. This applies especially to objects that can +impact the actor's safety and reliability. To quantify the impact of object +(mis)detection on safety and reliability in the context of autonomous driving, +we propose new object detection measures that reward the correct identification +of objects that are most dangerous and most likely to affect driving decisions. +To achieve this, we build an object criticality model to reward the detection +of the objects based on proximity, orientation, and relative velocity with +respect to the subject vehicle. Then, we apply our model on the recent +autonomous driving dataset nuScenes, and we compare nine object detectors. +Results show that, in several settings, object detectors that perform best +according to the nuScenes ranking are not the preferable ones when the focus is +shifted to safety and reliability. + 
+
+ comment: journal version, open access +
+
+
+
+
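The abstract does not give the functional form of the criticality model, so the following weight, which decays with distance and grows when an object closes in on the subject vehicle, is purely illustrative of how detections could be re-weighted in a safety-aware metric.

import numpy as np

def criticality(rel_pos, rel_vel, d0=10.0):
    """Illustrative criticality in (0, 1): near objects and objects approaching
    the subject vehicle (positive closing speed) receive more weight."""
    dist = np.linalg.norm(rel_pos)
    closing_speed = -np.dot(rel_vel, rel_pos) / (dist + 1e-6)  # > 0 if approaching
    proximity = np.exp(-dist / d0)
    approach = 1.0 / (1.0 + np.exp(-closing_speed))            # squash to (0, 1)
    return proximity * approach

# weight a detector's per-object score by criticality before aggregating the metric
w = criticality(rel_pos=np.array([8.0, 1.0]), rel_vel=np.array([-3.0, 0.0]))
print(round(w, 3))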
+ + ♻ ☆ Query-Policy Misalignment in Preference-Based Reinforcement Learning + + +
+ Preference-based reinforcement learning (PbRL) provides a natural way to +align RL agents' behavior with human desired outcomes, but is often restrained +by costly human feedback. To improve feedback efficiency, most existing PbRL +methods focus on selecting queries to maximally improve the overall quality of +the reward model, but counter-intuitively, we find that this may not +necessarily lead to improved performance. To unravel this mystery, we identify +a long-neglected issue in the query selection schemes of existing PbRL studies: +Query-Policy Misalignment. We show that the seemingly informative queries +selected to improve the overall quality of reward model actually may not align +with RL agents' interests, thus offering little help on policy learning and +eventually resulting in poor feedback efficiency. We show that this issue can +be effectively addressed via near on-policy query and a specially designed +hybrid experience replay, which together enforce the bidirectional query-policy +alignment. Simple yet elegant, our method can be easily incorporated into +existing approaches by changing only a few lines of code. We showcase in +comprehensive experiments that our method achieves substantial gains in both +human feedback and RL sample efficiency, demonstrating the importance of +addressing query-policy misalignment in PbRL tasks. + +
+
+
+
+
+ + ♻ ☆ Weighted Joint Maximum Mean Discrepancy Enabled + Multi-Source-Multi-Target Unsupervised Domain Adaptation Fault Diagnosis + + +
+ Despite the remarkable results that can be achieved by data-driven +intelligent fault diagnosis techniques, they presuppose the same distribution +of training and test data as well as sufficient labeled data. Various operating +states often exist in practical scenarios, leading to the problem of domain +shift that hinders the effectiveness of fault diagnosis. While recent +unsupervised domain adaptation methods enable cross-domain fault diagnosis, +they struggle to effectively utilize information from multiple source domains +and to achieve effective fault diagnosis in multiple target domains +simultaneously. In this paper, we propose a weighted joint +maximum mean discrepancy enabled multi-source-multi-target unsupervised domain +adaptation method (WJMMD-MDA), which realizes domain adaptation under +multi-source-multi-target scenarios in the field of fault diagnosis for the +first time. The proposed method extracts sufficient information from multiple +labeled source domains and achieves domain alignment between source and target +domains through an improved weighted distance loss. As a result, +domain-invariant and discriminative features between multiple source and target +domains are learned and cross-domain fault diagnosis is realized. The performance +of the proposed method is evaluated in comprehensive comparative experiments on +three datasets, and the experimental results demonstrate the superiority of +this method. + 
+
+
+
+
+ + ♻ ☆ YFlows: Systematic Dataflow Exploration and Code Generation for + Efficient Neural Network Inference using SIMD Architectures on CPUs + + +
+ We address the challenges associated with deploying neural networks on CPUs, +with a particular focus on minimizing inference time while maintaining +accuracy. Our novel approach is to use the dataflow (i.e., computation order) +of a neural network to explore data reuse opportunities using heuristic-guided +analysis and a code generation framework, which enables exploration of various +Single Instruction, Multiple Data (SIMD) implementations to achieve optimized +neural network execution. Our results demonstrate that the dataflow that keeps +outputs in SIMD registers while also maximizing both input and weight reuse +consistently yields the best performance for a wide variety of inference +workloads, achieving up to 3x speedup for 8-bit neural networks, and up to 4.8x +speedup for binary neural networks, respectively, over the optimized +implementations of neural networks today. + +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ PortfolioMentor: Multimodal Generative AI Companion for Learning and + Crafting Interactive Digital Art Portfolios + + +
+ Digital art portfolios serve as impactful mediums for artists to convey their +visions, weaving together visuals, audio, interactions, and narratives. +However, without technical backgrounds, design students often find it +challenging to translate creative ideas into tangible codes and designs, given +the lack of tailored resources for the non-technical, academic support in art +schools, and a comprehensive guiding tool throughout the mentally demanding +process. Recognizing the role of companionship in code learning and leveraging +generative AI models' capabilities in supporting creative tasks, we present +PortfolioMentor, a coding companion chatbot for IDEs. This tool guides and +collaborates with students through proactive suggestions and responsible Q&As +for learning, inspiration, and support. In detail, the system starts with the +understanding of the task and artist's visions, follows the co-creation of +visual illustrations, audio or music suggestions and files, click-scroll +effects for interactions, and creative vision conceptualization, and finally +synthesizes these facets into a polished interactive digital portfolio. + +
+
+ comment: 3 pages, 1 figure, work in progress +
+
+
+
+
+ + ☆ Electric Network Frequency Optical Sensing Devices + + +
+ Electric Network Frequency (ENF) acts as a fingerprint in multimedia +forensics applications. In indoor environments, ENF variations affect the +intensity of light sources connected to power mains. Accordingly, the light +intensity variations captured by sensing devices can be exploited to estimate +the ENF. A first optical sensing device based on a photodiode is developed for +capturing ENF variations in indoor lighting environments. In addition, a device +that captures the ENF directly from power mains is implemented. This device +serves as a ground truth ENF collector. Video recordings captured by a camera +are also employed to estimate the ENF. The camera serves as a second optical +sensor. The factors affecting the ENF estimation are thoroughly studied. The +maximum correlation coefficient between the ENF estimated by the two optical +sensors and that estimated directly from power mains is used to measure the +estimation accuracy. The paper's major contribution is in the disclosure of +extensive experimental evidence on ENF estimation in scenes ranging from static +ones capturing a white wall to non-static ones, including human activity. + +
+
+
+
+
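A rough sketch of the kind of processing involved: track the dominant spectral peak near the nominal mains frequency (or one of its harmonics) over time and compare traces via the correlation coefficient. The window sizes, nominal frequency, and synthetic signals below are illustrative assumptions, not the devices or parameters used in the paper.

import numpy as np
from scipy.signal import stft

def estimate_enf(signal, fs, nominal=50.0, band=1.0, nperseg=4096):
    """Track the dominant spectral peak near the nominal mains frequency."""
    f, t, Z = stft(signal, fs=fs, nperseg=nperseg)
    mask = (f > nominal - band) & (f < nominal + band)
    peak = np.abs(Z[mask, :]).argmax(axis=0)
    return f[mask][peak]                          # ENF trace over time

fs = 1000.0
t = np.arange(0, 60, 1 / fs)
light = np.sin(2 * np.pi * (50 + 0.02 * np.sin(2 * np.pi * 0.1 * t)) * t)  # toy light-intensity signal
mains = light + 0.1 * np.random.randn(t.size)                              # stand-in ground-truth recording
rho = np.corrcoef(estimate_enf(light, fs), estimate_enf(mains, fs))[0, 1]  # correlation-based accuracy proxy
print(rho)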
+ + ☆ Weakly-Supervised Video Moment Retrieval via Regularized Two-Branch + Proposal Networks with Erasing Mechanism + + +
+ Video moment retrieval aims to identify the target moment in an untrimmed +video according to a given sentence. Because temporal boundary annotations of +the video are extremely time-consuming to acquire, modeling in the +weakly-supervised setting, where we only have access to the video-sentence +pairs during training, has received increasing attention. Most existing weakly-supervised +methods adopt a MIL-based framework to develop inter-sample confrontment, but +neglect the intra-sample confrontment between moments with similar semantics. +Therefore, these methods fail to distinguish the correct moment from plausible +negative moments. Further, the previous attention models in cross-modal +interaction tend to focus excessively on a few dominant words, ignoring the +comprehensive video-sentence correspondence. In this paper, we propose a novel +Regularized Two-Branch Proposal Network with Erasing Mechanism to consider the +inter-sample and intra-sample confrontments simultaneously. Concretely, we +first devise a language-aware visual filter to generate both enhanced and +suppressed video streams. Then, we design the sharable two-branch proposal +module to generate positive and plausible negative proposals from the enhanced +and suppressed branch respectively, contributing to sufficient confrontment. +Besides, we introduce an attention-guided dynamic erasing mechanism in the enhanced +branch to discover the complementary video-sentence relation. Moreover, we +apply two types of proposal regularization to stabilize the training process +and improve model performance. The extensive experiments on ActivityCaption, +Charades-STA and DiDeMo datasets show the effectiveness of our method. + 
+
+
+
+
+ + ☆ Archiving Body Movements: Collective Generation of Chinese Calligraphy + + +
+ As a communication channel, body movements have been widely explored in +behavioral studies and kinesics. Performing and visual arts share the same +interests but focus on documenting and representing human body movements, such +as for dance notation and visual work creation. This paper investigates body +movements in oriental calligraphy and how to apply calligraphy principles to +stimulate and archive body movements. Through an artwork (Wushu), the authors +experiment with an interactive and generative approach to engage the audience's +bodily participation and archive the body movements as a compendium of +generated calligraphy. The audience assumes the role of both writers and +readers; creating ("writing") and appreciating ("reading") the generated +calligraphy becomes a cyclical process within this infinite "Book," which can +motivate further attention and discussions concerning Chinese characters and +calligraphy. + +
+
+ comment: 8 pages, 8 figures +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 45 + +
+
+
+ + ☆ PaSS: Parallel Speculative Sampling SP + + +
+ Scaling the size of language models to tens of billions of parameters has led +to impressive performance on a wide range of tasks. At generation, these models +are used auto-regressively, requiring a forward pass for each generated token, +and thus reading the full set of parameters from memory. This memory access +forms the primary bottleneck for generation and it worsens as the model size +increases. Moreover, executing a forward pass for multiple tokens in parallel +often takes nearly the same time as it does for just one token. These two +observations led to the development of speculative sampling, where a second +smaller model is used to draft a few tokens, which are then validated or +rejected using a single forward pass of the large model. Unfortunately, this +method requires two models that share the same tokenizer and thus limits its +adoption. As an alternative, we propose to use parallel decoding as a way to +draft multiple tokens from a single model without additional computational cost +or the need for a second model. Our approach only requires an additional input token +that marks the words that will be generated simultaneously. We show promising +performance (up to $30\%$ speed-up) while requiring only as few as $O(d_{emb})$ +additional parameters. + 
+
+ comment: Accepted at the 3rd workshop on Efficient Natural Language and Speech + Processing (ENLSP, NeurIPS 2023) +
+
+
+
+
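To illustrate the general draft-then-verify loop that speculative approaches rely on, here is a greedy-verification sketch; PaSS's look-ahead input tokens and its probabilistic acceptance rule are not reproduced, and the drafting function and toy model are stand-ins.

import torch

@torch.no_grad()
def draft_then_verify(model, draft_fn, prefix, k=4):
    """Generic draft-then-verify step (greedy variant): propose k tokens, check them
    with a single forward pass, and keep the longest prefix the model agrees with."""
    drafted = draft_fn(prefix, k)                          # (k,) proposed token ids
    candidate = torch.cat([prefix, drafted])
    logits = model(candidate.unsqueeze(0))[0]              # (T, V): one pass over prefix + drafts
    accepted = []
    for i, tok in enumerate(drafted):
        target = logits[len(prefix) - 1 + i].argmax()      # model's own next-token choice
        accepted.append(tok if target.item() == tok.item() else target)
        if target.item() != tok.item():
            break                                          # first mismatch: keep the correction, stop
    return torch.cat([prefix, torch.stack(accepted)])

# toy usage with a random "language model" over a 100-token vocabulary
model = lambda ids: torch.randn(ids.shape[0], ids.shape[1], 100)
draft_fn = lambda prefix, k: torch.randint(0, 100, (k,))
print(draft_then_verify(model, draft_fn, prefix=torch.tensor([1, 2, 3])).tolist())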
+ + ☆ Drilling Down into the Discourse Structure with LLMs for Long Document + Question Answering EMNLP 2023 + + +
+ We address the task of evidence retrieval for long document question +answering, which involves locating relevant paragraphs within a document to +answer a question. We aim to assess the applicability of large language models +(LLMs) in the task of zero-shot long document evidence retrieval, owing to +their unprecedented performance across various NLP tasks. However, currently +the LLMs can consume limited context lengths as input, thus providing document +chunks as inputs might overlook the global context while missing out on +capturing the inter-segment dependencies. Moreover, directly feeding the large +input sets can incur significant computational costs, particularly when +processing the entire document (and potentially incurring monetary expenses +with enterprise APIs like OpenAI's GPT variants). To address these challenges, +we propose a suite of techniques that exploit the discourse structure commonly +found in documents. By utilizing this structure, we create a condensed +representation of the document, enabling a more comprehensive understanding and +analysis of relationships between different parts. We retain $99.6\%$ of the +best zero-shot approach's performance, while processing only $26\%$ of the +total tokens used by the best approach in the information seeking evidence +retrieval setup. We also show how our approach can be combined with +\textit{self-ask} reasoning agent to achieve best zero-shot performance in +complex multi-hop question answering, just $\approx 4\%$ short of zero-shot +performance using gold evidence. + +
+
+ comment: Accepted to the Findings of EMNLP 2023 +
+
+
+
+
+ + ☆ LM-Cocktail: Resilient Tuning of Language Models via Model Merging + + +
+ Pre-trained language models are continually fine-tuned to better support +downstream applications. However, this operation may result in significant +performance degeneration on general tasks beyond the targeted domain. To +overcome this problem, we propose a novel method which enables the fine-tuned +model to remain resilient in its general capabilities. Our method is conducted in the +form of model merging (namely LM-Cocktail), where the fine-tuned language model +is merged with the pre-trained base model or the peer models from other domains +through weighted averaging. Despite its simplicity, LM-Cocktail is surprisingly +effective: the resulting model is able to achieve a strong empirical performance +in the whole scope of general tasks while preserving a superior capacity in its +targeted domain. We conduct comprehensive experiments with LLaMA and BGE models +on popular benchmarks, including FLAN, MMLU, MTEB, whose results validate the +efficacy of our proposed method. The code and checkpoints are available at +https://github.com/FlagOpen/FlagEmbedding. + 
+
+
+
+
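The core operation is a weighted average in parameter space; a minimal sketch is shown below, with the merging weights chosen arbitrarily (the paper's weighting scheme may differ).

import torch

def merge_state_dicts(state_dicts, weights):
    """Weighted average of model parameters (weights should sum to 1)."""
    merged = {}
    for name in state_dicts[0]:
        merged[name] = sum(w * sd[name].float() for w, sd in zip(weights, state_dicts))
    return merged

base = torch.nn.Linear(4, 4)        # stand-ins for the pre-trained and fine-tuned models
finetuned = torch.nn.Linear(4, 4)
merged = merge_state_dicts([finetuned.state_dict(), base.state_dict()], weights=[0.7, 0.3])
model = torch.nn.Linear(4, 4)
model.load_state_dict(merged)       # the "cocktail" model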
+ + ☆ Current Topological and Machine Learning Applications for Bias Detection + in Text + + +
+ Institutional bias can impact patient outcomes, educational attainment, and +legal system navigation. Written records often reflect bias, and once bias is +identified, it is possible to refer individuals for training to reduce bias. +Many machine learning tools exist to explore text data and create predictive +models that can search written records to identify real-time bias. However, few +previous studies investigate large language model embeddings and geometric +models of biased text data to understand geometry's impact on bias modeling +accuracy. To address this gap, this study utilizes the RedditBias database +to analyze textual biases. Four transformer models, including BERT and RoBERTa +variants, were explored. Post-embedding, t-SNE allowed two-dimensional +visualization of data. KNN classifiers differentiated bias types, with lower +k-values proving more effective. Findings suggest BERT, particularly mini BERT, +excels in bias classification, while multilingual models lag. The +recommendation emphasizes refining monolingual models and exploring +domain-specific biases. + 
+
+
+
+
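A compact sketch of the embedding / t-SNE / KNN pipeline with scikit-learn; the embeddings are random stand-ins for transformer sentence vectors, and whether the classifier operates on the raw embeddings or the 2-D projection is our assumption.

import numpy as np
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# stand-in for transformer sentence embeddings (e.g. 768-d BERT vectors) with bias-type labels
X = np.random.randn(500, 768)
y = np.random.randint(0, 2, 500)

X_2d = TSNE(n_components=2, perplexity=30, random_state=0).fit_transform(X)  # 2-D view of the geometry

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0)
knn = KNeighborsClassifier(n_neighbors=3)   # low k, as the abstract reports working better
knn.fit(X_tr, y_tr)
print(knn.score(X_te, y_te))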
+ + ☆ Machine Translation to Control Formality Features in the Target Language + + +
+ Formality plays a significant role in language communication, especially in +low-resource languages such as Hindi, Japanese and Korean. These languages +utilise formal and informal expressions to convey messages based on social +contexts and relationships. When a language translation technique is used to +translate from a source language that does not mark formality (e.g. +English) to a target language that does, the missing formality information +can be a challenge in producing an accurate outcome. This +research explores how this issue should be resolved when machine learning +methods are used to translate from English to languages with formality, using +Hindi as the example data. This was done by training a bilingual model in a +formality-controlled setting and comparing its performance with a pre-trained +multilingual model in a similar setting. Since little ground-truth training +data is available, automated annotation techniques were employed to +increase the data size. The primary modeling approach involved leveraging +transformer models, which have demonstrated effectiveness in various natural +language processing tasks. We evaluate the official formality accuracy (ACC) by +comparing the predicted masked tokens with the ground truth. This metric +provides a quantitative measure of how well the translations align with the +desired outputs. Our study showcases a versatile translation strategy that +considers the nuances of formality in the target language, catering to diverse +language communication needs and scenarios. + 
+
+ comment: 9 pages, based on DCU MCM Practicum 2022/2023 +
+
+
+
+
+ + ☆ Complexity-Guided Curriculum Learning for Text Graphs EMNLP 2023 + + +
+ Curriculum learning provides a systematic approach to training. It refines +training progressively, tailors training to task requirements, and improves +generalization through exposure to diverse examples. We present a curriculum +learning approach that builds on existing knowledge about text and graph +complexity formalisms for training with text graph data. The core part of our +approach is a novel data scheduler, which employs "spaced repetition" and +complexity formalisms to guide the training process. We demonstrate the +effectiveness of the proposed approach on several text graph tasks and graph +neural network architectures. The proposed model gains more and uses less data; +consistently prefers text over graph complexity indices throughout training, +while the best curricula derived from text and graph complexity indices are +equally effective; and it learns transferable curricula across GNN models and +datasets. In addition, we find that both node-level (local) and graph-level +(global) graph complexity indices, as well as shallow and traditional text +complexity indices play a crucial role in effective curriculum learning. + +
+
+ comment: Long Paper Accepted at EMNLP 2023 +
+
+
+
+
+ + ☆ Generation of Explanations for Logic Reasoning + + +
+ This thesis delves into a fortiori arguments in deductive reasoning, +underscoring their relevance in various domains such as law, philosophy, and +artificial intelligence. The research is centred on employing GPT-3.5-turbo to +automate the analysis of these arguments, with a focus on understanding +intricate reasoning processes, generating clear and coherent explanations, and +creating novel arguments. The methodology encompasses a series of tasks +including detailed reasoning, interpretation, and the augmentation of a +fortiori arguments. It involves meticulously identifying these arguments in +diverse contexts, differentiating comparative elements, and categorizing them +based on their logical structure. + Extensive experiments reveal the challenges encountered by GPT-3.5-turbo in +accurately detecting and classifying a fortiori arguments. Nevertheless, the +model demonstrates a performance that rivals specialized models, particularly +in extracting key components and interpreting underlying properties. The +integration of external information into the model's processing significantly +elevates the quality of the generated explanations. Additionally, the model +exhibits a noteworthy capability in augmenting arguments, thus contributing to +the enrichment of the data set. + Despite facing certain limitations, this thesis makes significant +contributions to the fields of artificial intelligence and logical reasoning. +It introduces novel methodologies, establishes a rigorous evaluation framework, +and provides deep insights that set the stage for future advancements in +automated logical reasoning. The findings and methodologies presented herein +not only underscore the potential of AI in complex reasoning tasks but also +highlight areas for future research and development. + 
+
+ comment: 78 Pages, 16 Figures, Thesis Presentation is available at + https://drive.google.com/file/d/1wLIBsjfLvO11PjCS6qx4Y9UgRBUfq3wQ/view?usp=sharing +
+
+
+
+
+ + ☆ Fact-based Court Judgment Prediction + + +
+ This extended abstract extends the research presented in "ILDC for CJPE: +Indian Legal Documents Corpus for Court Judgment Prediction and Explanation" +\cite{malik-etal-2021-ildc}, focusing on fact-based judgment prediction within +the context of Indian legal documents. We introduce two distinct problem +variations: one based solely on facts, and another combining facts with rulings +from lower courts (RLC). Our research aims to enhance early-phase case outcome +prediction, offering significant benefits to legal professionals and the +general public. The results, however, indicated a performance decline compared +to the original ILDC for CJPE study, even after implementing various weightage +schemes in our DELSumm algorithm. Additionally, using only facts for legal +judgment prediction with different transformer models yielded results inferior +to the state-of-the-art outcomes reported in the "ILDC for CJPE" study. + +
+
+
+
+
+ + ☆ Mitigating Large Language Model Hallucinations via Autonomous Knowledge + Graph-based Retrofitting + + +
+ Incorporating factual knowledge from knowledge graphs is regarded as a promising +approach for mitigating the hallucination of large language models (LLMs). +Existing methods usually only use the user's input to query the knowledge +graph, thus failing to address the factual hallucination generated by LLMs +during their reasoning process. To address this problem, this paper proposes +Knowledge Graph-based Retrofitting (KGR), a new framework that incorporates +LLMs with KGs to mitigate factual hallucination during the reasoning process by +retrofitting the initial draft responses of LLMs based on the factual knowledge +stored in KGs. Specifically, KGR leverages LLMs to extract, select, validate, +and retrofit factual statements within the model-generated responses, which +enables an autonomous knowledge verifying and refining procedure without any +additional manual effort. Experiments show that KGR can significantly improve +the performance of LLMs on factual QA benchmarks especially when involving +complex reasoning processes, which demonstrates the necessity and effectiveness +of KGR in mitigating hallucination and enhancing the reliability of LLMs. + 
+
+
+
+
+ + ☆ Rethinking Radiology Report Generation via Causal Reasoning and + Counterfactual Augmentation + + +
+ Radiology Report Generation (RRG) draws attention as an interaction between +the vision and language fields. Previous works inherited the ideology of +vision-to-language generation tasks, aiming to generate paragraphs with high +consistency as reports. However, one unique characteristic of RRG, the +independence between diseases, was neglected, leading to the injection of the +spurious confounder, i.e., the disease co-occurrence. Unfortunately, this +confounder further confuses the report generation process because of the +biased RRG data distribution. In this paper, to rethink this issue thoroughly, +we reason about its causes and effects from a novel perspective of statistics +and causality, where the Joint Vision Coupling and the Conditional Sentence +Coherence Coupling are two aspects prone to implicitly decrease the accuracy of +reports. Then, a counterfactual augmentation strategy that contains the +Counterfactual Sample Synthesis and the Counterfactual Report Reconstruction +sub-methods is proposed to break these two aspects of spurious effects. +Experimental results and further analyses on two widely used datasets justify +our reasoning and proposed methods. + 
+
+ comment: 10 pages,5 figures +
+
+
+
+
+ + ☆ Intention and Context Elicitation with Large Language Models in the + Legal Aid Intake Process + + +
+ Large Language Models (LLMs) and chatbots show significant promise in +streamlining the legal intake process. This advancement can greatly reduce the +workload and costs for legal aid organizations, improving availability while +making legal assistance more accessible to a broader audience. However, a key +challenge with current LLMs is their tendency to overconfidently deliver an +immediate 'best guess' to a client's question based on the output distribution +learned over the training data. This approach often overlooks the client's +actual intentions or the specifics of their legal situation. As a result, +clients may not realize the importance of providing essential additional +context or expressing their underlying intentions, which are crucial for their +legal cases. Traditionally, logic-based decision trees have been used to +automate intake for specific access-to-justice issues, such as immigration and +eviction, but those solutions lack scalability. We demonstrate a +proof-of-concept using LLMs to elicit and infer clients' underlying intentions +and specific legal circumstances through free-form, language-based +interactions. We also propose future research directions to use supervised +fine-tuning or offline reinforcement learning to automatically incorporate +intention and context elicitation in chatbots without explicit prompting. + 
+
+
+
+
+ + ☆ Enhancing Summarization Performance through Transformer-Based Prompt + Engineering in Automated Medical Reporting + + +
+ Customized medical prompts enable Large Language Models (LLMs) to effectively +address medical dialogue summarization. The process of medical reporting is +often time-consuming for healthcare professionals. Implementing medical +dialogue summarization techniques presents a viable solution to alleviate this +time constraint by generating automated medical reports. The effectiveness of +LLMs in this process is significantly influenced by the formulation of the +prompt, which plays a crucial role in determining the quality and relevance of +the generated reports. In this research, we used a combination of two distinct +prompting strategies, known as shot prompting and pattern prompting, to enhance +the performance of automated medical reporting. The evaluation of the automated +medical reports is carried out using the ROUGE score and a human evaluation +with the help of an expert panel. The two-shot prompting approach in +combination with scope and domain context outperforms other methods and +achieves the highest score when compared to the human reference set by a +general practitioner. However, the automated reports are approximately twice as +long as the human references, due to the inclusion of both redundant and +relevant statements in the report. + 
+
+ comment: 12 pages, 4 figures, submitted to Healthinf 2024, author roles: + research conducted and written by Daphne van Zandvoort and Laura Wiersema, + research suggested and used software created by Tom Huibers, data provided + and feedback provided by Sandra van Dulmen, supervision and feedback provided + by Sjaak Brinkkemper +
+
+
+
+
+ + ☆ Comparative Experimentation of Accuracy Metrics in Automated Medical + Reporting: The Case of Otitis Consultations ALT + + +
+ Generative Artificial Intelligence (AI) can be used to automatically generate +medical reports based on transcripts of medical consultations. The aim is to +reduce the administrative burden that healthcare professionals face. The +accuracy of the generated reports needs to be established to ensure their +correctness and usefulness. There are several metrics for measuring the +accuracy of AI generated reports, but little work has been done towards the +application of these metrics in medical reporting. A comparative +experimentation of 10 accuracy metrics has been performed on AI generated +medical reports against their corresponding General Practitioner's (GP) medical +reports concerning Otitis consultations. The number of missing, incorrect, and +additional statements of the generated reports have been correlated with the +metric scores. In addition, we introduce and define a Composite Accuracy Score +which produces a single score for comparing the metrics within the field of +automated medical reporting. Findings show that based on the correlation study +and the Composite Accuracy Score, the ROUGE-L and Word Mover's Distance metrics +are the preferred metrics, which is not in line with previous work. These +findings help determine the accuracy of an AI generated medical report, which +aids the development of systems that generate medical reports for GPs to reduce +the administrative burden. + +
+
+ comment: 10 pages, 1 figure, submitted to HEALTHINF 2024, Author + contributions: Wouter Faber and Renske Eline Bootsma performed research and + wrote paper, Tom Huibers provided needed software and research inspiration, + Sandra van Dulmen provided the data and feedback on paper, Sjaak Brinkkemper + supervised the project and provided continuous feedback +
+
+
+
+
+ + ☆ ViStruct: Visual Structural Knowledge Extraction via Curriculum Guided + Code-Vision Representation EMNLP 2023 + + +
+ State-of-the-art vision-language models (VLMs) still have limited performance +in structural knowledge extraction, such as relations between objects. In this +work, we present ViStruct, a training framework to learn VLMs for effective +visual structural knowledge extraction. Two novel designs are incorporated. +First, we propose to leverage the inherent structure of programming language to +depict visual structural information. This approach enables explicit and +consistent representation of visual structural information of multiple +granularities, such as concepts, relations, and events, in a well-organized +structured format. Second, we introduce curriculum-based learning for VLMs to +progressively comprehend visual structures, from fundamental visual concepts to +intricate event structures. Our intuition is that lower-level knowledge may +contribute to complex visual structure understanding. Furthermore, we compile +and release a collection of datasets tailored for visual structural knowledge +extraction. We adopt a weakly-supervised approach to directly generate visual +event structures from captions for ViStruct training, capitalizing on abundant +image-caption pairs from the web. In experiments, we evaluate ViStruct on +visual structure prediction tasks, demonstrating its effectiveness in improving +the understanding of visual structures. The code is public at +\url{https://github.com/Yangyi-Chen/vi-struct}. + +
+
+ comment: Accepted to EMNLP 2023 +
+
+
+
+
+ + ☆ Automatic Instruction Optimization for Open-source LLM Instruction + Tuning + + +
+ Instruction tuning is crucial for enabling Large Language Models (LLMs) to +respond to human instructions. The quality of instruction pairs used for +tuning greatly affects the performance of LLMs. However, the manual creation of +high-quality instruction datasets is costly, leading to the adoption of +automatic generation of instruction pairs by LLMs as a popular alternative in +the training of open-source LLMs. To ensure the high quality of LLM-generated +instruction datasets, several approaches have been proposed. Nevertheless, +existing methods either compromise dataset integrity by filtering a large +proportion of samples, or are unsuitable for industrial applications. In this +paper, instead of discarding low-quality samples, we propose CoachLM, a novel +approach to enhance the quality of instruction datasets through automatic +revisions on samples in the dataset. CoachLM is trained from the samples +revised by human experts and significantly increases the proportion of +high-quality samples in the dataset from 17.7% to 78.9%. The effectiveness of +CoachLM is further assessed on various real-world instruction test sets. The +results show that CoachLM improves the instruction-following capabilities of +the instruction-tuned LLM by an average of 29.9%, which even surpasses larger +LLMs with nearly twice the number of parameters. Furthermore, CoachLM is +successfully deployed in a data management system for LLMs at Huawei, resulting +in an efficiency improvement of up to 20% in the cleaning of 40k real-world +instruction pairs. We release the training data and code of CoachLM +(https://github.com/lunyiliu/CoachLM). + 
+
+
+
+
+ + ☆ On the Calibration of Large Language Models and Alignment EMNLP-2023 + + +
+ As large language models attract increasing attention and find widespread +application, concurrent challenges of reliability also arise. +Confidence calibration, an effective analysis method for gauging the +reliability of deep models, serves as a crucial tool for assessing and +improving their reliability. However, such investigation has been comparatively +underexplored. In this work, we conduct a systematic examination of the +calibration of aligned language models throughout the entire construction +process, including pretraining and alignment training. At each stage, we +investigate how different training settings, such as parameter scales and +training data, affect model calibration. To thoroughly assess model +calibration, we evaluate models on the three aspects of greatest concern: generation, +factuality and understanding. Our work sheds light on whether popular LLMs are +well-calibrated and how the training process influences model calibration. + 
+
+ comment: to be published in findings of EMNLP-2023 +
+
+
+
+
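For readers unfamiliar with confidence calibration, a common way to quantify it is the expected calibration error (ECE), sketched below; the paper's exact metrics and evaluation protocol may differ.

import numpy as np

def expected_calibration_error(confidences, correct, n_bins=10):
    """Standard ECE: |accuracy - confidence| averaged over equal-width confidence bins,
    weighted by the fraction of samples falling in each bin."""
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            ece += mask.mean() * abs(correct[mask].mean() - confidences[mask].mean())
    return ece

conf = np.random.rand(1000)                              # model's top-1 confidence per example
correct = (np.random.rand(1000) < conf).astype(float)    # 1 if the prediction was right
print(expected_calibration_error(conf, correct))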
+ + ☆ Enhancing Uncertainty-Based Hallucination Detection with Stronger Focus EMNLP 2023 + + +
+ Large Language Models (LLMs) have gained significant popularity for their +impressive performance across diverse fields. However, LLMs are prone to +hallucinate untruthful or nonsensical outputs that fail to meet user +expectations in many real-world applications. Existing works for detecting +hallucinations in LLMs either rely on external knowledge for reference +retrieval or require sampling multiple responses from the LLM for consistency +verification, making these methods costly and inefficient. In this paper, we +propose a novel reference-free, uncertainty-based method for detecting +hallucinations in LLMs. Our approach imitates human focus in factuality +checking from three aspects: 1) focus on the most informative and important +keywords in the given text; 2) focus on the unreliable tokens in historical +context which may lead to a cascade of hallucinations; and 3) focus on the +token properties such as token type and token frequency. Experimental results +on relevant datasets demonstrate the effectiveness of our proposed method, +which achieves state-of-the-art performance across all the evaluation metrics +and eliminates the need for additional information. + +
+
+ comment: Accepted by EMNLP 2023 (main conference) +
+
+
+
+
+ + ☆ AS-LLM: When Algorithm Selection Meets Large Language Model + + +
+ Algorithm selection aims to identify the most suitable algorithm for solving +a specific problem before execution, which has become a critical step in +AutoML. Current mainstream algorithm selection techniques rely heavily on +feature representations of various problems and employ the performance of each +algorithm as supervised information. However, there is a significant research +gap concerning the consideration of algorithm features. This gap is primarily +attributed to the inherent complexity of algorithms, making it particularly +challenging to find a universally effective feature extraction method that is +applicable across a diverse range of algorithms. Unfortunately, neglecting this +aspect undoubtedly impacts the accuracy of algorithm selection and indirectly +necessitates an increased volume of problem data for training purposes. This +paper takes a significant stride towards addressing this gap by proposing an +approach that integrates algorithm representation into the algorithm selection +process. Specifically, our proposed model employs distinct modules to extract +representations of both problems and algorithms, where the algorithm +representation leverages the capabilities of pre-trained LLMs in the realm of +code comprehension. Following the extraction of embedding vectors for both +algorithms and problems, the most suitable algorithm is determined through +calculations of matching degrees. Our experiments not only validate the +effectiveness of the proposed model but also showcase the performance of +different embedded pre-trained LLMs, which suggests that the proposed algorithm +selection framework holds the potential to serve as a baseline task for +evaluating the code representation capabilities of LLMs. + 
+
+
+
+
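A minimal sketch of selecting an algorithm by the matching degree between a problem embedding and LLM-derived algorithm (code) embeddings; cosine similarity is used here as a stand-in for whatever matching function the proposed model actually learns, and the embeddings are random placeholders.

import numpy as np

def select_algorithm(problem_emb, algorithm_embs):
    """Pick the algorithm whose embedding best matches the problem embedding."""
    p = problem_emb / np.linalg.norm(problem_emb)
    A = algorithm_embs / np.linalg.norm(algorithm_embs, axis=1, keepdims=True)
    scores = A @ p                       # cosine similarities as matching degrees
    return int(np.argmax(scores)), scores

problem_emb = np.random.randn(256)         # from a problem-feature encoder
algorithm_embs = np.random.randn(8, 256)   # e.g. pooled code embeddings of 8 candidate solvers
best, scores = select_algorithm(problem_emb, algorithm_embs)
print(best)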
+ + ☆ ComPEFT: Compression for Communicating Parameter Efficient Updates via + Sparsification and Quantization + + +
+ Parameter-efficient fine-tuning (PEFT) techniques make it possible to +efficiently adapt a language model to create "expert" models that specialize to +new tasks or domains. Recent techniques in model merging and compositional +generalization leverage these expert models by dynamically composing modules to +improve zero/few-shot generalization. Despite the efficiency of PEFT methods, +the size of expert models can make it onerous to retrieve expert models per +query over high-latency networks like the Internet or serve multiple experts on +a single GPU. To address these issues, we present ComPEFT, a novel method for +compressing fine-tuning residuals (task vectors) of PEFT based models. ComPEFT +employs sparsification and ternary quantization to reduce the size of the PEFT +module without performing any additional retraining while preserving or +enhancing model performance. In extensive evaluation across T5, T0, and +LLaMA-based models with 200M - 65B parameters, ComPEFT achieves compression +ratios of 8x - 50x. In particular, we show that ComPEFT improves with scale - +stronger models exhibit higher compressibility and better performance. For +example, we show that ComPEFT applied to LLaMA outperforms QLoRA by 4.16% on +MMLU with a storage size reduction of up to 26x. In addition, we show that the +compressed experts produced by ComPEFT maintain few-shot compositional +generalization capabilities, facilitate efficient communication and +computation, and exhibit enhanced performance when merged. Lastly, we provide +an analysis of different method components, compare it with other PEFT methods, +and test ComPEFT's efficacy for compressing the residual of full-finetuning. +Our code is available at https://github.com/prateeky2806/compeft. + +
+
+ comment: 25 Pages, 6 Figures, 16 Tables +
+
+
+
+
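A small sketch of compressing a fine-tuning residual (task vector) by magnitude sparsification plus a ternary, single-scale representation; the density and the choice of scale are illustrative, not ComPEFT's exact quantizer.

import torch

def compress_task_vector(task_vector: torch.Tensor, density: float = 0.02):
    """Keep only the top-|density| fraction of entries by magnitude, then store their
    signs plus one shared scale (a ternary {-a, 0, +a} code for the residual)."""
    flat = task_vector.flatten()
    k = max(1, int(density * flat.numel()))
    idx = flat.abs().topk(k).indices
    mask = torch.zeros_like(flat, dtype=torch.bool)
    mask[idx] = True
    scale = flat[mask].abs().mean()            # single shared magnitude
    signs = torch.sign(flat) * mask            # ternary codes in {-1, 0, +1}
    return signs.to(torch.int8), scale

def decompress(signs, scale, shape):
    return (signs.float() * scale).reshape(shape)

finetuned, base = torch.randn(4096, 64), torch.randn(4096, 64)   # stand-in weight matrices
signs, scale = compress_task_vector(finetuned - base)
approx_update = decompress(signs, scale, base.shape)             # added back onto the base weights at load time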
+ + ☆ LIMIT: Less Is More for Instruction Tuning Across Evaluation Paradigms NeurIPS 2023 + + +
+ Large Language Models are traditionally finetuned on large instruction +datasets. However recent studies suggest that small, high-quality datasets can +suffice for general purpose instruction following. This lack of consensus +surrounding finetuning best practices is in part due to rapidly diverging +approaches to LLM evaluation. In this study, we ask whether a small amount of +diverse finetuning samples can improve performance on both traditional +perplexity-based NLP benchmarks, and on open-ended, model-based evaluation. We +finetune open-source MPT-7B and MPT-30B models on instruction finetuning +datasets of various sizes ranging from 1k to 60k samples. We find that subsets +of 1k-6k instruction finetuning samples are sufficient to achieve good +performance on both (1) traditional NLP benchmarks and (2) model-based +evaluation. Finally, we show that mixing textbook-style and open-ended QA +finetuning datasets optimizes performance on both evaluation paradigms. + +
+
+ comment: 36 pages, 12 figures, NeurIPS 2023 Workshop on Instruction Tuning and + Instruction Following +
+
+
+
+
+ + ☆ Towards Better Parameter-Efficient Fine-Tuning for Large Language + Models: A Position Paper + + +
+ This paper delves into the pressing need for Parameter-Efficient Fine-Tuning +(PEFT) of Large Language Models (LLMs). While LLMs possess remarkable +capabilities, their extensive parameter requirements and associated +computational demands hinder their practicality and scalability for real-world +applications. Our position paper highlights the current state of the field and the necessity of +further study of the topic, and recognizes significant challenges and open +issues that must be addressed to fully harness the powerful abilities of LLMs. +These challenges encompass novel efficient PEFT architectures, PEFT for +different learning settings, PEFT combined with model compression techniques, +and the exploration of PEFT for multi-modal LLMs. By presenting this position +paper, we aim to stimulate further research and foster discussions surrounding +more efficient and accessible PEFT for LLMs. +
+
+
+
+
+ + ☆ Combatting Human Trafficking in the Cyberspace: A Natural Language + Processing-Based Methodology to Analyze the Language in Online Advertisements + + +
+ This project tackles the pressing issue of human trafficking in online C2C +marketplaces through advanced Natural Language Processing (NLP) techniques. We +introduce a novel methodology for generating pseudo-labeled datasets with +minimal supervision, serving as a rich resource for training state-of-the-art +NLP models. Focusing on tasks like Human Trafficking Risk Prediction (HTRP) and +Organized Activity Detection (OAD), we employ cutting-edge Transformer models +for analysis. A key contribution is the implementation of an interpretability +framework using Integrated Gradients, providing explainable insights crucial +for law enforcement. This work not only fills a critical gap in the literature +but also offers a scalable, machine learning-driven approach to combat human +exploitation online. It serves as a foundation for future research and +practical applications, emphasizing the role of machine learning in addressing +complex social issues. + +
+
+
+
+
+ + ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
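The compression term at the heart of this line of work is the lossy coding rate of the token representations; a minimal numpy sketch of that quantity (as it appears in the rate-reduction literature) is shown below. The choice of epsilon and the toy data are illustrative, and this is not the full sparse rate reduction objective.

```python
import numpy as np

def coding_rate(Z: np.ndarray, eps: float = 0.5) -> float:
    """Lossy coding rate R(Z) of token representations.

    Z   : (d, n) matrix whose columns are n token embeddings.
    eps : allowed distortion. Larger R(Z) means the tokens span a
          higher-dimensional (less compressed) set of directions.
    """
    d, n = Z.shape
    gram = Z @ Z.T                              # (d, d)
    _, logdet = np.linalg.slogdet(np.eye(d) + (d / (n * eps ** 2)) * gram)
    return 0.5 * logdet

# compressed (rank-1) tokens vs. spread-out tokens
rng = np.random.default_rng(0)
u = rng.normal(size=(16, 1))
print(coding_rate(u @ rng.normal(size=(1, 64))))   # low rate
print(coding_rate(rng.normal(size=(16, 64))))      # high rate
```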
+
+ comment: This paper integrates the works arXiv:2306.01129 and + arXiv:2308.16271, as well as this under-review work: + https://openreview.net/forum?id=PvyOYleymy into a complete story. In this + paper, we improve the writing and organization, and also add conceptual, + empirical, and theoretical improvements over the previous work +
+
+
+
+
+ + ☆ Perceptual Structure in the Absence of Grounding for LLMs: The Impact of + Abstractedness and Subjectivity in Color Language EMNLP 2023 + + +
+ The need for grounding in language understanding is an active research topic. +Previous work has suggested that color perception and color language appear as +a suitable test bed to empirically study the problem, given its cognitive +significance, and has shown that there is considerable alignment between a defined +color space and the feature space defined by a language model. To further study +this issue, we collect a large-scale source of colors and their descriptions, +containing almost 1 million examples, and perform an empirical analysis to +compare two kinds of alignments: (i) inter-space, by learning a mapping between +embedding space and color space, and (ii) intra-space, by means of prompting +comparatives between color descriptions. Our results show that while color +space alignment holds for monolexemic, highly pragmatic color descriptions, +this alignment drops considerably in the presence of examples that exhibit +elements of real linguistic usage such as subjectivity and abstractedness, +suggesting that grounding may be required in such cases. +
+
+ comment: EMNLP 2023 Findings +
+
+
+
+
+ + ☆ Detecting out-of-distribution text using topological features of + transformer-based language models + + +
+ We attempt to detect out-of-distribution (OOD) text samples by applying +Topological Data Analysis (TDA) to attention maps in transformer-based language +models. We evaluate our proposed TDA-based approach for out-of-distribution +detection on BERT, a transformer-based language model, and compare it to a +more traditional OOD approach based on BERT CLS embeddings. We find that our +TDA approach outperforms the CLS embedding approach at distinguishing +in-distribution data (politics and entertainment news articles from HuffPost) +from far out-of-domain samples (IMDB reviews), but its effectiveness +deteriorates with near out-of-domain (CNN/Dailymail) or same-domain (business +news articles from HuffPost) datasets. +
+
+ comment: 12 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ Enhancing Logical Reasoning in Large Language Models to Facilitate Legal + Applications + + +
+ Language serves as a vehicle for conveying thought, enabling communication +among individuals. The ability to distinguish between diverse concepts, +identify fairness and injustice, and comprehend a range of legal notions +fundamentally relies on logical reasoning. Large Language Models (LLMs) attempt +to emulate human language understanding and generation, but their competency in +logical reasoning remains limited. This paper seeks to address the +philosophical question: How can we effectively teach logical reasoning to LLMs +while maintaining a deep understanding of the intricate relationship between +language and logic? By focusing on bolstering LLMs' capabilities in logical +reasoning, we aim to expand their applicability in law and other +logic-intensive disciplines. To this end, we propose a Reinforcement Learning +from Logical Feedback (RLLF) approach, which serves as a potential framework +for refining LLMs' reasoning capacities. Through RLLF and a revised evaluation +methodology, we explore new avenues for research in this domain and contribute +to the development of LLMs capable of handling complex legal reasoning tasks +while acknowledging the fundamental connection between language and logic. + +
+
+ comment: ALP@JURIX2023 +
+
+
+
+
+ + ☆ Surpassing GPT-4 Medical Coding with a Two-Stage Approach ML4H + + +
+ Recent advances in large language models (LLMs) show potential for clinical +applications, such as clinical decision support and trial recommendations. +However, the GPT-4 LLM predicts an excessive number of ICD codes for medical +coding tasks, leading to high recall but low precision. To tackle this +challenge, we introduce LLM-codex, a two-stage approach to predict ICD codes +that first generates evidence proposals using an LLM and then employs an +LSTM-based verification stage. The LSTM learns from both the LLM's high recall +and human expert's high precision, using a custom loss function. Our model is +the only approach that simultaneously achieves state-of-the-art results in +medical coding accuracy, accuracy on rare codes, and sentence-level evidence +identification to support coding decisions without training on human-annotated +evidence according to experiments on the MIMIC dataset. + +
+
+ comment: Extended Abstract presented at Machine Learning for Health (ML4H) + symposium 2023, December 10th, 2023, New Orleans, United States, 19 pages +
+
+
+
+
+ + ☆ Comparison of pipeline, sequence-to-sequence, and GPT models for + end-to-end relation extraction: experiments with the rare disease use-case + + +
+ End-to-end relation extraction (E2ERE) is an important and realistic +application of natural language processing (NLP) in biomedicine. In this paper, +we aim to compare three prevailing paradigms for E2ERE using a complex dataset +focused on rare diseases involving discontinuous and nested entities. We use +the RareDis information extraction dataset to evaluate three competing +approaches (for E2ERE): NER $\rightarrow$ RE pipelines, joint +sequence-to-sequence models, and generative pre-trained transformer (GPT) models. We use +comparable state-of-the-art models and best practices for each of these +approaches and conduct error analyses to assess their failure modes. Our +findings reveal that pipeline models are still the best, while +sequence-to-sequence models are not far behind; GPT models with eight times as +many parameters are worse than even sequence-to-sequence models and lose to +pipeline models by over 10 F1 points. Partial matches and discontinuous +entities caused many NER errors, contributing to lower overall E2E performance. +We also verify these findings on a second E2ERE dataset for chemical-protein +interactions. Although generative LM-based methods are more suitable for +zero-shot settings, when training data is available, our results show that it +is better to work with more conventional models trained and tailored for E2ERE. +More innovative methods are needed to marry the best of both worlds from +smaller encoder-decoder pipeline models and the larger GPT models to improve +E2ERE. As of now, we see that well-designed pipeline models offer substantial +performance gains at a lower cost and carbon footprint for E2ERE. Our +contribution is also the first to conduct E2ERE for the RareDis dataset. +
+
+ comment: The dataset and code for all our experiments are publicly available: + https://github.com/shashank140195/Raredis +
+
+
+
+
+ + ☆ Dynamic Analysis Method for Hidden Dangers in Substation Based on + Knowledge Graph + + +
+ To address the challenge of identifying and understanding hidden dangers in +substations from unstructured text data, a novel dynamic analysis method is +proposed. This approach begins by analyzing and extracting data from the +unstructured text related to hidden dangers. It then leverages a flexible, +distributed data search engine built on Elastic-Search to handle this +information. Following this, the hidden Markov model is employed to train the +data within the engine. The Viterbi algorithm is integrated to decipher the +hidden state sequences, facilitating the segmentation and labeling of entities +related to hidden dangers. The final step involves using the Neo4j graph +database to dynamically create a knowledge map that visualizes hidden dangers +in the substation. This method's effectiveness is demonstrated through an +example analysis using data from a specific substation's hidden dangers. + +
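Since the entity labeling step hinges on Viterbi decoding of the HMM state sequence, a minimal log-space Viterbi sketch may help; the toy transition and emission tables are illustrative, not the trained model from the paper.

```python
import numpy as np

def viterbi(obs, log_start, log_trans, log_emit):
    """Most likely hidden-state path for an observation sequence.

    log_start: (S,)    log initial state probabilities
    log_trans: (S, S)  log P(state_j | state_i)
    log_emit:  (S, V)  log P(observation | state)
    obs:       list of observation indices
    """
    S = log_start.shape[0]
    T = len(obs)
    score = np.full((T, S), -np.inf)
    back = np.zeros((T, S), dtype=int)
    score[0] = log_start + log_emit[:, obs[0]]
    for t in range(1, T):
        # cand[i, j] = score of ending in state j at time t via state i at t-1
        cand = score[t - 1][:, None] + log_trans + log_emit[:, obs[t]][None, :]
        back[t] = cand.argmax(axis=0)
        score[t] = cand.max(axis=0)
    path = [int(score[-1].argmax())]
    for t in range(T - 1, 0, -1):
        path.append(int(back[t, path[-1]]))
    return path[::-1]

# toy usage: 2 hidden states, 3 observation symbols
ls = np.log(np.array([0.6, 0.4]))
lt = np.log(np.array([[0.7, 0.3], [0.4, 0.6]]))
le = np.log(np.array([[0.5, 0.4, 0.1], [0.1, 0.3, 0.6]]))
print(viterbi([0, 1, 2], ls, lt, le))
```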
+
+
+
+
+ + ☆ MAIRA-1: A specialised large multimodal model for radiology report + generation + + +
+ We present a radiology-specific multimodal model for the task of generating +radiological reports from chest X-rays (CXRs). Our work builds on the idea that +large language models can be equipped with multimodal capabilities through +alignment with pre-trained vision encoders. On natural images, this has been +shown to allow multimodal models to gain image understanding and description +capabilities. Our proposed model (MAIRA-1) leverages a CXR-specific image +encoder in conjunction with a fine-tuned large language model based on +Vicuna-7B, and text-based data augmentation, to produce reports with +state-of-the-art quality. In particular, MAIRA-1 significantly improves on the +radiologist-aligned RadCliQ metric and across all lexical metrics considered. +Manual review of model outputs demonstrates promising fluency and accuracy of +generated reports while uncovering failure modes not captured by existing +evaluation practices. More information and resources can be found on the +project website: https://aka.ms/maira. +
+
+ comment: 18 pages, 9 tables, 5 figures +
+
+
+
+
+ + ☆ Efficient Transformer Knowledge Distillation: A Performance Review EMNLP 2023 + + +
+ As pretrained transformer language models continue to achieve +state-of-the-art performance, the Natural Language Processing community has +pushed for advances in model compression and efficient attention mechanisms to +address high computational requirements and limited input sequence length. +Despite these separate efforts, no investigation has been done into the +intersection of these two fields. In this work, we provide an evaluation of +model compression via knowledge distillation on efficient attention +transformers. We provide cost-performance trade-offs for the compression of +state-of-the-art efficient attention architectures and the gains made in +performance in comparison to their full attention counterparts. Furthermore, we +introduce a new long-context Named Entity Recognition dataset, GONERD, to train +and test the performance of NER models on long sequences. We find that +distilled efficient attention transformers can preserve a significant amount of +original model performance, preserving up to 98.6% across short-context tasks +(GLUE, SQUAD, CoNLL-2003), up to 94.6% across long-context +Question-and-Answering tasks (HotpotQA, TriviaQA), and up to 98.8% on +long-context Named Entity Recognition (GONERD), while decreasing inference +times by up to 57.8%. We find that, for most models on most tasks, performing +knowledge distillation is an effective method to yield high-performing +efficient attention models with low costs. + +
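For reference, the standard knowledge-distillation objective evaluated in this setting combines a temperature-softened KL term with the usual cross-entropy on hard labels; a minimal PyTorch sketch follows, with illustrative hyperparameters.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels,
                      temperature: float = 2.0, alpha: float = 0.5):
    """Hinton-style distillation: soft KL term + hard-label cross-entropy."""
    soft_targets = F.softmax(teacher_logits / temperature, dim=-1)
    log_student = F.log_softmax(student_logits / temperature, dim=-1)
    kd = F.kl_div(log_student, soft_targets, reduction="batchmean") * temperature ** 2
    ce = F.cross_entropy(student_logits, labels)
    return alpha * kd + (1 - alpha) * ce

# toy usage on random logits
student = torch.randn(8, 5, requires_grad=True)
teacher = torch.randn(8, 5)
labels = torch.randint(0, 5, (8,))
loss = distillation_loss(student, teacher, labels)
loss.backward()
```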
+
+ comment: Accepted to EMNLP 2023. 12 pages, 1 figure, 11 tables. Models and + data available at https://huggingface.co/giant-oak +
+
+
+
+
+ + ☆ Language Model Inversion + + +
+ Language models produce a distribution over the next token; can we use this +information to recover the prompt tokens? We consider the problem of language +model inversion and show that next-token probabilities contain a surprising +amount of information about the preceding text. Often we can recover the text +in cases where it is hidden from the user, motivating a method for recovering +unknown prompts given only the model's current distribution output. We consider +a variety of model access scenarios, and show how even without predictions for +every token in the vocabulary we can recover the probability vector through +search. On Llama-2 7b, our inversion method reconstructs prompts with a BLEU of +$59$ and token-level F1 of $78$ and recovers $27\%$ of prompts exactly. Code +for reproducing all experiments is available at +http://github.com/jxmorris12/vec2text. + +
+
+
+
+
+ + ☆ Prompt Risk Control: A Rigorous Framework for Responsible Deployment of + Large Language Models NeurIPS 2023 + + +
+ The recent explosion in the capabilities of large language models has led to +a wave of interest in how best to prompt a model to perform a given task. While +it may be tempting to simply choose a prompt based on average performance on a +validation set, this can lead to a deployment where unexpectedly poor responses +are generated, especially for the worst-off users. To mitigate this prospect, +we propose Prompt Risk Control, a lightweight framework for selecting a prompt +based on rigorous upper bounds on families of informative risk measures. We +offer methods for producing bounds on a diverse set of metrics, including +quantities that measure worst-case responses and disparities in generation +quality across the population of users. In addition, we extend the underlying +statistical bounding techniques to accommodate the possibility of distribution +shifts in deployment. Experiments on applications such as open-ended chat, +medical question summarization, and code generation highlight how such a +framework can foster responsible deployment by reducing the risk of the worst +outcomes. + +
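As a simplified illustration of bound-based prompt selection (the paper derives tighter and more general bounds for families of risk measures), the sketch below scores each candidate prompt by a Hoeffding upper confidence bound on its mean validation loss and picks the smallest bound; the assumption that losses are scaled to [0, 1] and all names are illustrative.

```python
import numpy as np

def hoeffding_upper_bound(losses: np.ndarray, delta: float = 0.05) -> float:
    """Upper bound on the expected loss holding with prob. >= 1 - delta,
    assuming each loss lies in [0, 1]."""
    n = len(losses)
    return float(losses.mean() + np.sqrt(np.log(1.0 / delta) / (2 * n)))

def select_prompt(loss_table: np.ndarray, delta: float = 0.05) -> int:
    """loss_table: (num_prompts, num_validation_examples) losses in [0, 1].
    Pick the prompt whose high-probability risk bound is smallest."""
    bounds = [hoeffding_upper_bound(row, delta) for row in loss_table]
    return int(np.argmin(bounds))

# toy usage: three prompts with different average losses
rng = np.random.default_rng(0)
losses = rng.uniform(size=(3, 200)) * np.array([[0.6], [0.4], [0.5]])
print(select_prompt(losses))
```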
+
+ comment: 33 pages, 10 figures, and accepted to the Socially Responsible + Language Modelling Research (SoLaR) workshop at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ GraphCFC: A Directed Graph Based Cross-Modal Feature Complementation + Approach for Multimodal Conversational Emotion Recognition + + +
+ Emotion Recognition in Conversation (ERC) plays a significant part in +Human-Computer Interaction (HCI) systems since it can provide empathetic +services. Multimodal ERC can mitigate the drawbacks of uni-modal approaches. +Recently, Graph Neural Networks (GNNs) have been widely used in a variety of +fields due to their superior performance in relation modeling. In multimodal +ERC, GNNs are capable of extracting both long-distance contextual information +and inter-modal interactive information. Unfortunately, since existing methods +such as MMGCN directly fuse multiple modalities, redundant information may be +generated and diverse information may be lost. In this work, we present a +directed Graph based Cross-modal Feature Complementation (GraphCFC) module that +can efficiently model contextual and interactive information. GraphCFC +alleviates the problem of heterogeneity gap in multimodal fusion by utilizing +multiple subspace extractors and Pair-wise Cross-modal Complementary (PairCC) +strategy. We extract various types of edges from the constructed graph for +encoding, thus enabling GNNs to extract crucial contextual and interactive +information more accurately when performing message passing. Furthermore, we +design a GNN structure called GAT-MLP, which can provide a new unified network +framework for multimodal learning. The experimental results on two benchmark +datasets show that our GraphCFC outperforms the state-of-the-art (SOTA) +approaches. + +
+
+ comment: Accepted by IEEE Transactions on Multimedia (TMM) +
+
+
+
+
+ + ♻ ☆ Integrating Pre-trained Language Model into Neural Machine Translation + + +
+ Neural Machine Translation (NMT) has become a significant technology in +natural language processing through extensive research and development. +However, the deficiency of high-quality bilingual language pair data still +poses a major challenge to improving NMT performance. Recent studies have been +exploring the use of contextual information from pre-trained language model +(PLM) to address this problem. Yet, the issue of incompatibility between PLM +and NMT model remains unresolved. This study proposes PLM-integrated NMT +(PiNMT) model to overcome the identified problems. PiNMT model consists of +three critical components, PLM Multi Layer Converter, Embedding Fusion, and +Cosine Alignment, each playing a vital role in providing effective PLM +information to NMT. Furthermore, two training strategies, Separate Learning +Rates and Dual Step Training, are also introduced in this paper. By +implementing the proposed PiNMT model and training strategy, we achieve +state-of-the-art performance on the IWSLT'14 En$\leftrightarrow$De dataset. +This study's outcomes are noteworthy as they demonstrate a novel approach for +efficiently integrating PLM with NMT to overcome incompatibility and enhance +performance. + +
+
+
+
+
+ + ♻ ☆ A Dual-Stream Recurrence-Attention Network With Global-Local Awareness + for Emotion Recognition in Textual Dialog + + +
+ In real-world dialog systems, the ability to understand the user's emotions +and interact anthropomorphically is of great significance. Emotion Recognition +in Conversation (ERC) is one of the key ways to accomplish this goal and has +attracted growing attention. How to model the context in a conversation is a +central aspect and a major challenge of ERC tasks. Most existing approaches +struggle to adequately incorporate both global and local contextual +information, and their network structures are overly sophisticated. For this +reason, we propose a simple and effective Dual-stream Recurrence-Attention +Network (DualRAN), which is based on Recurrent Neural Network (RNN) and +Multi-head ATtention network (MAT). DualRAN eschews the complex components of +current methods and focuses on combining recurrence-based methods with +attention-based ones. DualRAN is a dual-stream structure mainly consisting of +local- and global-aware modules, modeling a conversation simultaneously from +distinct perspectives. In addition, we develop two single-stream network +variants for DualRAN, i.e., SingleRANv1 and SingleRANv2. According to the +experimental findings, DualRAN boosts the weighted F1 scores by 1.43% and 0.64% +on the IEMOCAP and MELD datasets, respectively, in comparison to the strongest +baseline. On two other datasets (i.e., EmoryNLP and DailyDialog), our method +also attains competitive results. + +
+
+ comment: Accepted by Engineering Applications of Artificial Intelligence + (EAAI) +
+
+
+
+
+ + ♻ ☆ Active Learning Principles for In-Context Learning with Large Language + Models EMNLP + + +
+ The remarkable advancements in large language models (LLMs) have +significantly enhanced the performance in few-shot learning settings. By using +only a small number of labeled examples, referred to as demonstrations, LLMs +can effectively grasp the task at hand through in-context learning. However, +the process of selecting appropriate demonstrations has received limited +attention in prior work. This paper addresses the issue of identifying the most +informative demonstrations for few-shot learning by approaching it as a +pool-based Active Learning (AL) problem over a single iteration. Our objective +is to investigate how AL algorithms can serve as effective demonstration +selection methods for in-context learning. We compare various standard AL +algorithms based on uncertainty, diversity, and similarity, and consistently +observe that the latter outperforms all other methods, including random +sampling. Notably, uncertainty sampling, despite its success in conventional +supervised learning scenarios, performs poorly in this context. Our extensive +experimentation involving a diverse range of GPT and OPT models across $24$ +classification and multi-choice tasks, coupled with thorough analysis, +unambiguously demonstrates that in-context example selection through AL +prioritizes high-quality examples that exhibit low uncertainty and bear +similarity to the test examples. + +
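A minimal sketch of the similarity-based selection strategy that the study finds most effective: embed the labeled pool and the test input, then take the k most similar examples as demonstrations. TF-IDF stands in for the dense embeddings used in practice, purely to keep the sketch dependency-light.

```python
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def select_demonstrations(pool_texts, test_text, k=4):
    """Pick the k pool examples most similar to the test input.

    TF-IDF vectors are an illustrative stand-in; the same ranking logic
    applies to sentence embeddings from a neural encoder.
    """
    vec = TfidfVectorizer().fit(pool_texts + [test_text])
    pool = vec.transform(pool_texts)
    query = vec.transform([test_text])
    sims = cosine_similarity(query, pool).ravel()
    return [pool_texts[i] for i in np.argsort(-sims)[:k]]

pool = ["great movie", "terrible plot", "loved the acting", "boring and slow"]
print(select_demonstrations(pool, "the acting was wonderful", k=2))
```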
+
+ comment: To appear at Findings of EMNLP (Camera Ready version) +
+
+
+
+
+ + ♻ ☆ HARE: Explainable Hate Speech Detection with Step-by-Step Reasoning EMNLP 2023 + + +
+ With the proliferation of social media, accurate detection of hate speech has +become critical to ensure safety online. To combat nuanced forms of hate +speech, it is important to identify and thoroughly explain hate speech to help +users understand its harmful effects. Recent benchmarks have attempted to +tackle this issue by training generative models on free-text annotations of +implications in hateful text. However, we find significant reasoning gaps in +the existing annotation schemes, which may hinder the supervision of detection +models. In this paper, we introduce a hate speech detection framework, HARE, +which harnesses the reasoning capabilities of large language models (LLMs) to +fill these gaps in explanations of hate speech, thus enabling effective +supervision of detection models. Experiments on the SBIC and Implicit Hate +benchmarks show that our method, which uses model-generated data, consistently +outperforms baselines that use existing free-text human annotations. Analysis +demonstrates that our method enhances the explanation quality of trained models +and improves generalization to unseen datasets. Our code is available at +https://github.com/joonkeekim/hare-hate-speech.git. +
+
+ comment: Findings of EMNLP 2023; The first three authors contribute equally +
+
+
+
+
+ + ♻ ☆ In-Context Learning Functions with Varying Number of Minima + + +
+ Large Language Models (LLMs) have proven effective at In-Context Learning +(ICL), an ability that allows them to create predictors from labeled examples. +Few studies have explored the interplay between ICL and specific properties of +the functions it attempts to approximate. In our study, we use a formal framework +to explore ICL and propose a new task of approximating functions with a varying +number of minima. We implement a method that allows for producing functions +with given inputs as minima. We find that increasing the number of minima +degrades ICL performance. At the same time, our evaluation shows that ICL +outperforms a 2-layer Neural Network (2NN) model. Furthermore, ICL learns faster +than 2NN in all settings. We validate the findings through a set of few-shot +experiments across various hyperparameter configurations. +
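One simple way to construct a smooth function whose global minima are exactly a prescribed set of inputs (not necessarily the paper's construction) is a product of squared distances, as sketched below.

```python
import numpy as np

def function_with_minima(minima):
    """Return f(x) = prod_i (x - m_i)**2, which is >= 0 everywhere and
    equals 0 exactly at the prescribed points, so each m_i is a global
    minimum. Passing more points yields a function with more minima."""
    minima = np.asarray(minima, dtype=float)
    return lambda x: np.prod((np.asarray(x)[..., None] - minima) ** 2, axis=-1)

f = function_with_minima([-2.0, 0.5, 3.0])
xs = np.linspace(-3, 4, 7)
print(f(xs))           # small near the prescribed minima, large elsewhere
print(f(0.5), f(3.0))  # exactly 0 at the minima
```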
+
+
+
+
+ + ♻ ☆ Faithful Explanations of Black-box NLP Models Using LLM-generated + Counterfactuals + + +
+ Causal explanations of the predictions of NLP systems are essential to ensure +safety and establish trust. Yet, existing methods often fall short of +explaining model predictions effectively or efficiently and are often +model-specific. In this paper, we address model-agnostic explanations, +proposing two approaches for counterfactual (CF) approximation. The first +approach is CF generation, where a large language model (LLM) is prompted to +change a specific text concept while keeping confounding concepts unchanged. +While this approach is demonstrated to be very effective, applying LLM at +inference-time is costly. We hence present a second approach based on matching, +and propose a method that is guided by an LLM at training-time and learns a +dedicated embedding space. This space is faithful to a given causal graph and +effectively serves to identify matches that approximate CFs. After showing +theoretically that approximating CFs is required in order to construct faithful +explanations, we benchmark our approaches and explain several models, including +LLMs with billions of parameters. Our empirical results demonstrate the +excellent performance of CF generation models as model-agnostic explainers. +Moreover, our matching approach, which requires far less test-time resources, +also provides effective explanations, surpassing many baselines. We also find +that Top-K techniques universally improve every tested method. Finally, we +showcase the potential of LLMs in constructing new benchmarks for model +explanation and subsequently validate our conclusions. Our work illuminates new +pathways for efficient and accurate approaches to interpreting NLP systems. + +
+
+
+
+
+ + ♻ ☆ FreshLLMs: Refreshing Large Language Models with Search Engine + Augmentation + + +
+ Most large language models (LLMs) are trained once and never updated; thus, +they lack the ability to dynamically adapt to our ever-changing world. In this +work, we perform a detailed study of the factuality of LLM-generated text in +the context of answering questions that test current world knowledge. +Specifically, we introduce FreshQA, a novel dynamic QA benchmark encompassing a +diverse range of question and answer types, including questions that require +fast-changing world knowledge as well as questions with false premises that +need to be debunked. We benchmark a diverse array of both closed and +open-source LLMs under a two-mode evaluation procedure that allows us to +measure both correctness and hallucination. Through human evaluations involving +more than 50K judgments, we shed light on limitations of these models and +demonstrate significant room for improvement: for instance, all models +(regardless of model size) struggle on questions that involve fast-changing +knowledge and false premises. Motivated by these results, we present +FreshPrompt, a simple few-shot prompting method that substantially boosts the +performance of an LLM on FreshQA by incorporating relevant and up-to-date +information retrieved from a search engine into the prompt. Our experiments +show that FreshPrompt outperforms both competing search engine-augmented +prompting methods such as Self-Ask (Press et al., 2022) as well as commercial +systems such as Perplexity.AI. Further analysis of FreshPrompt reveals that +both the number of retrieved evidences and their order play a key role in +influencing the correctness of LLM-generated answers. Additionally, instructing +the LLM to generate concise and direct answers helps reduce hallucination +compared to encouraging more verbose answers. To facilitate future work, we +release FreshQA at github.com/freshllms/freshqa and commit to updating it at +regular intervals. + +
+
+ comment: Preprint, 26 pages, 10 figures, 5 tables; Added FreshEval +
+
+
+
+
+ + ♻ ☆ Lifelong Sequence Generation with Dynamic Module Expansion and + Adaptation + + +
+ Lifelong sequence generation (LSG), a problem in continual learning, aims to +continually train a model on a sequence of generation tasks to learn constantly +emerging new generation patterns while avoiding the forgetting of previous +knowledge. Existing LSG methods mainly focus on maintaining old knowledge while +paying little attention to knowledge transfer across tasks. In contrast, humans +can better learn new tasks by leveraging previously acquired knowledge from +similar tasks. Inspired by the learning paradigm of humans, we propose Dynamic +Module Expansion and Adaptation (DMEA), which enables the model to dynamically +determine the architecture for acquiring new knowledge based on task +correlation and select the most similar previous tasks to facilitate adaptation +to new tasks. In addition, as the learning process can easily be biased towards +the current task which might cause more severe forgetting of previously learned +knowledge, we propose dynamic gradient scaling to balance the learning of the +current task and replayed tasks. With extensive experiments, we demonstrate +that DMEA can consistently outperform existing methods in different LSG +settings. + +
+
+
+
+
+ + ♻ ☆ On the Representational Capacity of Recurrent Neural Language Models EMNLP 2023 + + +
+ This work investigates the computational expressivity of language models +(LMs) based on recurrent neural networks (RNNs). Siegelmann and Sontag (1992) +famously showed that RNNs with rational weights and hidden states and unbounded +computation time are Turing complete. However, LMs define weightings over +strings in addition to just (unweighted) language membership and the analysis +of the computational power of RNN LMs (RLMs) should reflect this. We extend the +Turing completeness result to the probabilistic case, showing how a rationally +weighted RLM with unbounded computation time can simulate any deterministic +probabilistic Turing machine (PTM) with rationally weighted transitions. Since, +in practice, RLMs work in real-time, processing a symbol at every time step, we +treat the above result as an upper bound on the expressivity of RLMs. We also +provide a lower bound by showing that under the restriction to real-time +computation, such models can simulate deterministic real-time rational PTMs. + +
+
+ comment: To be published at EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ WOT-Class: Weakly Supervised Open-world Text Classification CIKM 2023 + + +
+ State-of-the-art weakly supervised text classification methods, while +significantly reducing the required human supervision, still require +supervision to cover all the classes of interest. This is rarely easy to meet in +practice when humans explore new, large corpora without a complete picture. In +this paper, we work on a novel yet important problem of weakly supervised +open-world text classification, where supervision is only needed for a few +examples from a few known classes and the machine should handle both known and +unknown classes at test time. General open-world classification has been +studied mostly using image classification; however, existing methods typically +assume the availability of sufficient known-class supervision and strong +unknown-class prior knowledge (e.g., the number and/or data distribution). We +propose a novel framework, WOT-Class, that lifts those strong assumptions. +Specifically, it follows an iterative process of (a) clustering text into new +classes, (b) mining and ranking indicative words for each class, and (c) +merging redundant classes by using the overlapping indicative words as a bridge. +Extensive experiments on 7 popular text classification datasets demonstrate +that WOT-Class consistently outperforms strong baselines by a large margin, +attaining 23.33% greater average absolute macro-F1 over existing approaches +across all datasets. Such competent accuracy illuminates the practical +potential of further reducing human effort for text classification. +
+
+ comment: Accepted by CIKM 2023 +
+
+
+
+
+ + ♻ ☆ The Song Describer Dataset: a Corpus of Audio Captions for + Music-and-Language Evaluation NeurIPS 2023 + + +
+ We introduce the Song Describer dataset (SDD), a new crowdsourced corpus of +high-quality audio-caption pairs, designed for the evaluation of +music-and-language models. The dataset consists of 1.1k human-written natural +language descriptions of 706 music recordings, all publicly accessible and +released under Creative Common licenses. To showcase the use of our dataset, we +benchmark popular models on three key music-and-language tasks (music +captioning, text-to-music generation and music-language retrieval). Our +experiments highlight the importance of cross-dataset evaluation and offer +insights into how researchers can use SDD to gain a broader understanding of +model performance. + +
+
+ comment: Accepted to NeurIPS 2023 Workshop on Machine Learning for Audio +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 103 + +
+
+
+ + ☆ Retrieval-Augmented Layout Transformer for Content-Aware Layout + Generation + + +
+ Content-aware graphic layout generation aims to automatically arrange visual +elements along with a given content, such as an e-commerce product image. In +this paper, we argue that the current layout generation approaches suffer from +the limited training data for the high-dimensional layout structure. We show +that a simple retrieval augmentation can significantly improve the generation +quality. Our model, which is named Retrieval-Augmented Layout Transformer +(RALF), retrieves nearest neighbor layout examples based on an input image and +feeds these results into an autoregressive generator. Our model can apply +retrieval augmentation to various controllable generation tasks and yield +high-quality layouts within a unified architecture. Our extensive experiments +show that RALF successfully generates content-aware layouts in both constrained +and unconstrained settings and significantly outperforms the baselines. + +
+
+ comment: Webpage: https://udonda.github.io/RALF/ +
+
+
+
+
+ + ☆ Visual In-Context Prompting + + +
+ In-context prompting in large language models (LLMs) has become a prevalent +approach to improve zero-shot capabilities, but this idea is less explored in +the vision domain. Existing visual prompting methods focus on referring +segmentation to segment the most relevant object, falling short of addressing +many generic vision tasks like open-set segmentation and detection. In this +paper, we introduce a universal visual in-context prompting framework for both +tasks. In particular, we build on top of an encoder-decoder architecture, and +develop a versatile prompt encoder to support a variety of prompts like +strokes, boxes, and points. We further enhance it to take an arbitrary number +of reference image segments as the context. Our extensive explorations show +that the proposed visual in-context prompting elicits extraordinary referring +and generic segmentation capabilities to refer and detect, yielding competitive +performance to close-set in-domain datasets and showing promising results on +many open-set segmentation datasets. By joint training on COCO and SA-1B, our +model achieves $57.7$ PQ on COCO and $23.2$ PQ on ADE20K. Code will be +available at https://github.com/UX-Decoder/DINOv. + +
+
+ comment: technical report +
+
+
+
+
+ + ☆ ZipLoRA: Any Subject in Any Style by Effectively Merging LoRAs + + +
+ Methods for finetuning generative models for concept-driven personalization +generally achieve strong results for subject-driven or style-driven generation. +Recently, low-rank adaptations (LoRA) have been proposed as a +parameter-efficient way of achieving concept-driven personalization. While +recent work explores the combination of separate LoRAs to achieve joint +generation of learned styles and subjects, existing techniques do not reliably +address the problem; they often compromise either subject fidelity or style +fidelity. We propose ZipLoRA, a method to cheaply and effectively merge +independently trained style and subject LoRAs in order to achieve generation of +any user-provided subject in any user-provided style. Experiments on a wide +range of subject and style combinations show that ZipLoRA can generate +compelling results with meaningful improvements over baselines in subject and +style fidelity while preserving the ability to recontextualize. Project page: +https://ziplora.github.io + +
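A heavily simplified sketch of the underlying idea, merging two independently trained LoRA residuals with learnable mixing coefficients while penalizing overlap between the scaled updates, is given below; the coefficient granularity, the cosine-based penalty, and all names are illustrative assumptions rather than the paper's exact recipe.

```python
import torch

def merged_delta(dW_style, dW_subject, m_style, m_subject):
    """Combine two LoRA weight residuals with learnable mixing coefficients."""
    return m_style * dW_style + m_subject * dW_subject

def overlap_penalty(dW_style, dW_subject, m_style, m_subject):
    """Discourage the two scaled updates from occupying the same directions."""
    a = (m_style * dW_style).flatten(1)
    b = (m_subject * dW_subject).flatten(1)
    return torch.nn.functional.cosine_similarity(a, b, dim=1).abs().mean()

# toy usage: one weight matrix, per-column mixing coefficients (assumed granularity)
out_dim, in_dim = 64, 32
dW_s, dW_c = torch.randn(out_dim, in_dim), torch.randn(out_dim, in_dim)
m_s = torch.ones(1, in_dim, requires_grad=True)
m_c = torch.ones(1, in_dim, requires_grad=True)
loss = overlap_penalty(dW_s, dW_c, m_s, m_c)   # add task losses in practice
loss.backward()
delta = merged_delta(dW_s, dW_c, m_s, m_c)     # added to the frozen base weight
```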
+
+ comment: Project page: https://ziplora.github.io +
+
+
+
+
+ + ☆ T-Rex: Counting by Visual Prompting + + +
+ We introduce T-Rex, an interactive object counting model designed to first +detect and then count any objects. We formulate object counting as an open-set +object detection task with the integration of visual prompts. Users can specify +the objects of interest by marking points or boxes on a reference image, and +T-Rex then detects all objects with a similar pattern. Guided by the visual +feedback from T-Rex, users can also interactively refine the counting results +by prompting on missing or falsely-detected objects. T-Rex has achieved +state-of-the-art performance on several class-agnostic counting benchmarks. To +further exploit its potential, we established a new counting benchmark +encompassing diverse scenarios and challenges. Both quantitative and +qualitative results show that T-Rex possesses exceptional zero-shot counting +capabilities. We also present various practical application scenarios for +T-Rex, illustrating its potential in the realm of visual prompting. + +
+
+ comment: Technical report. Work in progress +
+
+
+
+
+ + ☆ XAGen: 3D Expressive Human Avatars Generation NeurIPS 2023 + + +
+ Recent advances in 3D-aware GAN models have enabled the generation of +realistic and controllable human body images. However, existing methods focus +on the control of major body joints, neglecting the manipulation of expressive +attributes, such as facial expressions, jaw poses, hand poses, and so on. In +this work, we present XAGen, the first 3D generative model for human avatars +capable of expressive control over body, face, and hands. To enhance the +fidelity of small-scale regions like face and hands, we devise a multi-scale +and multi-part 3D representation that models fine details. Based on this +representation, we propose a multi-part rendering technique that disentangles +the synthesis of body, face, and hands to ease model training and enhance +geometric quality. Furthermore, we design multi-part discriminators that +evaluate the quality of the generated avatars with respect to their appearance +and fine-grained control capabilities. Experiments show that XAGen surpasses +state-of-the-art methods in terms of realism, diversity, and expressive control +abilities. Code and data will be made available at +https://showlab.github.io/xagen. + +
+
+ comment: Accepted to NeurIPS 2023, Project Page at + https://showlab.github.io/xagen +
+
+
+
+
+ + ☆ WildFusion: Learning 3D-Aware Latent Diffusion Models in View Space + + +
+ Modern learning-based approaches to 3D-aware image synthesis achieve high +photorealism and 3D-consistent viewpoint changes for the generated images. +Existing approaches represent instances in a shared canonical space. However, +for in-the-wild datasets a shared canonical system can be difficult to define +or might not even exist. In this work, we instead model instances in view +space, alleviating the need for posed images and learned camera distributions. +We find that in this setting, existing GAN-based methods are prone to +generating flat geometry and struggle with distribution coverage. We hence +propose WildFusion, a new approach to 3D-aware image synthesis based on latent +diffusion models (LDMs). We first train an autoencoder that infers a compressed +latent representation, which additionally captures the images' underlying 3D +structure and enables not only reconstruction but also novel view synthesis. To +learn a faithful 3D representation, we leverage cues from monocular depth +prediction. Then, we train a diffusion model in the 3D-aware latent space, +thereby enabling synthesis of high-quality 3D-consistent image samples, +outperforming recent state-of-the-art GAN-based methods. Importantly, our +3D-aware LDM is trained without any direct supervision from multiview images or +3D geometry and does not require posed images or learned pose or camera +distributions. It directly learns a 3D representation without relying on +canonical camera coordinates. This opens up promising research avenues for +scalable 3D-aware image synthesis and 3D content creation from in-the-wild +image data. See https://katjaschwarz.github.io/wildfusion for videos of our 3D +results. + +
+
+
+
+
+ + ☆ Soulstyler: Using Large Language Model to Guide Image Style Transfer for + Target Object ICASSP2024 + + +
+ Image style transfer occupies an important place in both computer graphics +and computer vision. However, most current methods require reference to +stylized images and cannot individually stylize specific objects. To overcome +this limitation, we propose the "Soulstyler" framework, which allows users to +guide the stylization of specific objects in an image through simple textual +descriptions. We introduce a large language model to parse the text and +identify stylization goals and specific styles. Combined with a CLIP-based +semantic visual embedding encoder, the model understands and matches text and +image content. We also introduce a novel localized text-image block matching +loss that ensures that style transfer is performed only on specified target +objects, while non-target regions remain in their original style. Experimental +results demonstrate that our model is able to accurately perform style transfer +on target objects according to textual descriptions without affecting the style +of background regions. Our code will be available at +https://github.com/yisuanwang/Soulstyler. + +
+
+ comment: 5 pages,3 figures,ICASSP2024 +
+
+
+
+
+ + ☆ Transfer Learning-based Real-time Handgun Detection + + +
+ Traditional surveillance systems rely on human attention, limiting their +effectiveness. This study employs convolutional neural networks and transfer +learning to develop a real-time computer vision system for automatic handgun +detection. Comprehensive analysis of online handgun detection methods is +conducted, emphasizing reducing false positives and learning time. Transfer +learning is demonstrated as an effective approach. Despite technical +challenges, the proposed system achieves a precision rate of 84.74%, +demonstrating promising performance comparable to related works, enabling +faster learning and accurate automatic handgun detection for enhanced security. +This research advances security measures by reducing human monitoring +dependence, showcasing the potential of transfer learning-based approaches for +efficient and reliable handgun detection. + +
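A minimal sketch of the transfer-learning setup described here: start from an ImageNet-pretrained CNN, freeze the backbone, and train only a new classification head for the handgun/no-handgun decision. The backbone choice, class count, and hyperparameters are illustrative, not the paper's configuration.

```python
import torch
import torch.nn as nn
from torchvision import models

# ImageNet-pretrained backbone (weights enum requires torchvision >= 0.13),
# frozen except for a new 2-class head.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
for param in model.parameters():
    param.requires_grad = False
model.fc = nn.Linear(model.fc.in_features, 2)   # handgun / no handgun

optimizer = torch.optim.Adam(model.fc.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# one illustrative training step on a dummy batch
images = torch.randn(4, 3, 224, 224)
labels = torch.randint(0, 2, (4,))
optimizer.zero_grad()
loss = criterion(model(images), labels)
loss.backward()
optimizer.step()
```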
+
+
+
+
+ + ☆ ADriver-I: A General World Model for Autonomous Driving + + +
+ Typically, autonomous driving adopts a modular design, which divides the full +stack into perception, prediction, planning and control parts. Though +interpretable, such a modular design tends to introduce a substantial amount of +redundancy. Recently, multimodal large language models (MLLM) and diffusion +techniques have demonstrated superior performance in comprehension and +generation. In this paper, we first introduce the concept of the +interleaved vision-action pair, which unifies the format of visual features and +control signals. Based on the vision-action pairs, we construct a general world +model based on MLLM and diffusion model for autonomous driving, termed +ADriver-I. It takes the vision-action pairs as inputs and autoregressively +predicts the control signal of the current frame. The generated control signals +together with the historical vision-action pairs are further used as conditions to +predict the future frames. With the predicted next frame, ADriver-I performs +further control signal prediction. Such a process can be repeated indefinitely; +in this way, ADriver-I achieves autonomous driving in the world created by itself. +Extensive experiments are conducted on nuScenes and our large-scale private +datasets. ADriver-I shows impressive performance compared to several +constructed baselines. We hope our ADriver-I can provide some new insights for +future autonomous driving and embodied intelligence. +
+
+ comment: Tech Report +
+
+
+
+
+ + ☆ Medical Image Retrieval Using Pretrained Embeddings + + +
+ A wide range of imaging techniques and data formats available for medical +images make accurate retrieval from image databases challenging. + Efficient retrieval systems are crucial in advancing medical research, +enabling large-scale studies and innovative diagnostic tools. Thus, addressing +the challenges of medical image retrieval is essential for the continued +enhancement of healthcare and research. + In this study, we evaluated the feasibility of employing four +state-of-the-art pretrained models for medical image retrieval at modality, +body region, and organ levels and compared the results of two similarity +indexing approaches. Since the employed networks take 2D images, we analyzed +the impacts of weighting and sampling strategies to incorporate 3D information +during retrieval of 3D volumes. We showed that medical image retrieval is +feasible using pretrained networks without any additional training or +fine-tuning steps. Using pretrained embeddings, we achieved a recall of 1 for +various tasks at modality, body region, and organ level. + +
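A minimal sketch of retrieval with pretrained embeddings: L2-normalize the database embeddings once, then rank by cosine similarity to the query embedding. The feature extractor is omitted and all names are illustrative; the study itself compares several pretrained networks and indexing strategies.

```python
import numpy as np

def build_index(embeddings: np.ndarray) -> np.ndarray:
    """L2-normalize database embeddings so a dot product equals cosine similarity."""
    return embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)

def retrieve(index: np.ndarray, query_emb: np.ndarray, top_k: int = 5):
    """Return indices of the top_k most similar database images."""
    q = query_emb / np.linalg.norm(query_emb)
    sims = index @ q
    return np.argsort(-sims)[:top_k], sims

# toy usage: 1000 database images with 512-d pretrained embeddings
rng = np.random.default_rng(0)
db = rng.normal(size=(1000, 512))
index = build_index(db)
ids, sims = retrieve(index, rng.normal(size=512), top_k=3)
print(ids, sims[ids])
```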
+
+ comment: 8 pages, 3 figures, 4 tables +
+
+
+
+
+ + ☆ DiffusionMat: Alpha Matting as Sequential Refinement Learning + + +
+ In this paper, we introduce DiffusionMat, a novel image matting framework +that employs a diffusion model for the transition from coarse to refined alpha +mattes. Diverging from conventional methods that utilize trimaps merely as +loose guidance for alpha matte prediction, our approach treats image matting as +a sequential refinement learning process. This process begins with the addition +of noise to trimaps and iteratively denoises them using a pre-trained diffusion +model, which incrementally guides the prediction towards a clean alpha matte. +The key innovation of our framework is a correction module that adjusts the +output at each denoising step, ensuring that the final result is consistent +with the input image's structures. We also introduce the Alpha Reliability +Propagation, a novel technique designed to maximize the utility of available +guidance by selectively enhancing the trimap regions with confident alpha +information, thus simplifying the correction task. To train the correction +module, we devise specialized loss functions that target the accuracy of the +alpha matte's edges and the consistency of its opaque and transparent regions. +We evaluate our model across several image matting benchmarks, and the results +indicate that DiffusionMat consistently outperforms existing methods. Project +page at https://cnnlstm.github.io/DiffusionMat. +
+
+
+
+
+ + ☆ Leveraging CNNs and Ensemble Learning for Automated Disaster Image + Classification SC + + +
+ Natural disasters act as a serious threat globally, requiring effective and +efficient disaster management and recovery. This paper focuses on classifying +natural disaster images using Convolutional Neural Networks (CNNs). Multiple +CNN architectures were built and trained on a dataset containing images of +earthquakes, floods, wildfires, and volcanoes. A stacked CNN ensemble approach +proved to be the most effective, achieving 95% accuracy and an F1 score going +up to 0.96 for individual classes. Tuning hyperparameters of individual models +for optimization was critical to maximize the models' performance. The stacking +of CNNs with XGBoost acting as the meta-model utilizes the strengths of the CNN +and ResNet models to improve the overall accuracy of the classification. +Results obtained from the models illustrated the potency of CNN-based models +for automated disaster image classification. This lays the foundation for +expanding these techniques to build robust systems for disaster response, +damage assessment, and recovery management. + +
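A rough sketch of the stacking step: the class-probability outputs of the base CNNs on a held-out split become the features of an XGBoost meta-model. Base-model training is omitted and the shapes, class count, and hyperparameters are illustrative.

```python
import numpy as np
from xgboost import XGBClassifier

# Suppose two base CNNs already produce class probabilities on a held-out
# split (4 disaster classes). Stacking concatenates them as meta-features.
rng = np.random.default_rng(0)
n_val, n_classes = 800, 4
probs_cnn = rng.dirichlet(np.ones(n_classes), size=n_val)
probs_resnet = rng.dirichlet(np.ones(n_classes), size=n_val)
y_val = rng.integers(0, n_classes, size=n_val)

meta_X = np.hstack([probs_cnn, probs_resnet])          # (n_val, 8)
meta_model = XGBClassifier(n_estimators=200, max_depth=3, learning_rate=0.1)
meta_model.fit(meta_X, y_val)

# at inference: run both CNNs, concatenate their probabilities, then predict
test_features = np.hstack([probs_cnn[:5], probs_resnet[:5]])
print(meta_model.predict(test_features))
```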
+
+ comment: 13 pages, 11 figures, 4 tables, ICSISCET 2023 Conference +
+
+
+
+
+ + ☆ Hybrid Whale-Mud-Ring Optimization for Precise Color Skin Cancer Image + Segmentation + + +
+ Timely identification and treatment of rapidly progressing skin cancers can +significantly contribute to the preservation of patients' health and +well-being. Dermoscopy, a dependable and accessible tool, plays a pivotal role +in the initial stages of skin cancer detection. Consequently, the effective +processing of digital dermoscopy images holds significant importance in +elevating the accuracy of skin cancer diagnoses. Multilevel thresholding is a +key tool in medical imaging that extracts objects within the image to +facilitate its analysis. In this paper, an enhanced version of the Mud Ring +Algorithm hybridized with the Whale Optimization Algorithm, named WMRA, is +proposed. The proposed approach utilizes bubble-net attack and mud ring +strategy to overcome stagnation in local optima and obtain optimal thresholds. +The experimental results show that WMRA is powerful against a cluster of recent +methods in terms of fitness, Peak Signal to Noise Ratio (PSNR), and Mean Square +Error (MSE). + +
+
+
+
+
+ + ☆ Deep-learning-based acceleration of MRI for radiotherapy planning of + pediatric patients with brain tumors + + +
+ Magnetic Resonance Imaging (MRI) is a non-invasive diagnostic and +radiotherapy (RT) planning tool, offering detailed insights into the anatomy of +the human body. The extensive scan time is stressful for patients, who must +remain motionless in a prolonged imaging procedure that prioritizes reduction +of imaging artifacts. This is challenging for pediatric patients who may +require measures for managing voluntary motions such as anesthesia. Several +computational approaches reduce scan time (fast MRI), by recording fewer +measurements and digitally recovering full information via post-acquisition +reconstruction. However, most fast MRI approaches were developed for diagnostic +imaging, without addressing reconstruction challenges specific to RT planning. +In this work, we developed a deep learning-based method (DeepMRIRec) for MRI +reconstruction from undersampled data acquired with RT-specific receiver coil +arrangements. We evaluated our method against fully sampled data of T1-weighted +MR images acquired from 73 children with brain tumors/surgical beds using loop +and posterior coils (12 channels), with and without applying virtual +compression of coil elements. DeepMRIRec reduced scanning time by a factor of +four producing a structural similarity score surpassing the evaluated +state-of-the-art method (0.960 vs 0.896), thereby demonstrating its potential +for accelerating MRI scanning for RT planning. + +
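To illustrate the fast-MRI setting addressed here, the sketch below retrospectively undersamples k-space with a Cartesian line mask and forms the zero-filled reconstruction that a reconstruction network would take as input; the mask pattern and acceleration factor are illustrative.

```python
import numpy as np

def undersample(image: np.ndarray, acceleration: int = 4, center_lines: int = 16):
    """Simulate accelerated acquisition: keep every `acceleration`-th k-space
    line plus a fully sampled low-frequency band, then zero-fill reconstruct."""
    kspace = np.fft.fftshift(np.fft.fft2(image))
    mask = np.zeros(image.shape[0], dtype=bool)
    mask[::acceleration] = True
    mid = image.shape[0] // 2
    mask[mid - center_lines // 2: mid + center_lines // 2] = True
    kspace_us = kspace * mask[:, None]                 # zero out unsampled lines
    zero_filled = np.abs(np.fft.ifft2(np.fft.ifftshift(kspace_us)))
    return zero_filled, mask

# toy phantom: a bright rectangle on a dark background
phantom = np.zeros((128, 128))
phantom[32:96, 48:80] = 1.0
recon, mask = undersample(phantom)
print(mask.mean())          # fraction of k-space lines actually acquired
```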
+
+
+
+
+ + ☆ SkeletonGait: Gait Recognition Using Skeleton Maps + + +
+ The choice of representations is essential for deep gait recognition +methods. The binary silhouettes and skeletal coordinates are two dominant +representations in recent literature, achieving remarkable advances in many +scenarios. However, inherent challenges remain: silhouettes are not +always guaranteed in unconstrained scenes, and structural cues from skeletons have not been +fully utilized. In this paper, we introduce a novel skeletal +gait representation named Skeleton Map, together with SkeletonGait, a +skeleton-based method to exploit structural information from human skeleton +maps. Specifically, the skeleton map represents the coordinates of human joints +as a heatmap with Gaussian approximation, exhibiting a silhouette-like image +devoid of exact body structure. Beyond achieving state-of-the-art performance +on five popular gait datasets, more importantly, SkeletonGait uncovers novel +insights about how important structural features are in describing gait and +when they play a role. Furthermore, we propose a multi-branch architecture, +named SkeletonGait++, to make use of complementary features from both skeletons +and silhouettes. Experiments indicate that SkeletonGait++ outperforms existing +state-of-the-art methods by a significant margin in various scenarios. For +instance, it achieves an impressive rank-1 accuracy of over $85\%$ on the +challenging GREW dataset. All the source code will be available at +https://github.com/ShiqiYu/OpenGait. +
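A minimal sketch of the skeleton-map idea, rendering joint coordinates as a sum of Gaussian bumps, is shown below; the resolution and sigma are illustrative, not the paper's settings.

```python
import numpy as np

def skeleton_map(joints_xy: np.ndarray, size: int = 64, sigma: float = 2.0):
    """Render 2D joint coordinates (in [0, 1]) as a Gaussian heatmap.

    joints_xy: (J, 2) normalized (x, y) joint positions.
    Returns a (size, size) map: the sum of one Gaussian bump per joint.
    """
    ys, xs = np.mgrid[0:size, 0:size]
    heat = np.zeros((size, size), dtype=np.float32)
    for x, y in joints_xy * (size - 1):
        heat += np.exp(-((xs - x) ** 2 + (ys - y) ** 2) / (2 * sigma ** 2))
    return heat / heat.max()

# toy skeleton with four joints
joints = np.array([[0.5, 0.1], [0.5, 0.3], [0.35, 0.6], [0.65, 0.6]])
print(skeleton_map(joints).shape)
```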
+
+
+
+
+ + ☆ Guided Flows for Generative Modeling and Decision Making + + +
+ Classifier-free guidance is a key component for improving the performance of +conditional generative models for many downstream tasks. It drastically +improves the quality of samples produced, but has so far only been used for +diffusion models. Flow Matching (FM), an alternative simulation-free approach, +trains Continuous Normalizing Flows (CNFs) based on regressing vector fields. +It remains an open question whether classifier-free guidance can be performed +for Flow Matching models, and to what extent it improves performance. In +this paper, we explore the usage of Guided Flows for a variety of downstream +applications involving conditional image generation, speech synthesis, and +reinforcement learning. In particular, we are the first to apply flow models to +the offline reinforcement learning setting. We also show that Guided Flows +significantly improve sample quality in image generation and zero-shot +text-to-speech synthesis, and can make use of drastically lower amounts of +computation without affecting the agent's overall performance. +
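Classifier-free guidance carries over to flow models by steering the learned vector field toward its conditional prediction during ODE integration; a minimal Euler-integration sketch follows, where `velocity_model` is a placeholder and the guidance scale and step count are illustrative.

```python
import torch

@torch.no_grad()
def sample_guided_flow(velocity_model, cond, guidance_scale=2.0,
                       steps=50, shape=(1, 3, 32, 32)):
    """Euler integration of dx/dt = v(x, t) with classifier-free guidance:
    v = v_uncond + w * (v_cond - v_uncond).

    `velocity_model(x, t, cond)` is a placeholder; passing cond=None is
    assumed to give the unconditional prediction (made possible by
    condition dropout at training time).
    """
    x = torch.randn(shape)                     # start from the prior
    dt = 1.0 / steps
    for i in range(steps):
        t = torch.full((shape[0],), i * dt)
        v_cond = velocity_model(x, t, cond)
        v_uncond = velocity_model(x, t, None)
        v = v_uncond + guidance_scale * (v_cond - v_uncond)
        x = x + dt * v                         # Euler step toward t = 1
    return x

# toy usage with a dummy velocity field
dummy = lambda x, t, c: -x
print(sample_guided_flow(dummy, cond=torch.zeros(1)).shape)
```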
+
+
+
+
+ + ☆ PG-Video-LLaVA: Pixel Grounding Large Video-Language Models + + +
+ Extending image-based Large Multimodal Models (LMM) to videos is challenging +due to the inherent complexity of video data. The recent approaches extending +image-based LMM to videos either lack the grounding capabilities (e.g., +VideoChat, Video-ChatGPT, Video-LLaMA) or do not utilize the audio-signals for +better video understanding (e.g., Video-ChatGPT). Addressing these gaps, we +propose Video-LLaVA, the first LMM with pixel-level grounding capability, +integrating audio cues by transcribing them into text to enrich video-context +understanding. Our framework uses an off-the-shelf tracker and a novel +grounding module, enabling it to spatially and temporally localize objects in +videos following user instructions. We evaluate Video-LLaVA using video-based +generative and question-answering benchmarks and introduce new benchmarks +specifically designed to measure prompt-based object grounding performance in +videos. Further, we propose the use of Vicuna over GPT-3.5, as utilized in +Video-ChatGPT, for video-based conversation benchmarking, ensuring +reproducibility of results which is a concern with the proprietary nature of +GPT-3.5. Our framework builds on SoTA image-based LLaVA model and extends its +advantages to the video domain, delivering promising gains on video-based +conversation and grounding tasks. Project Page: +https://github.com/mbzuai-oryx/Video-LLaVA + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ CompenHR: Efficient Full Compensation for High-resolution Projector + + +
+ Full projector compensation is a practical task of projector-camera systems. +It aims to find a projector input image, named compensation image, such that +when projected it cancels the geometric and photometric distortions due to the +physical environment and hardware. State-of-the-art methods use deep learning +to address this problem and show promising performance for low-resolution +setups. However, directly applying deep learning to high-resolution setups is +impractical due to the long training time and high memory cost. To address this +issue, this paper proposes a practical full compensation solution. Firstly, we +design an attention-based grid refinement network to improve geometric +correction quality. Secondly, we integrate a novel sampling scheme into an +end-to-end compensation network to alleviate computation and introduce +attention blocks to preserve key features. Finally, we construct a benchmark +dataset for high-resolution projector full compensation. In experiments, our +method demonstrates clear advantages in both efficiency and quality. + +
+
+
+
+
+ + ☆ Animatable 3D Gaussians for High-fidelity Synthesis of Human Motions + + +
+ We present a novel animatable 3D Gaussian model for rendering high-fidelity +free-view human motions in real time. Compared to existing NeRF-based methods, +the model is better at synthesizing high-frequency details without +the jittering problem across video frames. The core of our model is a novel +augmented 3D Gaussian representation, which attaches a learnable code to each +Gaussian. The learnable code serves as a pose-dependent appearance +embedding for refining the erroneous appearance caused by geometric +transformation of Gaussians, based on which an appearance refinement model is +learned to produce residual Gaussian properties to match the appearance in the +target pose. To force the Gaussians to learn the foreground human only without +background interference, we further design a novel alpha loss to explicitly +constrain the Gaussians within the human body. We also propose to jointly +optimize the human joint parameters to improve the appearance accuracy. The +animatable 3D Gaussian model can be learned with shallow MLPs, so new human +motions can be synthesized in real time (66 fps on average). Experiments show +that our model has superior performance over NeRF-based methods. + +
+
+
+
+
+ + ☆ Depth-Regularized Optimization for 3D Gaussian Splatting in Few-Shot + Images + + +
+ In this paper, we present a method to optimize Gaussian splatting with a +limited number of images while avoiding overfitting. Representing a 3D scene by +combining numerous Gaussian splats has yielded outstanding visual quality. +However, it tends to overfit the training views when only a small number of +images are available. To address this issue, we introduce a dense depth map as +a geometry guide to mitigate overfitting. We obtain the depth map using a +pre-trained monocular depth estimation model and align its scale and offset +using sparse COLMAP feature points. The adjusted depth aids in the color-based +optimization of 3D Gaussian splatting, mitigating floating artifacts and +ensuring adherence to geometric constraints. We verify the proposed method on +the NeRF-LLFF dataset with varying (small) numbers of images. Our approach +demonstrates robust geometry compared to the original method, which relies solely +on images. + +
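A small sketch of the scale-and-offset alignment step: the relative monocular depth is fit to sparse COLMAP depths before it regularizes the splatting optimization. A plain least-squares fit is an assumption here; the paper may use a more robust variant.

```python
import numpy as np

def align_depth(mono_depth, sparse_uv, sparse_depth):
    """Fit scale a and offset b so that a * mono_depth + b matches sparse depth.

    mono_depth: (H, W) relative depth from a monocular network.
    sparse_uv:  (N, 2) integer pixel coordinates (x, y) of COLMAP feature points.
    sparse_depth: (N,) depths of those points in the camera frame.
    """
    d = mono_depth[sparse_uv[:, 1], sparse_uv[:, 0]]   # sample mono depth at features
    A = np.stack([d, np.ones_like(d)], axis=1)         # [d, 1] design matrix
    (a, b), *_ = np.linalg.lstsq(A, sparse_depth, rcond=None)
    return a * mono_depth + b

# Toy check: a depth map off by scale 2 and offset 0.5 is recovered.
mono = np.random.rand(48, 64)
uv = np.stack([np.random.randint(0, 64, 30), np.random.randint(0, 48, 30)], axis=1)
gt = 2.0 * mono[uv[:, 1], uv[:, 0]] + 0.5
print(np.allclose(align_depth(mono, uv, gt)[uv[:, 1], uv[:, 0]], gt))  # True
```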
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ SegVol: Universal and Interactive Volumetric Medical Image Segmentation + + +
+ Precise image segmentation provides clinical study with meaningful and +well-structured information. Despite the remarkable progress achieved in +medical image segmentation, there is still an absence of foundation +segmentation model that can segment a wide range of anatomical categories with +easy user interaction. In this paper, we propose a universal and interactive +volumetric medical image segmentation model, named SegVol. By training on 90k +unlabeled Computed Tomography (CT) volumes and 6k labeled CTs, this foundation +model supports the segmentation of over 200 anatomical categories using +semantic and spatial prompts. Extensive experiments verify that SegVol +outperforms the state of the art by a large margin on multiple segmentation +benchmarks. Notably, on three challenging lesion datasets, our method achieves +around 20% higher Dice score than nnU-Net. The model and data are publicly +available at: https://github.com/BAAI-DCAI/SegVol. + +
+
+
+
+
+ + ☆ LucidDreamer: Domain-free Generation of 3D Gaussian Splatting Scenes + + +
+ With the widespread use of VR devices and content, demand for 3D scene +generation techniques is growing. Existing 3D scene generation models, +however, limit the target scene to a specific domain, primarily because their +training strategies use 3D scan datasets that are far from real-world scenes. To +address this limitation, we propose LucidDreamer, a domain-free scene +generation pipeline that fully leverages the power of an existing large-scale +diffusion-based generative model. LucidDreamer has two alternating steps: +Dreaming and Alignment. First, to generate multi-view consistent images from +inputs, we set the point cloud as a geometrical guideline for each image +generation. Specifically, we project a portion of the point cloud to the desired +view and provide the projection as guidance for inpainting using the +generative model. The inpainted images are lifted to 3D space with estimated +depth maps, composing new points. Second, to aggregate the new points into +the 3D scene, we propose an aligning algorithm which harmoniously integrates +the portions of newly generated 3D scenes. The final 3D scene serves +as initial points for optimizing Gaussian splats. LucidDreamer produces +Gaussian splats that are highly detailed compared to previous 3D scene +generation methods, with no constraint on the domain of the target scene. + +
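A rough sketch of the projection-as-guidance step: a colored point cloud is splatted into a target view, and the empty pixels define the region an inpainting model would fill. The pinhole intrinsics, nearest-pixel splatting, and omission of the diffusion inpainter itself are simplifying assumptions.

```python
import numpy as np

def project_points(points, colors, K, H, W):
    """Project a colored point cloud (camera coordinates) into a target view.

    Returns a guidance image (splatted colors) and a boolean mask that is True
    where no point landed, i.e. the pixels left for inpainting. Uses a simple
    depth buffer; a real pipeline would handle occlusion and filtering better.
    """
    img = np.zeros((H, W, 3), dtype=np.float32)
    depth = np.full((H, W), np.inf, dtype=np.float32)
    uvw = (K @ points.T).T                         # homogeneous pixel coordinates
    z = uvw[:, 2]
    valid = z > 1e-6
    u = np.round(uvw[valid, 0] / z[valid]).astype(int)
    v = np.round(uvw[valid, 1] / z[valid]).astype(int)
    inside = (u >= 0) & (u < W) & (v >= 0) & (v < H)
    for ui, vi, zi, ci in zip(u[inside], v[inside], z[valid][inside], colors[valid][inside]):
        if zi < depth[vi, ui]:                     # keep the closest point per pixel
            depth[vi, ui], img[vi, ui] = zi, ci
    return img, ~np.isfinite(depth)

K = np.array([[100.0, 0, 32], [0, 100.0, 32], [0, 0, 1]])   # toy intrinsics
pts = np.random.randn(500, 3) * 0.2 + np.array([0.0, 0.0, 2.0])
cols = np.random.rand(500, 3)
guidance, inpaint_mask = project_points(pts, cols, K, 64, 64)
print(guidance.shape, float(inpaint_mask.mean()))
```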
+
+
+
+
+ + ☆ Point Projection Mapping System for Tracking, Registering, Labeling and + Validating Optical Tissue Measurements + + +
+ Validation of newly developed optical tissue sensing techniques for tumor +detection during cancer surgery requires an accurate correlation with +histological results. Additionally, such accurate correlation facilitates +precise data labeling for developing high-performance machine-learning tissue +classification models. In this paper, a newly developed Point Projection +Mapping system will be introduced, which allows non-destructive tracking of the +measurement locations on tissue specimens. Additionally, a framework for +accurate registration, validation, and labeling with histopathology results is +proposed and validated on a case study. The proposed framework provides a more +robust and accurate method for tracking and validation of optical tissue +sensing techniques, which saves time and resources compared to conventional +techniques available. + +
+
+
+
+
+ + ☆ MRGazer: Decoding Eye Gaze Points from Functional Magnetic Resonance + Imaging in Individual Space + + +
+ Eye-tracking research has proven valuable in understanding numerous cognitive +functions. Recently, Frey et al. provided an exciting deep learning method for +learning eye movements from fMRI data. However, it needed to co-register fMRI +into standard space to obtain eyeball masks, and thus required additional +templates and was time consuming. To resolve this issue, in this paper, we +propose a framework named MRGazer for predicting eye gaze points from fMRI in +individual space. MRGazer consists of an eyeball extraction module and a +residual network-based eye gaze prediction module. Compared to the previous method, +the proposed framework skips the fMRI co-registration step, simplifies the +processing protocol, and achieves end-to-end eye gaze regression. The proposed +method achieved superior performance on a variety of eye movement tasks compared to +the co-registration-based method, and delivered objective results within a +shorter time (~0.02 seconds per volume) than the prior method (~0.3 seconds +per volume). + +
+
+
+
+
+ + ☆ Unified Classification and Rejection: A One-versus-All Framework + + +
+ Classifying patterns of known classes and rejecting ambiguous and novel (also +called out-of-distribution (OOD)) inputs are both involved in open-world pattern +recognition. Deep neural network models usually excel in closed-set +classification while performing poorly in rejecting OOD inputs. To tackle this +problem, numerous methods have been designed to perform open set recognition +(OSR) or OOD rejection/detection tasks. Previous methods mostly rely on +post-training score transformations or hybrid models to ensure low scores on OOD +inputs while separating known classes. In this paper, we build a +unified framework for open set classifiers that covers both classification and +OOD rejection. We formulate open set recognition with $ K $ known classes as a +$ (K + 1) $-class classification problem, with the model trained on known-class +samples only. By decomposing the $ K $-class problem into $ K $ one-versus-all +(OVA) binary classification tasks and binding some parameters, we show that +combining the scores of OVA classifiers can give $ (K + 1) $-class posterior +probabilities, which enables classification and OOD rejection in a unified +framework. To maintain the closed-set classification accuracy of the OVA-trained +classifier, we propose a hybrid training strategy combining OVA loss +and multi-class cross-entropy loss. We implement the OVA framework and hybrid +training strategy on the recently proposed convolutional prototype network. +Experiments on popular OSR and OOD detection datasets demonstrate that the +proposed framework, using a single multi-class classifier, yields competitive +performance in closed-set classification, OOD detection, and misclassification +detection. + +
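The sketch below shows one plausible way to combine K one-versus-all scores into (K+1)-class probabilities, with the extra class acting as the reject/OOD class; the exact parameter binding and score combination used in the paper may differ from this simple product form.

```python
import numpy as np

def ova_to_k_plus_1(logits):
    """Combine K one-vs-all logits into (K+1)-class probabilities (sketch).

    Class k corresponds to "head k positive, all other heads negative"; the
    extra class corresponds to "all heads negative" and serves as rejection.
    """
    s = 1.0 / (1.0 + np.exp(-np.asarray(logits, dtype=np.float64)))  # sigmoids
    reject = np.prod(1.0 - s)
    accept = np.array([s[k] * np.prod(np.delete(1.0 - s, k)) for k in range(len(s))])
    p = np.append(accept, reject)
    return p / p.sum()

print(ova_to_k_plus_1([4.0, -3.0, -2.0]).round(3))   # mass on class 0
print(ova_to_k_plus_1([-3.0, -3.0, -3.0]).round(3))  # mass on the reject class
```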
+
+
+
+
+ + ☆ High-Quality Face Caricature via Style Translation + + +
+ Caricature is an exaggerated form of artistic portraiture that accentuates +unique yet subtle characteristics of human faces. Recently, advancements in +deep end-to-end techniques have yielded encouraging outcomes in capturing both +style and elevated exaggerations in creating face caricatures. Most of these +approaches, however, tend to produce cartoon-like results that are less practical +for real-world applications. In this study, we propose a high-quality, +unpaired face caricature method that is appropriate for use in the real world +and uses computer vision techniques and GAN models. We attain the exaggeration +of facial features and the stylization of appearance through a two-step +process: face caricature generation and face caricature projection. The face +caricature generation step creates new caricature face datasets from real +images and trains a generative model using the real and newly created +caricature datasets. The face caricature projection step employs an encoder, trained +on real and caricature faces, together with the pretrained generator to project real +and caricature faces. We perform an incremental facial exaggeration from the +real image to the caricature faces using the encoder and generator's latent +space. Our projection preserves the facial identity, attributes, and +expressions from the input image. It also accounts for facial occlusions, such +as reading glasses or sunglasses, to enhance the robustness of our model. +Furthermore, we conducted a comprehensive comparison of our approach with +various state-of-the-art face caricature methods, highlighting our process's +distinctiveness and exceptional realism. + +
+
+ comment: 14 pages, 21 figures +
+
+
+
+
+ + ☆ Quantum learning and essential cognition under the traction of + meta-characteristics in an open world + + +
+ Artificial intelligence has made significant progress on the Closed World +problem, being able to accurately recognize old knowledge through training and +classification. However, AI faces significant challenges in the Open World +problem, as it involves a new and unknown exploration journey. AI is not +inherently proactive in exploration, and its challenge lies in not knowing how +to approach and adapt to the unknown world. How do humans acquire knowledge of +the unknown world? Humans identify new knowledge through intrinsic cognition. +In the process of recognizing new colors, the cognitive cues are different from +known color features and involve hue, saturation, brightness, and other +characteristics. When AI encounters objects with different features in the new +world, it faces another challenge: what distinguishes the influential features +of new objects from those of old ones? AI often mistakes a new +world's brown bear for a known dog because it has not learned the differences +in feature distributions between knowledge systems. This is because things in +the new and old worlds have different units and dimensions for their features. +This paper proposes an open-world model and elemental feature system that +focuses on fundamentally recognizing the distribution differences in objective +features between the new and old worlds. The quantum tunneling effect of +learning ability between the new and old worlds is realized through the tractive +force of meta-characteristics. The outstanding performance of the model system +in learning new knowledge (using pedestrian re-identification datasets as an +example) demonstrates that AI has acquired the ability to recognize the new +world with an accuracy of up to $96.71\%$ and has gained the capability to +explore new knowledge, similar to humans. + +
+
+ comment: 8 pages,5 pages +
+
+
+
+
+ + ☆ Revisiting Supervision for Continual Representation Learning + + +
+ In the field of continual learning, models are designed to learn tasks one +after the other. While most research has centered on supervised continual +learning, recent studies have highlighted the strengths of self-supervised +continual representation learning. The improved transferability of +representations built with self-supervised methods is often associated with the +role played by the multi-layer perceptron projector. In this work, we depart +from this observation and reexamine the role of supervision in continual +representation learning. We reckon that additional information, such as human +annotations, should not deteriorate the quality of representations. Our +findings show that supervised models, when enhanced with a multi-layer +perceptron head, can outperform self-supervised models in continual +representation learning. + +
+
+
+
+
+ + ☆ Deep Learning for Vascular Segmentation and Applications in Phase + Contrast Tomography Imaging + + +
+ Automated blood vessel segmentation is vital for biomedical imaging, as +vessel changes indicate many pathologies. Still, precise segmentation is +difficult due to the complexity of vascular structures, anatomical variations +across patients, the scarcity of annotated public datasets, and the quality of +images. We present a thorough literature review, highlighting the state of +machine learning techniques across diverse organs. Our goal is to provide a +foundation on the topic and identify a robust baseline model for application to +vascular segmentation in a new imaging modality, Hierarchical Phase Contrast +Tomography (HiP CT). Introduced in 2020 at the European Synchrotron Radiation +Facility, HiP CT enables 3D imaging of complete organs at an unprecedented +resolution of ca. 20 µm per voxel, with the capability for localized zooms in +selected regions down to 1 µm per voxel without sectioning. We have created a +training dataset with double-annotator-validated vascular data from three +kidneys imaged with HiP CT in the context of the Human Organ Atlas Project. +Finally, utilising the nnU-Net model, we conduct experiments to assess the +model's performance on both familiar and unseen samples, employing +vessel-specific metrics. Our results show that while segmentations yielded reasonably +high scores, such as clDice values ranging from 0.82 to 0.88, certain errors +persisted. Large vessels that collapsed due to the lack of hydrostatic pressure +(HiP CT is an ex vivo technique) were segmented poorly. Moreover, decreased +connectivity in finer vessels and higher segmentation errors at vessel +boundaries were observed. Such errors obstruct the understanding of the +structures by interrupting vascular tree connectivity. Through our review and +outputs, we aim to set a benchmark for subsequent model evaluations using +various modalities, especially with the HiP CT imaging database. + +
+
+
+
+
+ + ☆ Recognition-Guided Diffusion Model for Scene Text Image Super-Resolution + + +
+ Scene Text Image Super-Resolution (STISR) aims to enhance the resolution and +legibility of text within low-resolution (LR) images, consequently elevating +recognition accuracy in Scene Text Recognition (STR). Previous methods +predominantly employ discriminative Convolutional Neural Networks (CNNs) +augmented with diverse forms of text guidance to address this issue. +Nevertheless, they remain deficient when confronted with severely blurred +images, due to their insufficient generation capability when little structural +or semantic information can be extracted from original images. Therefore, we +introduce RGDiffSR, a Recognition-Guided Diffusion model for scene text image +Super-Resolution, which exhibits great generative diversity and fidelity even +in challenging scenarios. Moreover, we propose a Recognition-Guided Denoising +Network, to guide the diffusion model generating LR-consistent results through +succinct semantic guidance. Experiments on the TextZoom dataset demonstrate the +superiority of RGDiffSR over prior state-of-the-art methods in both text +recognition accuracy and image fidelity. + +
+
+
+
+
+ + ☆ Rethinking Radiology Report Generation via Causal Reasoning and + Counterfactual Augmentation + + +
+ Radiology Report Generation (RRG) draws attention as an interaction between +the vision and language fields. Previous works inherited the ideology of +vision-to-language generation tasks, aiming to generate paragraphs with high +consistency as reports. However, one unique characteristic of RRG, the +independence between diseases, was neglected, leading to the injection of a +spurious confounder, i.e., the disease co-occurrence. Unfortunately, this +confounder further confuses the report generation process because of the +biased RRG data distribution. In this paper, to rethink this issue thoroughly, +we reason about its causes and effects from a novel perspective of statistics +and causality, where the Joint Vision Coupling and the Conditional Sentence +Coherence Coupling are two aspects prone to implicitly decrease the accuracy of +reports. Then, a counterfactual augmentation strategy that contains the +Counterfactual Sample Synthesis and the Counterfactual Report Reconstruction +sub-methods is proposed to break these two aspects of spurious effects. +Experimental results and further analyses on two widely used datasets justify +our reasoning and proposed methods. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Retargeting Visual Data with Deformation Fields + + +
+ Seam carving is an image editing method that enables content-aware resizing, +including operations like removing objects. However, the seam-finding strategy +based on dynamic programming or graph-cut limits its applications to broader +visual data formats and degrees of freedom for editing. Our observation is that +describing the editing and retargeting of images more generally by a +displacement field yields a generalisation of content-aware deformations. We +propose to learn a deformation with a neural network that keeps the output +plausible while trying to deform it only in places with low information +content. This technique applies to different kinds of visual data, including +images, 3D scenes given as neural radiance fields, or even polygon meshes. +Experiments conducted on different visual data show that our method achieves +better content-aware retargeting compared to previous methods. + +
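A tiny sketch of retargeting by a displacement field: an image is resampled through a per-pixel offset field. The field here is hand-made; in the method above it would be predicted by a network trained to keep the output plausible and to concentrate deformation in low-information regions.

```python
import numpy as np
from scipy.ndimage import map_coordinates

def warp_with_displacement(image, disp):
    """Backward-warp a grayscale image through a dense displacement field.

    image: (H, W) array; disp: (2, H, W) per-pixel (dy, dx) offsets.
    """
    H, W = image.shape
    ys, xs = np.mgrid[0:H, 0:W].astype(np.float32)
    coords = np.stack([ys + disp[0], xs + disp[1]])
    return map_coordinates(image, coords, order=1, mode="nearest")

img = np.tile(np.linspace(0.0, 1.0, 96), (64, 1))        # toy gradient image
disp = np.zeros((2, 64, 96), dtype=np.float32)
disp[1] = 10.0 * np.linspace(0.0, 1.0, 96)[None, :]      # squeeze content horizontally
print(warp_with_displacement(img, disp).shape)            # (64, 96)
```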
+
+
+
+
+ + ☆ FedFN: Feature Normalization for Alleviating Data Heterogeneity Problem + in Federated Learning NeurIPS + + +
+ Federated Learning (FL) is a collaborative method for training models while +preserving data privacy in decentralized settings. However, FL encounters +challenges related to data heterogeneity, which can result in performance +degradation. In our study, we observe that as data heterogeneity increases, +feature representation in the FedAVG model deteriorates more significantly +compared to classifier weight. Additionally, we observe that as data +heterogeneity increases, the gap between higher feature norms for observed +classes, obtained from local models, and feature norms of unobserved classes +widens, in contrast to the behavior of classifier weight norms. This widening +gap extends to encompass the feature norm disparities between local and the +global models. To address these issues, we introduce Federated Averaging with +Feature Normalization Update (FedFN), a straightforward learning method. We +demonstrate the superior performance of FedFN through extensive experiments, +even when applied to pretrained ResNet18. Subsequently, we confirm the +applicability of FedFN to foundation models. + +
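The abstract does not spell out the exact update rule, so the following is only a guess at its spirit: computing logits on L2-normalized features removes the class-dependent feature-norm gap described above. Treat it as an illustrative stand-in rather than FedFN itself.

```python
import numpy as np

def normalized_logits(features, W, scale=10.0):
    """Compute logits on L2-normalized features (illustrative sketch only).

    features: (B, D) penultimate activations; W: (D, K) classifier weights.
    Normalization removes per-sample feature-norm differences before scoring.
    """
    f = features / (np.linalg.norm(features, axis=1, keepdims=True) + 1e-8)
    return scale * (f @ W)

feats = np.random.randn(4, 16) * np.r_[np.full(8, 5.0), np.full(8, 0.1)]  # uneven norms
W = np.random.randn(16, 10)
print(normalized_logits(feats, W).shape)  # (4, 10)
```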
+
+ comment: NeurIPS Workshop: "Federated Learning in the Age of Foundation + Models" 2023 +
+
+
+
+
+ + ☆ CMFDFormer: Transformer-based Copy-Move Forgery Detection with Continual + Learning + + +
+ Copy-move forgery detection aims at detecting duplicated regions in a +suspected forged image, and deep learning based copy-move forgery detection +methods are in the ascendant. These deep learning based methods heavily rely on +synthetic training data, and the performance will degrade when facing new +tasks. In this paper, we propose a Transformer-style copy-move forgery +detection network named as CMFDFormer, and provide a novel PCSD (Pooled Cube +and Strip Distillation) continual learning framework to help CMFDFormer handle +new tasks. CMFDFormer consists of a MiT (Mix Transformer) backbone network and +a PHD (Pluggable Hybrid Decoder) mask prediction network. The MiT backbone +network is a Transformer-style network which is adopted on the basis of +comprehensive analyses with CNN-style and MLP-style backbones. The PHD network +is constructed based on self-correlation computation, hierarchical feature +integration, a multi-scale cycle fully-connected block and a mask +reconstruction block. The PHD network is applicable to feature extractors of +different styles for hierarchical multi-scale information extraction, achieving +comparable performance. Last but not least, we propose a PCSD continual +learning framework to improve the forgery detectability and avoid catastrophic +forgetting when handling new tasks. Our continual learning framework restricts +intermediate features from the PHD network, and takes advantage of both cube +pooling and strip pooling. Extensive experiments on publicly available datasets +demonstrate the good performance of CMFDFormer and the effectiveness of the +PCSD continual learning framework. + +
+
+ comment: 12 pages, 6 figures +
+
+
+
+
+ + ☆ Immunohistochemistry guided segmentation of benign epithelial cells, in + situ lesions, and invasive epithelial cells in breast cancer slides + + +
+ Digital pathology enables automatic analysis of histopathological sections +using artificial intelligence (AI). Automatic evaluation could improve +diagnostic efficiency and help find associations between morphological features +and clinical outcome. For development of such prediction models, identifying +invasive epithelial cells, and separating these from benign epithelial cells +and in situ lesions, would be the first step. In this study, we aimed to develop +an AI model for segmentation of epithelial cells in sections from breast +cancer. We generated epithelial ground truth masks by restaining hematoxylin +and eosin (HE) sections with cytokeratin (CK) AE1/AE3, and by pathologists' +annotations. HE/CK image pairs were used to train a convolutional neural +network, and data augmentation was used to make the model more robust. Tissue +microarrays (TMAs) from 839 patients, and whole slide images from two patients, +were used for training and evaluation of the models. The sections were derived +from four cohorts of breast cancer patients. TMAs from 21 patients from a fifth +cohort were used as a second test set. In quantitative evaluation, mean Dice +scores of 0.70, 0.79, and 0.75 for invasive epithelial cells, benign epithelial +cells, and in situ lesions, respectively, were achieved. In qualitative scoring +(0-5) by pathologists, results were best for all epithelium and invasive +epithelium, with scores of 4.7 and 4.4. Scores for benign epithelium and in +situ lesions were 3.7 and 2.0. The proposed model segmented epithelial cells in +HE-stained breast cancer slides well, but further work is needed for accurate +division between the classes. Immunohistochemistry, together with pathologists' +annotations, enabled the creation of accurate ground truths. The model is made +freely available in FastPathology and the code is available at +https://github.com/AICAN-Research/breast-epithelium-segmentation + +
+
+ comment: 19 pages, 6 figures. Submitted to a scientific journal +
+
+
+
+
+ + ☆ ViStruct: Visual Structural Knowledge Extraction via Curriculum Guided + Code-Vision Representation EMNLP 2023 + + +
+ State-of-the-art vision-language models (VLMs) still have limited performance +in structural knowledge extraction, such as relations between objects. In this +work, we present ViStruct, a training framework to learn VLMs for effective +visual structural knowledge extraction. Two novel designs are incorporated. +First, we propose to leverage the inherent structure of programming language to +depict visual structural information. This approach enables explicit and +consistent representation of visual structural information of multiple +granularities, such as concepts, relations, and events, in a well-organized +structured format. Second, we introduce curriculum-based learning for VLMs to +progressively comprehend visual structures, from fundamental visual concepts to +intricate event structures. Our intuition is that lower-level knowledge may +contribute to complex visual structure understanding. Furthermore, we compile +and release a collection of datasets tailored for visual structural knowledge +extraction. We adopt a weakly-supervised approach to directly generate visual +event structures from captions for ViStruct training, capitalizing on abundant +image-caption pairs from the web. In experiments, we evaluate ViStruct on +visual structure prediction tasks, demonstrating its effectiveness in improving +the understanding of visual structures. The code is public at +\url{https://github.com/Yangyi-Chen/vi-struct}. + +
+
+ comment: Accepted to EMNLP 2023 +
+
+
+
+
+ + ☆ DA-STC: Domain Adaptive Video Semantic Segmentation via Spatio-Temporal + Consistency + + +
+ Video semantic segmentation is a pivotal aspect of video representation +learning. However, significant domain shifts present a challenge in effectively +learning invariant spatio-temporal features across the labeled source domain +and unlabeled target domain for video semantic segmentation. To solve the +challenge, we propose a novel DA-STC method for domain adaptive video semantic +segmentation, which incorporates a bidirectional multi-level spatio-temporal +fusion module and a category-aware spatio-temporal feature alignment module to +facilitate consistent learning for domain-invariant features. Firstly, we +perform bidirectional spatio-temporal fusion at the image sequence level and +shallow feature level, leading to the construction of two fused intermediate +video domains. This prompts the video semantic segmentation model to +consistently learn spatio-temporal features of shared patch sequences which are +influenced by domain-specific contexts, thereby mitigating the feature gap +between the source and target domain. Secondly, we propose a category-aware +feature alignment module to promote the consistency of spatio-temporal +features, facilitating adaptation to the target domain. Specifically, we +adaptively aggregate the domain-specific deep features of each category along +spatio-temporal dimensions, which are further constrained to achieve +cross-domain intra-class feature alignment and inter-class feature separation. +Extensive experiments demonstrate the effectiveness of our method, which +achieves state-of-the-art mIOUs on multiple challenging benchmarks. +Furthermore, we extend the proposed DA-STC to the image domain, where it also +exhibits superior performance for domain adaptive semantic segmentation. The +source code and models will be made available at +\url{https://github.com/ZHE-SAPI/DA-STC}. + +
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+ + ☆ Towards Hetero-Client Federated Multi-Task Learning + + +
+ Federated Learning (FL) enables joint training across distributed clients +using their local data privately. Federated Multi-Task Learning (FMTL) builds +on FL to handle multiple tasks, assuming model congruity that identical model +architecture is deployed in each client. To relax this assumption and thus +extend real-world applicability, we introduce a novel problem setting, +Hetero-Client Federated Multi-Task Learning (HC-FMTL), to accommodate diverse +task setups. The main challenge of HC-FMTL is the model incongruity issue that +invalidates conventional aggregation methods. It also escalates the +difficulties in accurate model aggregation to deal with data and task +heterogeneity inherent in FMTL. To address these challenges, we propose the +FedHCA$^2$ framework, which allows for federated training of personalized +models by modeling relationships among heterogeneous clients. Drawing on our +theoretical insights into the difference between multi-task and federated +optimization, we propose the Hyper Conflict-Averse Aggregation scheme to +mitigate conflicts during encoder updates. Additionally, inspired by task +interaction in MTL, the Hyper Cross Attention Aggregation scheme uses +layer-wise cross attention to enhance decoder interactions while alleviating +model incongruity. Moreover, we employ learnable Hyper Aggregation Weights for +each client to customize personalized parameter updates. Extensive experiments +demonstrate the superior performance of FedHCA$^2$ in various HC-FMTL scenarios +compared to representative methods. Our code will be made publicly available. + +
+
+
+
+
+ + ☆ TSegFormer: 3D Tooth Segmentation in Intraoral Scans with Geometry + Guided Transformer MICCAI 2023 + + +
+ Optical Intraoral Scanners (IOS) are widely used in digital dentistry to +provide detailed 3D information of dental crowns and the gingiva. Accurate 3D +tooth segmentation in IOSs is critical for various dental applications, while +previous methods are error-prone at complicated boundaries and exhibit +unsatisfactory results across patients. In this paper, we propose TSegFormer +which captures both local and global dependencies among different teeth and the +gingiva in the IOS point clouds with a multi-task 3D transformer architecture. +Moreover, we design a geometry-guided loss based on a novel point curvature to +refine boundaries in an end-to-end manner, avoiding time-consuming +post-processing to reach clinically applicable segmentation. In addition, we +create a dataset with 16,000 IOSs, the largest ever IOS dataset to the best of +our knowledge. The experimental results demonstrate that our TSegFormer +consistently surpasses existing state-of-the-art baselines. The superiority of +TSegFormer is corroborated by extensive analysis, visualizations and real-world +clinical applicability tests. Our code is available at +https://github.com/huiminxiong/TSegFormer. + +
+
+ comment: MICCAI 2023, STAR(Student Travel) award. 11 pages, 3 figures, 5 + tables. arXiv admin note: text overlap with arXiv:2210.16627 +
+
+
+
+
+ + ☆ Using Human Feedback to Fine-tune Diffusion Models without Any Reward + Model + + +
+ Using reinforcement learning with human feedback (RLHF) has shown significant +promise in fine-tuning diffusion models. Previous methods start by training a +reward model that aligns with human preferences, then leverage RL techniques to +fine-tune the underlying models. However, crafting an efficient reward model +demands extensive datasets, optimal architecture, and manual hyperparameter +tuning, making the process both time and cost-intensive. The direct preference +optimization (DPO) method, effective in fine-tuning large language models, +eliminates the necessity for a reward model. However, the extensive GPU memory +requirement of the diffusion model's denoising process hinders the direct +application of the DPO method. To address this issue, we introduce the Direct +Preference for Denoising Diffusion Policy Optimization (D3PO) method to +directly fine-tune diffusion models. The theoretical analysis demonstrates that +although D3PO omits training a reward model, it effectively functions as the +optimal reward model trained using human feedback data to guide the learning +process. This approach requires no training of a reward model, proving to be +more direct, cost-effective, and minimizing computational overhead. In +experiments, our method uses the relative scale of objectives as a proxy for +human preference, delivering comparable results to methods using ground-truth +rewards. Moreover, D3PO demonstrates the ability to reduce image distortion +rates and generate safer images, overcoming challenges lacking robust reward +models. + +
+
+
+
+
+ + ☆ Towards Detecting, Recognizing, and Parsing the Address Information from + Bangla Signboard: A Deep Learning-based Approach + + +
+ Retrieving textual information from natural scene images is an active +research area in the field of computer vision with numerous practical +applications. Detecting text regions and extracting text from signboards is a +challenging problem due to special characteristics like reflecting lights, +uneven illumination, or shadows found in real-life natural scene images. With +the advent of deep learning-based methods, different sophisticated techniques +have been proposed for text detection and text recognition from the natural +scene. Though a significant amount of effort has been devoted to extracting +natural scene text for resourceful languages like English, little has been done +for low-resource languages like Bangla. In this research work, we have proposed +an end-to-end system with deep learning-based models for efficiently detecting, +recognizing, correcting, and parsing address information from Bangla +signboards. We have created manually annotated datasets and synthetic datasets +to train signboard detection, address text detection, address text recognition, +address text correction, and address text parser models. We have conducted a +comparative study among different CTC-based and Encoder-Decoder model +architectures for Bangla address text recognition. Moreover, we have designed a +novel address text correction model using a sequence-to-sequence +transformer-based network to improve the performance of Bangla address text +recognition model by post-correction. Finally, we have developed a Bangla +address text parser using the state-of-the-art transformer-based pre-trained +language model. + +
+
+
+
+
+ + ☆ Test-time Adaptive Vision-and-Language Navigation + + +
+ Vision-and-Language Navigation (VLN) has witnessed significant advancements +in recent years, largely attributed to meticulously curated datasets and +proficiently trained models. Nevertheless, when tested in diverse environments, +the trained models inevitably encounter significant shifts in data +distribution, highlighting that relying solely on pre-trained and fixed +navigation models is insufficient. To enhance models' generalization ability, +test-time adaptation (TTA) demonstrates significant potential in the computer +vision field by leveraging unlabeled test samples for model updates. However, +simply applying existing TTA methods to the VLN task cannot well handle the +adaptability-stability dilemma of VLN models, i.e., frequent updates can result +in drastic changes in model parameters, while occasional updates can make the +models ill-equipped to handle dynamically changing environments. Therefore, we +propose a Fast-Slow Test-Time Adaptation (FSTTA) approach for VLN by performing +decomposition-accumulation analysis for both gradients and parameters in a +unified framework. Specifically, in the fast update phase, gradients generated +during the recent multi-step navigation process are decomposed into components +with varying levels of consistency. Then, these components are adaptively +accumulated to pinpoint a concordant direction for fast model adaptation. In +the slow update phase, historically recorded parameters are gathered, and a +similar decomposition-accumulation analysis is conducted to revert the model to +a stable state. Extensive experiments show that our method obtains impressive +performance gains on four popular benchmarks. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Self-guided Few-shot Semantic Segmentation for Remote Sensing Imagery + Based on Large Vision Models + + +
+ The Segment Anything Model (SAM) exhibits remarkable versatility and +zero-shot learning abilities, owing largely to its extensive training data +(SA-1B). Recognizing SAM's dependency on manual guidance given its +category-agnostic nature, we identified unexplored potential within few-shot +semantic segmentation tasks for remote sensing imagery. This research +introduces a structured framework designed for the automation of few-shot +semantic segmentation. It utilizes the SAM model and facilitates a more +efficient generation of semantically discernible segmentation outcomes. Central +to our methodology is a novel automatic prompt learning approach, leveraging +prior guided masks to produce coarse pixel-wise prompts for SAM. Extensive +experiments on the DLRSD datasets underline the superiority of our approach, +outperforming other available few-shot methodologies. + +
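A small sketch of deriving prompts from a prior mask for a SAM-style promptable segmenter: positive and negative point prompts are sampled from a coarse prior region. The paper describes coarse pixel-wise prompts from prior-guided masks, so this point-based variant is only an illustrative stand-in, and how the prior mask is obtained is outside the sketch.

```python
import numpy as np

def prompts_from_prior(prior_mask, n_pos=5, n_neg=5, seed=0):
    """Sample point prompts (x, y) with labels {1: fg, 0: bg} from a prior mask."""
    rng = np.random.default_rng(seed)
    fg = np.argwhere(prior_mask)                   # (N, 2) as (row, col)
    bg = np.argwhere(~prior_mask)
    pos = fg[rng.choice(len(fg), size=min(n_pos, len(fg)), replace=False)]
    neg = bg[rng.choice(len(bg), size=min(n_neg, len(bg)), replace=False)]
    points = np.concatenate([pos, neg])[:, ::-1]   # (row, col) -> (x, y)
    labels = np.array([1] * len(pos) + [0] * len(neg))
    return points, labels

prior = np.zeros((128, 128), dtype=bool)
prior[40:80, 50:100] = True                        # toy prior region
pts, lbl = prompts_from_prior(prior)
print(pts.shape, lbl)
```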
+
+
+
+
+ + ☆ DRIFu: Differentiable Rendering and Implicit Function-based Single-View + 3D Reconstruction + + +
+ The Differentiable Rendering and Implicit Function-based model (DRIFu) draws +its roots from the Pixel-aligned Implicit Function (PIFU), a pioneering 3D +digitization technique initially designed for clothed human bodies. PIFU excels +in capturing nuanced body shape variations within a low-dimensional space and +has been extensively trained on human 3D scans. However, the application of +PIFU to live animals poses significant challenges, primarily due to the +inherent difficulty in obtaining the cooperation of animals for 3D scanning. In +response to this challenge, we introduce the DRIFu model, specifically tailored +for animal digitization. To train DRIFu, we employ a curated set of synthetic +3D animal models, encompassing diverse shapes, sizes, and even accounting for +variations such as baby birds. Our innovative alignment tools play a pivotal +role in mapping these diverse synthetic animal models onto a unified template, +facilitating precise predictions of animal shape and texture. Crucially, our +template alignment strategy establishes a shared shape space, allowing for the +seamless sampling of new animal shapes, posing them realistically, animating +them, and aligning them with real-world data. This groundbreaking approach +revolutionizes our capacity to comprehensively understand and represent avian +forms. For further details and access to the project, the project website can +be found at https://github.com/kuangzijian/drifu-for-animals + +
+
+ comment: arXiv admin note: text overlap with arXiv:1905.05172 by other authors +
+
+
+
+
+ + ☆ DoubleAUG: Single-domain Generalized Object Detector in Urban via Color + Perturbation and Dual-style Memory + + +
+ Object detection in urban scenarios is crucial for autonomous driving in +intelligent traffic systems. However, unlike conventional object detection +tasks, urban-scene images vary greatly in style. For example, images taken on +sunny days differ significantly from those taken on rainy days. Therefore, +models trained on sunny day images may not generalize well to rainy day images. +In this paper, we aim to solve the single-domain generalizable object detection +task in urban scenarios, meaning that a model trained on images from one +weather condition should be able to perform well on images from any other +weather conditions. To address this challenge, we propose a novel Double +AUGmentation (DoubleAUG) method that includes image- and feature-level +augmentation schemes. In the image-level augmentation, we consider the +variation in color information across different weather conditions and propose +a Color Perturbation (CP) method that randomly exchanges the RGB channels to +generate various images. In the feature-level augmentation, we propose to +utilize a Dual-Style Memory (DSM) to explore the diverse style information on +the entire dataset, further enhancing the model's generalization capability. +Extensive experiments demonstrate that our proposed method outperforms +state-of-the-art methods. Furthermore, ablation studies confirm the +effectiveness of each module in our proposed method. Moreover, our method is +plug-and-play and can be integrated into existing methods to further improve +model performance. + +
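The Color Perturbation augmentation described above is simple enough to sketch directly: randomly permute the RGB channels of a training image (the exact sampling schedule in the paper may differ).

```python
import numpy as np

def color_perturbation(image, rng=None):
    """Randomly exchange the RGB channels of an (H, W, 3) image."""
    rng = np.random.default_rng() if rng is None else rng
    return image[..., rng.permutation(3)]

img = np.random.randint(0, 256, size=(32, 32, 3), dtype=np.uint8)
print(color_perturbation(img, np.random.default_rng(0)).shape)  # (32, 32, 3)
```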
+
+ comment: Accepted by ACM Transactions on Multimedia Computing, Communications, + and Applications +
+
+
+
+
+ + ☆ Towards Improving Document Understanding: An Exploration on + Text-Grounding via MLLMs + + +
+ In the field of document understanding, significant advances have been made +in the fine-tuning of Multimodal Large Language Models (MLLMs) with +instruction-following data. Nevertheless, the potential of text-grounding +capability within text-rich scenarios remains underexplored. In this paper, we +present a text-grounding document understanding model, termed TGDoc, which +addresses this deficiency by enhancing MLLMs with the ability to discern the +spatial positioning of text within images. Empirical evidence suggests that +text-grounding improves the model's interpretation of textual content, thereby +elevating its proficiency in comprehending text-rich images. Specifically, we +compile a dataset containing 99K PowerPoint presentations sourced from the +internet. We formulate instruction tuning tasks including text detection, +recognition, and spotting to facilitate the cohesive alignment between the +visual encoder and large language model. Moreover, we curate a collection of +text-rich images and prompt the text-only GPT-4 to generate 12K high-quality +conversations, featuring textual locations within text-rich scenarios. By +integrating text location data into the instructions, TGDoc is adept at +discerning text locations during the visual question process. Extensive +experiments demonstrate that our method achieves state-of-the-art performance +across multiple text-rich benchmarks, validating the effectiveness of our +method. + +
+
+
+
+
+ + ☆ NeISF: Neural Incident Stokes Field for Geometry and Material Estimation + + +
+ Multi-view inverse rendering is the problem of estimating the scene +parameters such as shapes, materials, or illuminations from a sequence of +images captured under different viewpoints. Many approaches, however, assume +single light bounce and thus fail to recover challenging scenarios like +inter-reflections. On the other hand, simply extending those methods to +consider multi-bounced light requires more assumptions to alleviate the +ambiguity. To address this problem, we propose Neural Incident Stokes Fields +(NeISF), a multi-view inverse rendering framework that reduces ambiguities +using polarization cues. The primary motivation for using polarization cues is +that it is the accumulation of multi-bounced light, providing rich information +about geometry and material. Based on this knowledge, the proposed incident +Stokes field efficiently models the accumulated polarization effect with the +aid of an original physically-based differentiable polarimetric renderer. +Lastly, experimental results show that our method outperforms the existing +works in synthetic and real scenarios. + +
+
+
+
+
+ + ☆ Applications of Spiking Neural Networks in Visual Place Recognition + + +
+ In robotics, Spiking Neural Networks (SNNs) are increasingly recognized for +their largely-unrealized potential energy efficiency and low latency +particularly when implemented on neuromorphic hardware. Our paper highlights +three advancements for SNNs in Visual Place Recognition (VPR). First, we +propose Modular SNNs, where each SNN represents a set of non-overlapping +geographically distinct places, enabling scalable networks for large +environments. Secondly, we present Ensembles of Modular SNNs, where multiple +networks represent the same place, significantly enhancing accuracy compared to +single-network models. Our SNNs are compact and small, comprising only 1500 +neurons and 474k synapses, which makes them ideally suited for ensembling due +to this small size. Lastly, we investigate the role of sequence matching in +SNN-based VPR, a technique where consecutive images are used to refine place +recognition. We analyze the responsiveness of SNNs to ensembling and sequence +matching compared to other VPR techniques. Our contributions highlight the +viability of SNNs for VPR, offering scalable and robust solutions, paving the +way for their application in various energy-sensitive robotic tasks. + +
+
+ comment: 17 pages, 8 figures, under review +
+
+
+
+
+ + ☆ Differentiable Radio Frequency Ray Tracing for Millimeter-Wave Sensing + + +
+ Millimeter wave (mmWave) sensing is an emerging technology with applications +in 3D object characterization and environment mapping. However, realizing +precise 3D reconstruction from sparse mmWave signals remains challenging. +Existing methods rely on data-driven learning, constrained by dataset +availability and difficulty in generalization. We propose DiffSBR, a +differentiable framework for mmWave-based 3D reconstruction. DiffSBR +incorporates a differentiable ray tracing engine to simulate radar point clouds +from virtual 3D models. A gradient-based optimizer refines the model parameters +to minimize the discrepancy between simulated and real point clouds. +Experiments using various radar hardware validate DiffSBR's capability for +fine-grained 3D reconstruction, even for novel objects unseen by the radar +previously. By integrating physics-based simulation with gradient optimization, +DiffSBR transcends the limitations of data-driven approaches and pioneers a new +paradigm for mmWave sensing. + +
+
+
+
+
+ + ☆ Volumetric Reconstruction Resolves Off-Resonance Artifacts in Static and + Dynamic PROPELLER MRI + + +
+ Off-resonance artifacts in magnetic resonance imaging (MRI) are visual +distortions that occur when the actual resonant frequencies of spins within the +imaging volume differ from the expected frequencies used to encode spatial +information. These discrepancies can be caused by a variety of factors, +including magnetic field inhomogeneities, chemical shifts, or susceptibility +differences within the tissues. Such artifacts can manifest as blurring, +ghosting, or misregistration of the reconstructed image, and they often +compromise its diagnostic quality. We propose to resolve these artifacts by +lifting the 2D MRI reconstruction problem to 3D, introducing an additional +"spectral" dimension to model this off-resonance. Our approach is inspired by +recent progress in modeling radiance fields, and is capable of reconstructing +both static and dynamic MR images as well as separating fat and water, which is +of independent clinical interest. We demonstrate our approach in the context of +PROPELLER (Periodically Rotated Overlapping ParallEL Lines with Enhanced +Reconstruction) MRI acquisitions, which are popular for their robustness to +motion artifacts. Our method operates in a few minutes on a single GPU, and to +our knowledge is the first to correct for chemical shift in gradient echo +PROPELLER MRI reconstruction without additional measurements or pretraining +data. + +
+
+ comment: Code is available at + https://github.com/sarafridov/volumetric-propeller +
+
+
+
+
+ + ☆ Learning to Complement with Multiple Humans (LECOMH): Integrating + Multi-rater and Noisy-Label Learning into Human-AI Collaboration + + +
+ The advent of learning with noisy labels (LNL), multi-rater learning, and +human-AI collaboration has revolutionised the development of robust +classifiers, enabling them to address the challenges posed by different types +of data imperfections and complex decision processes commonly encountered in +real-world applications. While each of these methodologies has individually +made significant strides in addressing their unique challenges, the development +of techniques that can simultaneously tackle these three problems remains +underexplored. This paper addresses this research gap by integrating +noisy-label learning, multi-rater learning, and human-AI collaboration with new +benchmarks and the innovative Learning to Complement with Multiple Humans +(LECOMH) approach. LECOMH optimises the level of human collaboration during +testing, aiming to optimise classification accuracy while minimising +collaboration costs that vary from 0 to M, where M is the maximum number of +human collaborators. We quantitatively compare LECOMH with leading human-AI +collaboration methods using our proposed benchmarks. LECOMH consistently +outperforms the competition, with accuracy improving as collaboration costs +increase. Notably, LECOMH is the only method enhancing human labeller +performance across all benchmarks. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ 3D Face Style Transfer with a Hybrid Solution of NeRF and Mesh + Rasterization + + +
+ Style transfer for human face has been widely researched in recent years. +Majority of the existing approaches work in 2D image domain and have 3D +inconsistency issue when applied on different viewpoints of the same face. In +this paper, we tackle the problem of 3D face style transfer which aims at +generating stylized novel views of a 3D human face with multi-view consistency. +We propose to use a neural radiance field (NeRF) to represent 3D human face and +combine it with 2D style transfer to stylize the 3D face. We find that directly +training a NeRF on stylized images from 2D style transfer brings in 3D +inconsistency issue and causes blurriness. On the other hand, training a NeRF +jointly with 2D style transfer objectives shows poor convergence due to the +identity and head pose gap between style image and content image. It also poses +challenge in training time and memory due to the need of volume rendering for +full image to apply style transfer loss functions. We therefore propose a +hybrid framework of NeRF and mesh rasterization to combine the benefits of high +fidelity geometry reconstruction of NeRF and fast rendering speed of mesh. Our +framework consists of three stages: 1. Training a NeRF model on input face +images to learn the 3D geometry; 2. Extracting a mesh from the trained NeRF +model and optimizing it with style transfer objectives via differentiable +rasterization; 3. Training a new color network in NeRF conditioned on a style +embedding to enable arbitrary style transfer to the 3D face. Experiment results +show that our approach generates high quality face style transfer with great 3D +consistency, while also enabling a flexible style control. + +
+
+
+
+
+ + ☆ Test-Time Augmentation for 3D Point Cloud Classification and + Segmentation 3DV 2024 + + +
+ Data augmentation is a powerful technique to enhance the performance of a +deep learning task but has received less attention in 3D deep learning. It is +well known that when 3D shapes are sparsely represented with low point density, +the performance of the downstream tasks drops significantly. This work explores +test-time augmentation (TTA) for 3D point clouds. We are inspired by the recent +revolution of learning implicit representation and point cloud upsampling, +which can produce high-quality 3D surface reconstruction and +proximity-to-surface, respectively. Our idea is to leverage the implicit field +reconstruction or point cloud upsampling techniques as a systematic way to +augment point cloud data. Mainly, we test both strategies by sampling points +from the reconstructed results and using the sampled point cloud as test-time +augmented data. We show that both strategies are effective in improving +accuracy. We observed that point cloud upsampling for test-time augmentation +can lead to more significant performance improvement on downstream tasks such +as object classification and segmentation on the ModelNet40, ShapeNet, +ScanObjectNN, and SemanticKITTI datasets, especially for sparse point clouds. + +
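A compact sketch of the test-time-augmentation loop: the input cloud is densified (standing in for implicit reconstruction or upsampling), several random subsamples are classified, and the logits are averaged. The classifier and the densifier below are toy stand-ins, not the models used in the paper.

```python
import numpy as np

def tta_classify(points, classify, densify, n_views=8, n_points=1024, seed=0):
    """Average predictions over random subsamples of a densified point cloud.

    classify(pc) -> (C,) logits and densify(pc) -> larger point set are
    placeholders for a trained classifier and an upsampling/reconstruction model.
    """
    rng = np.random.default_rng(seed)
    dense = densify(points)
    logits = []
    for _ in range(n_views):
        idx = rng.choice(len(dense), size=min(n_points, len(dense)), replace=False)
        logits.append(classify(dense[idx]))
    return np.mean(logits, axis=0)

# Toy stand-ins: jittered duplication as "upsampling", a random linear classifier.
densify = lambda pc: np.concatenate([pc, pc + 0.01 * np.random.randn(*pc.shape)])
W = np.random.randn(3, 10)
classify = lambda pc: pc.mean(axis=0) @ W
print(tta_classify(np.random.randn(500, 3), classify, densify).shape)  # (10,)
```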
+
+ comment: This paper is accepted in 3DV 2024 +
+
+
+
+
+ + ☆ Single Image Compressed Sensing MRI via a Self-Supervised Deep Denoising + Approach + + +
+ Popular methods in compressed sensing (CS) are dependent on deep learning +(DL), where large amounts of data are used to train non-linear reconstruction +models. However, ensuring generalisability over and access to multiple datasets +is challenging to realise for real-world applications. To address these +concerns, this paper proposes a single image, self-supervised (SS) CS-MRI +framework that enables a joint deep and sparse regularisation of CS artefacts. +The approach effectively dampens structured CS artefacts, which can be +difficult to remove assuming sparse reconstruction, or relying solely on the +inductive biases of CNN to produce noise-free images. Image quality is thereby +improved compared to either approach alone. Metrics are evaluated using +Cartesian 1D masks on a brain and knee dataset, with PSNR improving by 2-4dB on +average. + +
+
+ comment: 5 pages, 4 figures, 2 tables, conference +
+
+
+
+
+ + ☆ Diffusion360: Seamless 360 Degree Panoramic Image Generation based on + Diffusion Models + + +
+ This is a technical report on the 360-degree panoramic image generation task +based on diffusion models. Unlike ordinary 2D images, 360-degree panoramic +images capture the entire $360^\circ\times 180^\circ$ field of view. The +rightmost and leftmost sides of the 360 panoramic image should therefore be +continuous, which is the main challenge in this field. However, the current +diffusion pipeline is not appropriate for generating such a seamless 360-degree +panoramic image. To this end, we propose a circular blending strategy applied at both +the denoising and VAE decoding stages to maintain geometric continuity. +Based on this, we present two models for \textbf{Text-to-360-panoramas} and +\textbf{Single-Image-to-360-panoramas} tasks. The code has been released as an +open-source project at +\href{https://github.com/ArcherFMY/SD-T2I-360PanoImage}{https://github.com/ArcherFMY/SD-T2I-360PanoImage} +and +\href{https://www.modelscope.cn/models/damo/cv_diffusion_text-to-360panorama-image_generation/summary}{ModelScope} + +
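A toy illustration of the circular-blending idea on a plain image array: an overlap region is cross-faded so the panorama wraps without a visible seam. The released pipeline applies the blending to latents during denoising and to VAE decoding rather than to a final image, and the overlap width here is an arbitrary choice.

```python
import numpy as np

def make_seamless(pano, overlap=32):
    """Cross-fade an overlap region so the panorama is continuous at the wrap.

    The last `overlap` columns are blended into the first `overlap` columns and
    then dropped, so the returned image (width W - overlap) tiles horizontally.
    """
    x = pano.astype(np.float32)
    H, W = x.shape[:2]
    a = np.linspace(0.0, 1.0, overlap)[None, :, None]     # 0 -> right strip, 1 -> left strip
    seam = (1 - a) * x[:, W - overlap:] + a * x[:, :overlap]
    return np.concatenate([seam, x[:, overlap:W - overlap]], axis=1)

pano = np.random.rand(64, 256, 3)
out = make_seamless(pano)
print(out.shape)                                           # (64, 224, 3)
print(float(np.abs(out[:, 0] - pano[:, -32]).max()))       # ~0: edges now meet smoothly
```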
+
+ comment: 2 pages, 8 figures, Tech. Report +
+
+
+
+
+ + ☆ Lightweight High-Speed Photography Built on Coded Exposure and Implicit + Neural Representation of Videos + + +
+ Compact cameras that record high-speed scenes at high resolution are in high +demand, but the required high bandwidth often leads to bulky, heavy +systems, which limits their applications on low-capacity platforms. Adopting a +coded exposure setup to encode a frame sequence into a blurry snapshot and +retrieve the latent sharp video afterward can serve as a lightweight solution. +However, restoring motion from blur is quite challenging due to the high +ill-posedness of motion blur decomposition, the intrinsic ambiguity in motion +direction, and the diverse motions in natural videos. In this work, by leveraging the +classical coded exposure imaging technique and emerging implicit neural +representations for videos, we embed motion direction cues into +the blurry image during the imaging process and develop a novel self-recursive +neural network to sequentially retrieve the latent video sequence from the +blurry image utilizing the embedded motion direction cues. To validate the +effectiveness and efficiency of the proposed framework, we conduct extensive +experiments on benchmark datasets and real-captured blurry images. The results +demonstrate that our proposed framework significantly outperforms existing +methods in quality and flexibility. The code for our work is available at +https://github.com/zhihongz/BDINR + +
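A short sketch of the coded-exposure measurement model that the method builds on: a sharp frame sequence is collapsed into one snapshot gated by a binary shutter code. The normalization and the toy code below are assumptions; the learned reconstruction network is not part of this sketch.

```python
import numpy as np

def coded_exposure_snapshot(frames, code):
    """Collapse (T, H, W) sharp frames into one coded-exposure measurement.

    code: (T,) binary shutter pattern (1 = shutter open). The result is the
    code-weighted average of the open frames.
    """
    code = np.asarray(code, dtype=np.float32)
    assert code.sum() > 0, "the shutter must open at least once"
    return np.tensordot(code, frames.astype(np.float32), axes=1) / code.sum()

rng = np.random.default_rng(0)
frames = rng.random((16, 64, 64))                   # toy high-speed sequence
code = rng.integers(0, 2, size=16)                  # toy binary exposure code
print(coded_exposure_snapshot(frames, code).shape)  # (64, 64)
```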
+
+ comment: 19 pages, 10 figures +
+
+
+
+
+ + ☆ P2RBox: A Single Point is All You Need for Oriented Object Detection + + +
+ Oriented object detection, a specialized subfield in computer vision, finds +applications across diverse scenarios, excelling particularly when dealing with +objects of arbitrary orientations. In contrast, point annotation, which treats +objects as single points, offers a cost-effective alternative to rotated and +horizontal bounding boxes but sacrifices performance due to the loss of size +and orientation information. In this study, we introduce the P2RBox network, +which leverages point annotations and a mask generator to create mask +proposals, followed by filtration through our Inspector Module and Constrainer +Module. This process selects high-quality masks, which are subsequently +converted into rotated box annotations for training a fully supervised +detector. Specifically, we have crafted an Inspector Module rooted in +multi-instance learning principles to evaluate the semantic score of masks. We +have also proposed a more robust mask quality assessment in conjunction with +the Constrainer Module. Furthermore, we have introduced a Symmetry Axis +Estimation (SAE) Module inspired by the spectral theorem for symmetric matrices +to transform the top-performing mask proposal into rotated bounding boxes. +P2RBox performs well with three fully supervised rotated object detectors: +RetinaNet, Rotated FCOS, and Oriented R-CNN. Combined with Oriented R-CNN, +P2RBox achieves 62.26% on the DOTA-v1.0 test set. As far as we know, this is +the first attempt at training an oriented object detector with point +supervision. +
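+ The symmetry-axis idea mentioned above rests on a standard piece of linear
+ algebra: the covariance matrix of a mask's foreground pixels is symmetric, so
+ by the spectral theorem it has an orthogonal eigenbasis whose leading
+ eigenvector gives a dominant axis. The sketch below shows only that generic
+ computation, not the paper's SAE Module; the toy mask is made up.
+
+     import numpy as np
+
+     def principal_axis(mask: np.ndarray) -> float:
+         """Angle (radians) of the dominant axis of a binary mask, taken from the
+         eigenvector of the largest eigenvalue of the pixel covariance matrix."""
+         ys, xs = np.nonzero(mask)
+         pts = np.stack([xs, ys], axis=1).astype(np.float64)
+         pts -= pts.mean(axis=0)
+         cov = pts.T @ pts / max(len(pts) - 1, 1)
+         eigvals, eigvecs = np.linalg.eigh(cov)   # eigenvalues in ascending order
+         major = eigvecs[:, -1]                   # eigenvector of the largest one
+         return float(np.arctan2(major[1], major[0]))
+
+     mask = np.zeros((32, 32), dtype=bool)
+     mask[14:18, 4:28] = True                     # elongated along x
+     print(principal_axis(mask))                  # close to 0 radians
+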
+
+
+
+
+ + ☆ Toward Robust Imperceptible Perturbation against Unauthorized + Text-to-image Diffusion-based Synthesis + + +
+ Text-to-image diffusion models allow seamless generation of personalized +images from scant reference photos. Yet, these tools, in the wrong hands, can +fabricate misleading or harmful content, endangering individuals. To address +this problem, existing poisoning-based approaches perturb user images in an +imperceptible way to render them "unlearnable" for malicious uses. We identify +two limitations of these defensive approaches: i) they are sub-optimal due to +the hand-crafted heuristics used to solve the intractable bilevel optimization, +and ii) they lack robustness against simple data transformations like Gaussian +filtering. To solve these challenges, we propose MetaCloak, which solves the +bi-level poisoning problem with a meta-learning framework and an additional +transformation sampling process to craft transferable and robust perturbations. +Specifically, we employ a pool of surrogate diffusion models to craft +transferable and model-agnostic perturbations. Furthermore, by incorporating an +additional transformation process, we design a simple denoising-error +maximization loss that is sufficient for causing transformation-robust semantic +distortion and degradation in personalized generation. Extensive experiments +on the VGGFace2 and CelebA-HQ datasets show that MetaCloak outperforms existing +approaches. Notably, MetaCloak can successfully fool online training services +like Replicate in a black-box manner, demonstrating the effectiveness of +MetaCloak in real-world scenarios. Our code is available at +https://github.com/liuyixin-louis/MetaCloak. +
+
+ comment: 26 pages, 15 figures, 8 tables +
+
+
+
+
+ + ☆ DAE-Net: Deforming Auto-Encoder for fine-grained shape co-segmentation + + +
+ We present an unsupervised 3D shape co-segmentation method which learns a set +of deformable part templates from a shape collection. To accommodate structural +variations in the collection, our network composes each shape from a selected +subset of template parts, each of which is affine-transformed. To maximize the +expressive power of the part templates, we introduce a per-part deformation +network to enable the modeling of diverse parts with substantial geometry +variations, while imposing constraints on the deformation capacity to ensure +fidelity to the originally represented parts. We also propose a training scheme +to effectively overcome local minima. Architecturally, our network is a +branched autoencoder, with a CNN encoder taking a voxel shape as input and +producing per-part transformation matrices, latent codes, and part existence +scores, and the decoder outputting point occupancies to define the +reconstruction loss. Our network, coined DAE-Net for Deforming Auto-Encoder, +can achieve unsupervised 3D shape co-segmentation that yields fine-grained, +compact, and meaningful parts that are consistent across diverse shapes. We +conduct extensive experiments on the ShapeNet Part dataset, DFAUST, and an +animal subset of Objaverse to show superior performance over prior methods. +
+
+ comment: Code: https://github.com/czq142857/DAE-Net +
+
+
+
+
+ + ☆ Multi-modal In-Context Learning Makes an Ego-evolving Scene Text + Recognizer + + +
+ Scene text recognition (STR) in the wild frequently encounters challenges +when coping with domain variations, font diversity, shape deformations, etc. A +straightforward solution is performing model fine-tuning tailored to a specific +scenario, but it is computationally intensive and requires multiple model +copies for various scenarios. Recent studies indicate that large language +models (LLMs) can learn from a few demonstration examples in a training-free +manner, termed "In-Context Learning" (ICL). Nevertheless, applying LLMs as a +text recognizer is unacceptably resource-consuming. Moreover, our pilot +experiments on LLMs show that ICL fails in STR, mainly attributed to the +insufficient incorporation of contextual information from diverse samples in +the training stage. To this end, we introduce E$^2$STR, a STR model trained +with context-rich scene text sequences, where the sequences are generated via +our proposed in-context training strategy. E$^2$STR demonstrates that a +regular-sized model is sufficient to achieve effective ICL capabilities in STR. +Extensive experiments show that E$^2$STR exhibits remarkable training-free +adaptation in various scenarios and outperforms even the fine-tuned +state-of-the-art approaches on public benchmarks. + +
+
+
+
+
+ + ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
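+ The sparse rate reduction objective mentioned above builds on a lossy coding
+ rate for a set of token features. As a rough, hedged illustration (not the
+ CRATE code), the NumPy sketch below evaluates the usual coding-rate quantity
+ R(Z) = 1/2 * logdet(I + d/(n*eps^2) * Z Z^T); the dimensions and the epsilon
+ value are arbitrary choices of the sketch.
+
+     import numpy as np
+
+     def coding_rate(Z: np.ndarray, eps: float = 0.5) -> float:
+         """Lossy coding rate of a token matrix Z with shape (d, n); a more
+         compressed (lower-dimensional) representation has a lower rate."""
+         d, n = Z.shape
+         gram = Z @ Z.T
+         return 0.5 * np.linalg.slogdet(np.eye(d) + (d / (n * eps**2)) * gram)[1]
+
+     rng = np.random.default_rng(0)
+     tokens = rng.standard_normal((64, 256))                       # full-rank features
+     low_rank = rng.standard_normal((64, 4)) @ rng.standard_normal((4, 256))
+     print(coding_rate(tokens), coding_rate(low_rank))  # the low-rank set is cheaper
+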
+
+ comment: This paper integrates the works arXiv:2306.01129 and + arXiv:2308.16271, as well as this under-review work: + https://openreview.net/forum?id=PvyOYleymy into a complete story. In this + paper, we improve the writing and organization, and also add conceptual, + empirical, and theoretical improvements over the previous work +
+
+
+
+
+ + ☆ Automated Measurement of Pericoronary Adipose Tissue Attenuation and + Volume in CT Angiography + + +
+ Pericoronary adipose tissue (PCAT) is the deposition of fat in the vicinity +of the coronary arteries. It is an indicator of coronary inflammation and is +associated with coronary artery disease. Non-invasive coronary CT angiography +(CCTA) is presently used to obtain measures of the thickness, volume, and +attenuation of fat deposition. However, prior works solely focus on measuring +PCAT using semi-automated approaches at the right coronary artery (RCA) rather +than the left coronary artery (LCA). In this pilot work, we developed a fully +automated approach for the measurement of PCAT mean attenuation and volume in +the region around both coronary arteries. First, we used a large subset of +patients from the public ImageCAS dataset (n = 735) to train a 3D full +resolution nnUNet to segment the LCA and RCA. Then, we automatically measured +PCAT in the surrounding arterial regions. We evaluated our method on a held-out +test set of patients (n = 183) from the same dataset. A mean Dice score of 83% +and a PCAT attenuation of -73.81 $\pm$ 12.69 HU were calculated for the RCA, +while a mean Dice score of 81% and a PCAT attenuation of -77.51 $\pm$ 7.94 HU +were computed for the LCA. To the best of our knowledge, we are the first to +develop a fully automated method to measure PCAT attenuation and volume at both +the RCA and LCA. Our work underscores how automated PCAT measurement holds +promise as a biomarker for the identification of inflammation and cardiac +disease. +
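+ The two quantities reported above, segmentation overlap and mean attenuation,
+ are commonly computed as a Dice coefficient and a mean of Hounsfield-unit
+ values inside a mask. The NumPy sketch below shows only these generic
+ formulas on made-up toy volumes; it is not the paper's nnUNet pipeline.
+
+     import numpy as np
+
+     def dice(pred: np.ndarray, gt: np.ndarray) -> float:
+         """Dice coefficient between two binary masks."""
+         inter = np.logical_and(pred, gt).sum()
+         return 2.0 * inter / (pred.sum() + gt.sum() + 1e-8)
+
+     def mean_attenuation(ct_hu: np.ndarray, mask: np.ndarray) -> float:
+         """Mean attenuation (HU) of the voxels inside a mask."""
+         return float(ct_hu[mask].mean())
+
+     rng = np.random.default_rng(0)
+     ct = -80.0 + rng.normal(0.0, 10.0, size=(8, 8, 8))   # toy CT patch in HU
+     gt = np.zeros(ct.shape, dtype=bool); gt[2:6, 2:6, 2:6] = True
+     pred = np.zeros_like(gt); pred[3:6, 2:6, 2:6] = True
+     print(dice(pred, gt), mean_attenuation(ct, pred))
+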
+
+ comment: 5 pages, 4 figures, IEEE ISBI 2024 conference
+
+
+
+
+ + ☆ PIE-NeRF: Physics-based Interactive Elastodynamics with NeRF + + +
+ We show that physics-based simulations can be seamlessly integrated with NeRF +to generate high-quality elastodynamics of real-world objects. Unlike existing +methods, we discretize nonlinear hyperelasticity in a meshless way, obviating +the necessity for intermediate auxiliary shape proxies like a tetrahedral mesh +or voxel grid. A quadratic generalized moving least square (Q-GMLS) is employed +to capture nonlinear dynamics and large deformation on the implicit model. Such +meshless integration enables versatile simulations of complex and codimensional +shapes. We adaptively place the least-square kernels according to the NeRF +density field to significantly reduce the complexity of the nonlinear +simulation. As a result, physically realistic animations can be conveniently +synthesized using our method for a wide range of hyperelastic materials at an +interactive rate. For more information, please visit our project page at +https://fytalon.github.io/pienerf/. + +
+
+
+
+
+ + ☆ Stable Unlearnable Example: Enhancing the Robustness of Unlearnable + Examples via Stable Error-Minimizing Noise + + +
+ The open sourcing of large amounts of image data promotes the development of +deep learning techniques. Along with this comes the privacy risk of these +open-source image datasets being exploited by unauthorized third parties to +train deep learning models for commercial or illegal purposes. To avoid the +abuse of public data, a poisoning-based technique, the unlearnable example, has +been proposed to significantly degrade the generalization performance of models +by adding a kind of imperceptible noise to the data. To further enhance its +robustness against adversarial training, existing works leverage iterative +adversarial training on both the defensive noise and the surrogate model. +However, it remains unknown whether the robustness of unlearnable examples +primarily comes from the enhancement of the surrogate model or of the defensive +noise. Observing that simply removing the adversarial noise from the training +process of the defensive noise can improve the performance of robust +unlearnable examples, we identify that the surrogate model's robustness alone +contributes to this performance. Furthermore, we found that a negative +correlation exists between the robustness of the defensive noise and the +protection performance, indicating an instability issue of the defensive noise. +Motivated by this, to further boost the robust unlearnable example, we +introduce stable error-minimizing noise (SEM), which trains the defensive noise +against random perturbation instead of the time-consuming adversarial +perturbation to improve the stability of the defensive noise. Through extensive +experiments, we demonstrate that SEM achieves a new state-of-the-art +performance on CIFAR-10, CIFAR-100, and ImageNet Subset in terms of both +effectiveness and efficiency. The code is available at +https://github.com/liuyixin-louis/Stable-Unlearnable-Example. +
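+ To make the idea concrete, the PyTorch sketch below optimises an
+ error-minimizing defensive noise while perturbing it with random noise at
+ each step, rather than with a time-consuming adversarial perturbation. The
+ tiny surrogate model, toy data, radii, and step sizes are placeholders, not
+ the paper's settings.
+
+     import torch
+     import torch.nn as nn
+
+     torch.manual_seed(0)
+     x = torch.rand(32, 3, 8, 8)                 # toy batch of images in [0, 1]
+     y = torch.randint(0, 10, (32,))
+     model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 8 * 8, 10))  # surrogate
+     loss_fn = nn.CrossEntropyLoss()
+
+     delta = torch.zeros_like(x, requires_grad=True)   # defensive noise
+     eps, rho, lr = 8 / 255, 4 / 255, 0.01
+     for _ in range(50):
+         noise = torch.empty_like(delta).uniform_(-rho, rho)  # random, not adversarial
+         loss = loss_fn(model(x + delta + noise), y)
+         loss.backward()
+         with torch.no_grad():
+             delta -= lr * delta.grad.sign()   # minimise the training error
+             delta.clamp_(-eps, eps)           # keep the noise imperceptible
+         delta.grad.zero_()
+     print(float(loss))
+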
+
+ comment: 14 pages, 11 figures, 13 tables +
+
+
+
+
+ + ☆ On the Limitation of Diffusion Models for Synthesizing Training Datasets NeurIPS 2023 + + +
+ Synthetic samples from diffusion models are promising for training +discriminative models as replications of real training datasets. However, we +found that the synthetic datasets degrade classification performance relative +to real datasets even when using state-of-the-art diffusion models. This means +that modern diffusion models do not perfectly represent the data distribution +for the purpose of replicating datasets for training discriminative tasks. This +paper investigates the gap between synthetic and real samples by analyzing the +synthetic samples reconstructed from real samples through the diffusion and +reverse processes. By varying the time step at which the reverse process starts +in the reconstruction, we can control the trade-off between the information in +the original real data and the information added by the diffusion models. +Through assessing the reconstructed samples and trained models, we found that +the synthetic data become concentrated in modes of the training data +distribution as the reverse step increases, and thus they struggle to cover the +outer edges of the distribution. Our findings imply that modern diffusion +models are insufficient to replicate the training data distribution perfectly, +and there is room for improving generative modeling in the replication of +training datasets. +
+
+ comment: NeurIPS 2023 SyntheticData4ML Workshop +
+
+
+
+
+ + ☆ FusionFrames: Efficient Architectural Aspects for Text-to-Video + Generation Pipeline + + +
+ Multimedia generation approaches occupy a prominent place in artificial +intelligence research. Text-to-image models have achieved high-quality results +over the last few years, whereas video synthesis methods have only recently +started to develop. This paper presents a new two-stage latent diffusion +text-to-video generation architecture based on a text-to-image diffusion model. +The first stage concerns keyframe synthesis to lay out the storyline of a +video, while the second is devoted to generating interpolation frames to make +the movements of the scene and objects smooth. We compare several temporal +conditioning approaches for keyframe generation. The results show the advantage +of using separate temporal blocks over temporal layers in terms of metrics +reflecting video generation quality aspects and human preference. The design of +our interpolation model significantly reduces computational costs compared to +other masked frame interpolation approaches. Furthermore, we evaluate different +configurations of the MoVQ-based video decoding scheme to improve consistency +and achieve higher PSNR, SSIM, MSE, and LPIPS scores. Finally, we compare our +pipeline with existing solutions and achieve top-2 scores overall and top-1 +among open-source solutions: CLIPSIM = 0.2976 and FVD = 433.054. Project page: +https://ai-forever.github.io/kandinsky-video/ +
+
+ comment: Project page: https://ai-forever.github.io/kandinsky-video/ +
+
+
+
+
+ + ☆ FuseNet: Self-Supervised Dual-Path Network for Medical Image + Segmentation + + +
+ Semantic segmentation, a crucial task in computer vision, often relies on +labor-intensive and costly annotated datasets for training. In response to this +challenge, we introduce FuseNet, a dual-stream framework for self-supervised +semantic segmentation that eliminates the need for manual annotation. FuseNet +leverages the shared semantic dependencies between the original and augmented +images to create a clustering space, effectively assigning pixels to +semantically related clusters, and ultimately generating the segmentation map. +Additionally, FuseNet incorporates a cross-modal fusion technique that extends +the principles of CLIP by replacing textual data with augmented images. This +approach enables the model to learn complex visual representations, enhancing +robustness against variations similar to CLIP's text invariance. To further +improve edge alignment and spatial consistency between neighboring pixels, we +introduce an edge refinement loss. This loss function considers edge +information to enhance spatial coherence, facilitating the grouping of nearby +pixels with similar visual features. Extensive experiments on skin lesion and +lung segmentation datasets demonstrate the effectiveness of our method. +\href{https://github.com/xmindflow/FuseNet}{Codebase.} + +
+
+
+
+
+ + ♻ ☆ Investigating Weight-Perturbed Deep Neural Networks With Application in + Iris Presentation Attack Detection + + +
+ Deep neural networks (DNNs) exhibit superior performance in various machine +learning tasks, e.g., image classification, speech recognition, biometric +recognition, object detection, etc. However, it is essential to analyze their +sensitivity to parameter perturbations before deploying them in real-world +applications. In this work, we assess the sensitivity of DNNs against +perturbations to their weight and bias parameters. The sensitivity analysis +involves three DNN architectures (VGG, ResNet, and DenseNet), three types of +parameter perturbations (Gaussian noise, weight zeroing, and weight scaling), +and two settings (entire network and layer-wise). We perform experiments in the +context of iris presentation attack detection and evaluate on two publicly +available datasets: LivDet-Iris-2017 and LivDet-Iris-2020. Based on the +sensitivity analysis, we propose improved models simply by perturbing +parameters of the network without undergoing training. We further combine these +perturbed models at the score-level and at the parameter-level to improve the +performance over the original model. The ensemble at the parameter-level shows +an average improvement of 43.58% on the LivDet-Iris-2017 dataset and 9.25% on +the LivDet-Iris-2020 dataset. The source code is available at +https://github.com/redwankarimsony/WeightPerturbation-MSU. + +
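+ As a minimal illustration of the kind of parameter perturbation and
+ score-level ensembling studied above, the PyTorch sketch below adds i.i.d.
+ Gaussian noise to every weight of a toy classifier and averages the softmax
+ scores of several perturbed copies. The noise scale, model, and ensemble size
+ are placeholders, not the paper's configuration.
+
+     import copy
+     import torch
+     import torch.nn as nn
+
+     def perturb_weights(model: nn.Module, sigma: float = 0.01) -> nn.Module:
+         """Return a copy of the model with Gaussian noise added to every parameter."""
+         noisy = copy.deepcopy(model)
+         with torch.no_grad():
+             for p in noisy.parameters():
+                 p.add_(sigma * torch.randn_like(p))
+         return noisy
+
+     torch.manual_seed(0)
+     model = nn.Sequential(nn.Flatten(), nn.Linear(3 * 32 * 32, 2))
+     x = torch.rand(4, 3, 32, 32)
+     scores = torch.stack([perturb_weights(model)(x).softmax(-1) for _ in range(5)])
+     ensemble = scores.mean(0)    # score-level fusion of the perturbed models
+     print(ensemble.shape)        # torch.Size([4, 2])
+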
+
+
+
+
+ + ♻ ☆ A Good Feature Extractor Is All You Need for Weakly Supervised Learning + in Histopathology + + +
+ Deep learning is revolutionising pathology, offering novel opportunities in +disease prognosis and personalised treatment. Historically, stain normalisation +has been a crucial preprocessing step in computational pathology pipelines, and +persists into the deep learning era. Yet, with the emergence of feature +extractors trained using self-supervised learning (SSL) on diverse pathology +datasets, we call this practice into question. In an empirical evaluation of +publicly available feature extractors, we find that omitting stain +normalisation and image augmentations does not compromise downstream +performance, while incurring substantial savings in memory and compute. +Further, we show that the top-performing feature extractors are remarkably +robust to variations in stain and augmentations like rotation in their latent +space. Contrary to previous patch-level benchmarking studies, our approach +emphasises clinical relevance by focusing on slide-level prediction tasks in a +weakly supervised setting with external validation cohorts. This work +represents the most comprehensive robustness evaluation of public pathology SSL +feature extractors to date, involving more than 6,000 training runs across nine +tasks, five datasets, three downstream architectures, and various preprocessing +setups. Our findings stand to streamline digital pathology workflows by +minimising preprocessing needs and informing the selection of feature +extractors. + +
+
+
+
+
+ + ♻ ☆ LucidDreamer: Towards High-Fidelity Text-to-3D Generation via Interval + Score Matching + + +
+ The recent advancements in text-to-3D generation mark a significant milestone +in generative models, unlocking new possibilities for creating imaginative 3D +assets across various real-world scenarios. While recent text-to-3D methods +have shown promise, they often fall short in rendering detailed and +high-quality 3D models. This problem is especially prevalent as many methods +base themselves on Score Distillation Sampling (SDS). This paper identifies a +notable deficiency of SDS: it yields inconsistent and low-quality update +directions for the 3D model, causing an over-smoothing effect. To address this, +we propose a novel approach called Interval Score Matching (ISM). ISM employs +deterministic diffusing trajectories and utilizes interval-based score matching +to counteract over-smoothing. Furthermore, we incorporate 3D Gaussian Splatting +into our text-to-3D generation pipeline. Extensive experiments show that our +model largely outperforms the state-of-the-art in quality and training +efficiency. +
+
+ comment: The first two authors contributed equally to this work. Our code will + be available at: https://github.com/EnVision-Research/LucidDreamer +
+
+
+
+
+ + ♻ ☆ Adversarial Backdoor Attack by Naturalistic Data Poisoning on Trajectory + Prediction in Autonomous Driving + + +
+ In autonomous driving, behavior prediction is fundamental for safe motion +planning, hence the security and robustness of prediction models against +adversarial attacks are of paramount importance. We propose a novel adversarial +backdoor attack against trajectory prediction models as a means of studying +their potential vulnerabilities. Our attack affects the victim at training time +via naturalistic, hence stealthy, poisoned samples crafted using a novel +two-step approach. First, the triggers are crafted by perturbing the trajectory +of the attacking vehicle and are then disguised by transforming the scene using +a bi-level optimization technique. The proposed attack does not depend on a +particular model architecture and operates in a black-box manner, and thus can +be effective without any knowledge of the victim model. We conduct extensive +empirical studies using state-of-the-art prediction models on two benchmark +datasets using metrics customized for trajectory prediction. We show that the +proposed attack is highly effective, as it can significantly hinder the +performance of prediction models while remaining unnoticeable to the victims, +and efficient, as it forces the victim to generate malicious behavior even +under constrained conditions. Via ablation studies, we analyze the impact of +different attack design choices, followed by an evaluation of existing defence +mechanisms against the proposed attack. +
+
+
+
+
+ + ♻ ☆ Pelvic floor MRI segmentation based on semi-supervised deep learning + + +
+ The semantic segmentation of pelvic organs via MRI has important clinical +significance. Recently, deep learning-enabled semantic segmentation has +facilitated the three-dimensional geometric reconstruction of pelvic floor +organs, providing clinicians with accurate and intuitive diagnostic results. +However, the task of labeling pelvic floor MRI segmentations, typically +performed by clinicians, is labor-intensive and costly, leading to a scarcity +of labels. Insufficient segmentation labels limit the precise segmentation and +reconstruction of pelvic floor organs. To address these issues, we propose a +semi-supervised framework for pelvic organ segmentation. The implementation of +this framework comprises two stages. In the first stage, it performs +self-supervised pre-training using image restoration tasks. Subsequently, the +self-supervised model is fine-tuned using labeled data to train the +segmentation model. In the second stage, the self-supervised segmentation model +is used to generate pseudo labels for unlabeled data. Ultimately, both labeled +and unlabeled data are utilized in semi-supervised training. Upon evaluation, +our method significantly enhances the performance of semantic segmentation and +geometric reconstruction of pelvic organs; the Dice coefficient increases by +2.65% on average. Especially for organs that are difficult to segment, such as +the uterus, the accuracy of semantic segmentation can be improved by up to +3.70%. +
+
+
+
+
+ + ♻ ☆ Attention-based Adversarial Appearance Learning of Augmented Pedestrians + + +
+ Synthetic data has already become an essential component of machine +learning-based perception in the field of autonomous driving. Yet it still +cannot completely replace real data due to the sim2real domain shift. In this +work, we propose a method that leverages the advantages of the augmentation +process and adversarial training to synthesize realistic data for the +pedestrian recognition task. Our approach utilizes an attention mechanism +driven by an adversarial loss to learn domain discrepancies and improve +sim2real adaptation. Our experiments confirm that the proposed adaptation +method is robust to such discrepancies and yields both visual realism and +semantic consistency. Furthermore, we evaluate our data generation pipeline on +the task of pedestrian recognition and demonstrate that the generated data +resemble properties of the real domain. +
+
+
+
+
+ + ♻ ☆ Leveraging Different Learning Styles for Improved Knowledge Distillation + in Biomedical Imaging + + +
+ Learning style refers to a type of training mechanism adopted by an +individual to gain new knowledge. As suggested by the VARK model, humans have +different learning preferences, like Visual (V), Auditory (A), Read/Write (R), +and Kinesthetic (K), for acquiring and effectively processing information. Our +work endeavors to leverage this concept of knowledge diversification to improve +the performance of model compression techniques like Knowledge Distillation +(KD) and Mutual Learning (ML). Consequently, we use a single-teacher and +two-student network in a unified framework that not only allows for the +transfer of knowledge from teacher to students (KD) but also encourages +collaborative learning between students (ML). Unlike the conventional approach, +where the teacher shares the same knowledge in the form of predictions or +feature representations with the student network, our proposed approach employs +a more diversified strategy by training one student with predictions and the +other with feature maps from the teacher. We further extend this knowledge +diversification by facilitating the exchange of predictions and feature maps +between the two student networks, enriching their learning experiences. We have +conducted comprehensive experiments with three benchmark datasets for both +classification and segmentation tasks using two different network architecture +combinations. These experimental results demonstrate that knowledge +diversification in a combined KD and ML framework outperforms conventional KD +or ML techniques (with similar network configurations) that only use +predictions, with an average improvement of 2%. Furthermore, consistent +improvement in performance across different tasks, with various network +architectures, and over state-of-the-art techniques establishes the robustness +and generalizability of the proposed model. +
+
+ comment: Accepted in Computers in Biology and Medicine +
+
+
+
+
+ + ♻ ☆ Confident Naturalness Explanation (CNE): A Framework to Explain and + Assess Patterns Forming Naturalness + + +
+ Protected natural areas are regions that have been minimally affected by +human activities such as urbanization, agriculture, and other human +interventions. To better understand and map the naturalness of these areas, +machine learning models can be used to analyze satellite imagery. Specifically, +explainable machine learning methods show promise in uncovering patterns that +contribute to the concept of naturalness within these protected environments. +Additionally, addressing the uncertainty inherent in machine learning models is +crucial for a comprehensive understanding of this concept. However, existing +approaches have limitations. They either fail to provide explanations that are +both valid and objective or struggle to offer a quantitative metric that +accurately measures the contribution of specific patterns to naturalness, along +with the associated confidence. In this paper, we propose a novel framework +called the Confident Naturalness Explanation (CNE) framework. This framework +combines explainable machine learning and uncertainty quantification to assess +and explain naturalness. We introduce a new quantitative metric that describes +the confident contribution of patterns to the concept of naturalness. +Furthermore, we generate an uncertainty-aware segmentation mask for each input +sample, highlighting areas where the model lacks knowledge. To demonstrate the +effectiveness of our framework, we apply it to a study site in Fennoscandia +using two open-source satellite datasets. + +
+
+
+
+
+ + ♻ ☆ BEVTrack: A Simple and Strong Baseline for 3D Single Object Tracking in + Bird's-Eye View + + +
+ 3D Single Object Tracking (SOT) is a fundamental task of computer vision, +proving essential for applications like autonomous driving. It remains +challenging to localize the target from surroundings due to appearance +variations, distractors, and the high sparsity of point clouds. The spatial +information indicating objects' spatial adjacency across consecutive frames is +crucial for effective object tracking. However, existing trackers typically +employ point-wise representation with irregular formats, leading to +insufficient use of this important spatial knowledge. As a result, these +trackers usually require elaborate designs and solving multiple subtasks. In +this paper, we propose BEVTrack, a simple yet effective baseline that performs +tracking in Bird's-Eye View (BEV). This representation greatly retains spatial +information owing to its ordered structure and inherently encodes the implicit +motion relations of the target as well as distractors. To achieve accurate +regression for targets with diverse attributes (\textit{e.g.}, sizes and motion +patterns), BEVTrack constructs the likelihood function with the learned +underlying distributions adapted to different targets, rather than making a +fixed Laplace or Gaussian assumption as in previous works. This provides +valuable priors for tracking and thus further boosts performance. While only +using a single regression loss with a plain convolutional architecture, +BEVTrack achieves state-of-the-art performance on three large-scale datasets, +KITTI, NuScenes, and Waymo Open Dataset while maintaining a high inference +speed of about 200 FPS. The code will be released at +https://github.com/xmm-prio/BEVTrack. + +
+
+ comment: The code will be released at https://github.com/xmm-prio/BEVTrack +
+
+
+
+
+ + ♻ ☆ Discrete approximations of Gaussian smoothing and Gaussian derivatives + + +
+ This paper develops an in-depth treatment of the problem of approximating the +Gaussian smoothing and Gaussian derivative computations in scale-space theory +for application to discrete data. With close connections to previous axiomatic +treatments of continuous and discrete scale-space theory, we consider three +main ways of discretizing these scale-space operations in terms of explicit +discrete convolutions, based on either (i) sampling the Gaussian kernels and +the Gaussian derivative kernels, (ii) locally integrating the Gaussian kernels +and the Gaussian derivative kernels over each pixel support region, or (iii) +basing the scale-space analysis on the discrete analogue of the Gaussian +kernel, and then computing derivative approximations by applying small-support +central difference operators to the spatially smoothed image data. + We study +the properties of these three main discretization methods both theoretically +and experimentally, and characterize their performance by quantitative +measures, including the results they give rise to with respect to the task of +scale selection, investigated for four different use cases, and with emphasis +on the behaviour at fine scales. The results show that the sampled Gaussian +kernels and derivatives, as well as the integrated Gaussian kernels and +derivatives, perform very poorly at very fine scales. At very fine scales, the +discrete analogue of the Gaussian kernel with its corresponding discrete +derivative approximations performs substantially better. The sampled Gaussian +kernel and the sampled Gaussian derivatives do, on the other hand, lead to +numerically very good approximations of the corresponding continuous results +when the scale parameter is sufficiently large; in the experiments presented in +the paper, this means a scale parameter greater than about 1, in units of the +grid spacing. +
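+ To make the comparison above concrete, the SciPy sketch below builds two of
+ the kernels discussed: the sampled Gaussian kernel and the discrete analogue
+ of the Gaussian kernel, T(n, t) = exp(-t) I_n(t) with t = sigma^2, using the
+ exponentially scaled Bessel function scipy.special.ive. The truncation radius
+ and the renormalisation are choices of this sketch, not of the paper.
+
+     import numpy as np
+     from scipy.special import ive   # ive(n, t) = exp(-t) * I_n(t) for t > 0
+
+     def sampled_gaussian(sigma: float, radius: int) -> np.ndarray:
+         """Kernel obtained by sampling the continuous Gaussian at integers."""
+         n = np.arange(-radius, radius + 1)
+         g = np.exp(-n**2 / (2.0 * sigma**2))
+         return g / g.sum()
+
+     def discrete_gaussian(sigma: float, radius: int) -> np.ndarray:
+         """Discrete analogue of the Gaussian kernel at scale t = sigma**2."""
+         n = np.arange(-radius, radius + 1)
+         k = ive(np.abs(n), sigma**2)
+         return k / k.sum()
+
+     for sigma in (0.5, 2.0):
+         gap = np.abs(sampled_gaussian(sigma, 8) - discrete_gaussian(sigma, 8)).max()
+         print(sigma, gap)   # the two kernels differ most at the finer scale
+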
+
+ comment: 38 pages, 34 figures +
+
+
+
+
+ + ♻ ☆ Efficient Vision Transformer for Human Pose Estimation via Patch + Selection BMVC 2023 + + +
+ While Convolutional Neural Networks (CNNs) have been widely successful in 2D +human pose estimation, Vision Transformers (ViTs) have emerged as a promising +alternative to CNNs, boosting state-of-the-art performance. However, the +quadratic computational complexity of ViTs has limited their applicability for +processing high-resolution images. In this paper, we propose three methods for +reducing ViT's computational complexity, which are based on selecting and +processing a small number of the most informative patches while disregarding +others. The first two methods leverage a lightweight pose estimation network to +guide the patch selection process, while the third method utilizes a set of +learnable joint tokens to ensure that the selected patches contain the most +important information about body joints. Experiments across six benchmarks show +that our proposed methods achieve a significant reduction in computational +complexity, ranging from 30% to 44%, with only a minimal drop in accuracy of +between 0% and 3.5%. +
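+ The core mechanism described above, keeping only the most informative patch
+ tokens before the transformer, can be sketched generically as a
+ score-then-select step. The PyTorch snippet below uses a stand-in linear
+ scorer in place of the paper's lightweight pose network or joint tokens; the
+ shapes and k are arbitrary.
+
+     import torch
+     import torch.nn as nn
+
+     torch.manual_seed(0)
+     B, N, D, k = 2, 196, 64, 64          # batch, patches, embed dim, kept patches
+     tokens = torch.randn(B, N, D)
+     scorer = nn.Linear(D, 1)             # placeholder informativeness scorer
+     scores = scorer(tokens).squeeze(-1)              # (B, N) score per patch
+     top_idx = scores.topk(k, dim=1).indices          # indices of the k best patches
+     selected = torch.gather(tokens, 1, top_idx.unsqueeze(-1).expand(-1, -1, D))
+     print(selected.shape)                # (2, 64, 64): only these reach the ViT
+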
+
+ comment: BMVC 2023 Oral Paper: https://proceedings.bmvc2023.org/167/ +
+
+
+
+
+ + ♻ ☆ Looking at the posterior: accuracy and uncertainty of neural-network + predictions + + +
+ Bayesian inference can quantify uncertainty in the predictions of neural +networks using posterior distributions for model parameters and network output. +By looking at these posterior distributions, one can separate the origin of +uncertainty into aleatoric and epistemic contributions. One goal of uncertainty +quantification is to inform on prediction accuracy. Here we show that +prediction accuracy depends on both epistemic and aleatoric uncertainty in an +intricate fashion that cannot be understood in terms of marginalized +uncertainty distributions alone. How the accuracy relates to epistemic and +aleatoric uncertainties depends not only on the model architecture, but also on +the properties of the dataset. We discuss the significance of these results for +active learning and introduce a novel acquisition function that outperforms +common uncertainty-based methods. To arrive at our results, we approximated the +posteriors using deep ensembles, for fully-connected, convolutional and +attention-based neural networks. + +
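+ Separating predictive uncertainty into aleatoric and epistemic parts, as
+ discussed above, is often done with the entropy decomposition of an
+ ensemble's class probabilities. The NumPy sketch below shows that generic
+ decomposition on made-up member predictions; it is not the paper's analysis
+ code, and the acquisition function the paper introduces is not reproduced
+ here.
+
+     import numpy as np
+
+     def decompose_uncertainty(probs: np.ndarray):
+         """probs: (members, classes) per-member class probabilities."""
+         eps = 1e-12
+         mean_p = probs.mean(axis=0)
+         total = -(mean_p * np.log(mean_p + eps)).sum()                  # predictive entropy
+         aleatoric = -(probs * np.log(probs + eps)).sum(axis=1).mean()   # expected entropy
+         epistemic = total - aleatoric                                   # mutual information
+         return total, aleatoric, epistemic
+
+     # Confident but disagreeing members -> mostly epistemic uncertainty.
+     members = np.array([[0.99, 0.01], [0.01, 0.99], [0.99, 0.01],
+                         [0.01, 0.99], [0.99, 0.01]])
+     print(decompose_uncertainty(members))
+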
+
+ comment: 26 pages, 10 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Learning Site-specific Styles for Multi-institutional Unsupervised + Cross-modality Domain Adaptation + + +
+ Unsupervised cross-modality domain adaptation is a challenging task in +medical image analysis, and it becomes more challenging when source and target +domain data are collected from multiple institutions. In this paper, we present +our solution to tackle the multi-institutional unsupervised domain adaptation +for the crossMoDA 2023 challenge. First, we perform unpaired image translation +to translate the source domain images to the target domain, where we design a +dynamic network to generate synthetic target domain images with controllable, +site-specific styles. Afterwards, we train a segmentation model using the +synthetic images and further reduce the domain gap by self-training. Our +solution achieved the 1st place during both the validation and testing phases +of the challenge. The code repository is publicly available at +https://github.com/MedICL-VU/crossmoda2023. + +
+
+ comment: crossMoDA 2023 challenge 1st place solution +
+
+
+
+
+ + ♻ ☆ Deep Rank-Consistent Pyramid Model for Enhanced Crowd Counting + + +
+ Most conventional crowd counting methods utilize a fully-supervised learning +framework to establish a mapping between scene images and crowd density maps. +They usually rely on a large quantity of costly and time-intensive pixel-level +annotations for training supervision. One way to mitigate the intensive +labeling effort and improve counting accuracy is to leverage large amounts of +unlabeled images. This is enabled by the inherent self-structural information +and rank consistency within a single image, which offer additional qualitative +relation supervision during training. In contrast to earlier methods that +utilized rank relations at the original image level, we explore such +rank-consistency relations within the latent feature spaces. This approach +enables the incorporation of numerous pyramid partial orders, strengthening the +model representation capability. A notable advantage is that it can also +increase the utilization ratio of unlabeled samples. Specifically, we propose a +Deep Rank-consistEnt pyrAmid Model (DREAM), which makes full use of rank +consistency across coarse-to-fine pyramid features in latent spaces for +enhanced crowd counting with massive unlabeled images. In addition, we have +collected a new unlabeled crowd counting dataset, FUDAN-UCC, comprising 4,000 +images for training purposes. Extensive experiments on four benchmark datasets, +namely UCF-QNRF, ShanghaiTech PartA and PartB, and UCF-CC-50, show the +effectiveness of our method compared with previous semi-supervised methods. The +codes are available at https://github.com/bridgeqiqi/DREAM. +
+
+ comment: Accepted by IEEE Transactions on Neural Networks and Learning Systems +
+
+
+
+
+ + ♻ ☆ pSTarC: Pseudo Source Guided Target Clustering for Fully Test-Time + Adaptation WACV 2024 + + +
+ Test Time Adaptation (TTA) is a pivotal concept in machine learning, enabling +models to perform well in real-world scenarios, where the test data +distribution differs from the training distribution. In this work, we propose a +novel approach called pseudo Source guided Target Clustering (pSTarC), +addressing the relatively unexplored area of TTA under real-world domain +shifts. This method draws inspiration from target clustering techniques and +exploits the source classifier for generating pseudo-source samples. The test +samples are strategically aligned with these pseudo-source samples, +facilitating their clustering and thereby enhancing TTA performance. pSTarC +operates solely within the fully test-time adaptation protocol, removing the +need for actual source data. Experimental validation on a variety of domain +shift datasets, namely VisDA, Office-Home, DomainNet-126, and CIFAR-100C, +verifies pSTarC's effectiveness. This method exhibits significant improvements +in prediction accuracy along with efficient computational requirements. +Furthermore, we also demonstrate the universality of the pSTarC framework by +showing its effectiveness for the continuous TTA framework. The source code for +our method is available at https://manogna-s.github.io/pstarc +
+
+ comment: Accepted in WACV 2024 +
+
+
+
+
+ + ♻ ☆ USL-Net: Uncertainty Self-Learning Network for Unsupervised Skin Lesion + Segmentation + + +
+ Unsupervised skin lesion segmentation offers several benefits, including +conserving expert human resources, reducing discrepancies due to subjective +human labeling, and adapting to novel environments. However, segmenting +dermoscopic images without manual labeling guidance presents significant +challenges due to dermoscopic image artifacts such as hair noise, blister +noise, and subtle edge differences. To address these challenges, we introduce +an innovative Uncertainty Self-Learning Network (USL-Net) designed for skin +lesion segmentation. The USL-Net can effectively segment a range of lesions, +eliminating the need for manual labeling guidance. Initially, features are +extracted using contrastive learning, followed by the generation of Class +Activation Maps (CAMs) as saliency maps using these features. The different CAM +locations correspond to the importance of the lesion region based on their +saliency. High-saliency regions in the map serve as pseudo-labels for lesion +regions while low-saliency regions represent the background. However, +intermediate regions can be hard to classify, often due to their proximity to +lesion edges or interference from hair or blisters. Rather than risk potential +pseudo-labeling errors or learning confusion by forcefully classifying these +regions, we consider them as uncertainty regions, exempting them from +pseudo-labeling and allowing the network to self-learn. Further, we employ +connectivity detection and centrality detection to refine foreground +pseudo-labels and reduce noise-induced errors. The application of cycle +refining enhances performance further. Our method underwent thorough +experimental validation on the ISIC-2017, ISIC-2018, and PH2 datasets, +demonstrating that its performance is on par with weakly supervised and +supervised methods, and exceeds that of other existing unsupervised methods. + +
+
+ comment: 14 pages, 9 figures, 71 references +
+
+
+
+
+ + ♻ ☆ PsyMo: A Dataset for Estimating Self-Reported Psychological Traits from + Gait WACV + + +
+ Psychological trait estimation from external factors such as movement and +appearance is a challenging and long-standing problem in psychology, and is +principally based on the psychological theory of embodiment. To date, attempts +to tackle this problem have utilized private small-scale datasets with +intrusive body-attached sensors. Potential applications of an automated system +for psychological trait estimation include estimation of occupational fatigue +and psychology, and marketing and advertisement. In this work, we propose PsyMo +(Psychological traits from Motion), a novel, multi-purpose and multi-modal +dataset for exploring psychological cues manifested in walking patterns. We +gathered walking sequences from 312 subjects in 7 different walking variations +and 6 camera angles. In conjunction with walking sequences, participants filled +in 6 psychological questionnaires, totalling 17 psychometric attributes related +to personality, self-esteem, fatigue, aggressiveness and mental health. We +propose two evaluation protocols for psychological trait estimation. Alongside +the estimation of self-reported psychological traits from gait, the dataset can +be used as a drop-in replacement to benchmark methods for gait recognition. We +anonymize all cues related to the identity of the subjects and publicly release +only silhouettes, 2D / 3D human skeletons and 3D SMPL human meshes. + +
+
+ comment: Accepted at 2024 IEEE/CVF Winter Conference on Applications of + Computer Vision (WACV) +
+
+
+
+
+ + ♻ ☆ Hand-Eye Calibration + + +
+ Whenever a sensor is mounted on a robot hand, it is important to know the +relationship between the sensor and the hand. The problem of determining this +relationship is referred to as hand-eye calibration, which is important in at +least two types of tasks: (i) mapping sensor-centered measurements into the +robot workspace and (ii) allowing the robot to precisely move the sensor. In +the past, some solutions were proposed for the particular case of a camera. +With almost no exception, all existing solutions attempt to solve the +homogeneous matrix equation AX=XB. First, we show that there are two possible +formulations of the hand-eye calibration problem. One formulation is the +classical one that we just mentioned. A second formulation takes the form of +the following homogeneous matrix equation: MY=M'YB. The advantage of the latter +is that the extrinsic and intrinsic camera parameters need not be made +explicit. Indeed, this formulation directly uses the 3 by 4 perspective +matrices (M and M') associated with two positions of the camera. Moreover, this +formulation, together with the classical one, covers a wider range of +camera-based sensors to be calibrated with respect to the robot hand. Second, +we develop a common mathematical framework to solve the hand-eye calibration +problem using either of the two formulations. We present two methods: (i) a +rotation-then-translation method and (ii) a non-linear solver for rotation and +translation. Third, we perform a stability analysis both for our two methods +and for the classical linear method of Tsai and Lenz (1989). In the light of +this comparison, the non-linear optimization method, which solves for rotation +and translation simultaneously, seems to be the most robust one with respect to +noise and to measurement errors. +
+
+
+
+
+ + ♻ ☆ Symphonize 3D Semantic Scene Completion with Contextual Instance Queries + + +
+ 3D Semantic Scene Completion (SSC) has emerged as a nascent and pivotal +undertaking in autonomous driving, aiming to predict voxel occupancy within +volumetric scenes. However, prevailing methodologies primarily focus on +voxel-wise feature aggregation, while neglecting instance semantics and scene +context. In this paper, we present a novel paradigm termed Symphonies +(Scene-from-Insts), which delves into the integration of instance queries to +orchestrate 2D-to-3D reconstruction and 3D scene modeling. Leveraging our +proposed Serial Instance-Propagated Attentions, Symphonies dynamically encodes +instance-centric semantics, facilitating intricate interactions between +image-based and volumetric domains. Simultaneously, Symphonies enables holistic +scene comprehension by capturing context through the efficient fusion of +instance queries, alleviating geometric ambiguity such as occlusion and +perspective errors through contextual scene reasoning. Experimental results +demonstrate that Symphonies achieves state-of-the-art performance on the +challenging benchmarks SemanticKITTI and SSCBench-KITTI-360, yielding +remarkable mIoU scores of 15.04 and 18.58, respectively. These results showcase +the paradigm's promising advancements. The code is available at +https://github.com/hustvl/Symphonies. +
+
+ comment: Technical report. Code and models at: + https://github.com/hustvl/Symphonies +
+
+
+
+
+ + ♻ ☆ CLIP Guided Image-perceptive Prompt Learning for Image Enhancement + + +
+ Image enhancement is a significant research area in the fields of computer +vision and image processing. In recent years, many learning-based methods for +image enhancement have been developed, where the Look-up-table (LUT) has proven +to be an effective tool. In this paper, we delve into the potential of +Contrastive Language-Image Pre-Training (CLIP) Guided Prompt Learning, +proposing a simple structure called CLIP-LUT for image enhancement. We found +that the prior knowledge of CLIP can effectively discern the quality of +degraded images, which can provide reliable guidance. To be specific, we +initially learn image-perceptive prompts to distinguish between original and +target images using the CLIP model; meanwhile, we introduce a very simple +enhancement network that incorporates a simple baseline to predict the weights +of three different LUTs. The obtained prompts are used to steer the enhancement +network, acting like a loss function, and improve the performance of the model. +We demonstrate that by simply combining a straightforward method with CLIP, we +can obtain satisfactory results. +
+
+ comment: A trial work on image enhancement
+
+
+
+
+ + ♻ ☆ ProlificDreamer: High-Fidelity and Diverse Text-to-3D Generation with + Variational Score Distillation NeurIPS 2023 + + +
+ Score distillation sampling (SDS) has shown great promise in text-to-3D +generation by distilling pretrained large-scale text-to-image diffusion models, +but suffers from over-saturation, over-smoothing, and low-diversity problems. +In this work, we propose to model the 3D parameter as a random variable instead +of a constant as in SDS and present variational score distillation (VSD), a +principled particle-based variational framework to explain and address the +aforementioned issues in text-to-3D generation. We show that SDS is a special +case of VSD and leads to poor samples with both small and large CFG weights. In +comparison, VSD works well with various CFG weights as ancestral sampling from +diffusion models and simultaneously improves the diversity and sample quality +with a common CFG weight (i.e., $7.5$). We further present various improvements +in the design space for text-to-3D such as distillation time schedule and +density initialization, which are orthogonal to the distillation algorithm yet +not well explored. Our overall approach, dubbed ProlificDreamer, can generate +high rendering resolution (i.e., $512\times512$) and high-fidelity NeRF with +rich structure and complex effects (e.g., smoke and drops). Further, +initialized from NeRF, meshes fine-tuned by VSD are meticulously detailed and +photo-realistic. Project page and codes: +https://ml.cs.tsinghua.edu.cn/prolificdreamer/ + +
+
+ comment: NeurIPS 2023 (Spotlight) +
+
+
+
+
+ + ♻ ☆ PhysGaussian: Physics-Integrated 3D Gaussians for Generative Dynamics + + +
+ We introduce PhysGaussian, a new method that seamlessly integrates physically +grounded Newtonian dynamics within 3D Gaussians to achieve high-quality novel +motion synthesis. Employing a custom Material Point Method (MPM), our approach +enriches 3D Gaussian kernels with physically meaningful kinematic deformation +and mechanical stress attributes, all evolved in line with continuum mechanics +principles. A defining characteristic of our method is the seamless integration +between physical simulation and visual rendering: both components utilize the +same 3D Gaussian kernels as their discrete representations. This negates the +necessity for triangle/tetrahedron meshing, marching cubes, "cage meshes," or +any other geometry embedding, highlighting the principle of "what you see is +what you simulate (WS$^2$)." Our method demonstrates exceptional versatility +across a wide variety of materials--including elastic entities, metals, +non-Newtonian fluids, and granular materials--showcasing its strong +capabilities in creating diverse visual content with novel viewpoints and +movements. Our project page is at: https://xpandora.github.io/PhysGaussian/ + +
+
+
+
+
+ + ♻ ☆ GLAD: Global-Local View Alignment and Background Debiasing for + Unsupervised Video Domain Adaptation with Large Domain Gap WACV 2024 + + +
+ In this work, we tackle the challenging problem of unsupervised video domain +adaptation (UVDA) for action recognition. We specifically focus on scenarios +with a substantial domain gap, in contrast to existing works that primarily +deal with small domain gaps between labeled source domains and unlabeled target +domains. To establish a more realistic setting, we introduce a novel UVDA +scenario, denoted as Kinetics->BABEL, with a more considerable domain gap in +terms of both temporal dynamics and background shifts. To tackle the temporal +shift, i.e., the difference in action duration between the source and target +domains, we propose a global-local view alignment approach. To mitigate the +background shift, we propose to learn temporal-order-sensitive representations +by temporal order learning and background-invariant representations by +background augmentation. We empirically validate that the proposed method shows +significant improvement over existing methods on the Kinetics->BABEL dataset +with a large domain gap. The code is available at +https://github.com/KHUVLL/GLAD. +
+
+ comment: This is an accepted WACV 2024 paper. Our code is available at + https://github.com/KHUVLL/GLAD +
+
+
+
+
+ + ♻ ☆ LASER: A Neuro-Symbolic Framework for Learning Spatial-Temporal Scene + Graphs with Weak Supervision + + +
+ We propose LASER, a neuro-symbolic approach to learn semantic video +representations that capture rich spatial and temporal properties in video data +by leveraging high-level logic specifications. In particular, we formulate the +problem in terms of alignment between raw videos and spatio-temporal logic +specifications. The alignment algorithm leverages a differentiable symbolic +reasoner and a combination of contrastive, temporal, and semantics losses. It +effectively and efficiently trains low-level perception models to extract +fine-grained video representation in the form of a spatio-temporal scene graph +that conforms to the desired high-level specification. In doing so, we explore +a novel methodology that weakly supervises the learning of video semantic +representations through logic specifications. We evaluate our method on two +datasets with rich spatial and temporal specifications: +20BN-Something-Something and MUGEN. We demonstrate that our method learns +better fine-grained video semantics than existing baselines. + +
+
+
+
+
+ + ♻ ☆ Self supervised convolutional kernel based handcrafted feature + harmonization: Enhanced left ventricle hypertension disease phenotyping on + echocardiography + + +
+ Radiomics, a medical imaging technique, extracts quantitative handcrafted +features from images to predict diseases. Harmonization of these features +ensures consistent feature extraction across various imaging devices and +protocols. Methods for harmonization include standardized imaging protocols, +statistical adjustments, and evaluating feature robustness. Myocardial diseases +such as Left Ventricular Hypertrophy (LVH) and Hypertensive Heart Disease (HHD) +are diagnosed via echocardiography, but variable imaging settings pose +challenges. Harmonization techniques are crucial for applying handcrafted +features to disease diagnosis in such scenarios. Self-supervised learning (SSL) +enhances data understanding within limited datasets and adapts to diverse data +settings. ConvNeXt-V2 integrates convolutional layers into SSL, displaying +superior performance in various tasks. This study focuses on convolutional +filters within SSL, using them as a preprocessing step to convert images into +feature maps for handcrafted feature harmonization. Our proposed method +excelled in the harmonization evaluation and exhibited superior LVH +classification performance compared to existing methods. +
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Towards Better Data Exploitation in Self-Supervised Monocular Depth + Estimation + + +
+ Depth estimation plays an important role in the robotic perception system. +The self-supervised monocular paradigm has gained significant attention since +it can free training from the reliance on depth annotations. Despite recent +advancements, existing self-supervised methods still underutilize the available +training data, limiting their generalization ability. In this paper, we employ +two data augmentation techniques, namely Resizing-Cropping and +Splitting-Permuting, to fully exploit the potential of training datasets. +Specifically, the original image and the two generated augmented images are fed +into the training pipeline simultaneously and we leverage them to conduct +self-distillation. Additionally, we introduce the detail-enhanced DepthNet with +an extra full-scale branch in the encoder and a grid decoder to enhance the +restoration of fine details in depth maps. Experimental results demonstrate +that our method can achieve state-of-the-art performance on the KITTI +benchmark, with both raw ground truth and improved ground truth. Moreover, our +models also show superior generalization performance when transferred to the +Make3D and NYUv2 datasets. Our code is available at +https://github.com/Sauf4896/BDEdepth. +
+
+ comment: 8 pages, 6 figures, accepted by IEEE Robotics and Automation Letters + (RA-L, 2023) +
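To make the two augmentations concrete, here is a minimal NumPy sketch of Resizing-Cropping and Splitting-Permuting as they are described in the abstract. The scale factor, the nearest-neighbour resize, and the two-way split are illustrative assumptions rather than the actual BDEdepth implementation, which lives in the linked repository.

```python
import numpy as np

rng = np.random.default_rng(0)

def resizing_cropping(img, scale=1.2):
    """Upscale the image (nearest-neighbour for brevity), then randomly crop
    back to the original resolution."""
    h, w, _ = img.shape
    nh, nw = int(h * scale), int(w * scale)
    ys = (np.arange(nh) * h / nh).astype(int)
    xs = (np.arange(nw) * w / nw).astype(int)
    up = img[ys][:, xs]                       # nearest-neighbour resize
    top = rng.integers(0, nh - h + 1)
    left = rng.integers(0, nw - w + 1)
    return up[top:top + h, left:left + w]

def splitting_permuting(img):
    """Split the image into two halves along a random axis and swap them."""
    axis = int(rng.integers(0, 2))            # 0: horizontal cut, 1: vertical cut
    cut = img.shape[axis] // 2
    a, b = np.split(img, [cut], axis=axis)
    return np.concatenate([b, a], axis=axis)

img = np.zeros((192, 640, 3), dtype=np.uint8)      # KITTI-like resolution
aug1, aug2 = resizing_cropping(img), splitting_permuting(img)
assert aug1.shape == aug2.shape == img.shape       # all three views share one shape
```

In training, the original image and both augmented views would be fed through the network together, with the original view serving as the self-distillation teacher for the augmented ones.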
+
+
+
+
+ + ♻ ☆ BLIVA: A Simple Multimodal LLM for Better Handling of Text-Rich Visual + Questions + + +
+ Vision Language Models (VLMs), which extend Large Language Models (LLM) by +incorporating visual understanding capability, have demonstrated significant +advancements in addressing open-ended visual question-answering (VQA) tasks. +However, these models cannot accurately interpret images infused with text, a +common occurrence in real-world scenarios. Standard procedures for extracting +information from images often involve learning a fixed set of query embeddings. +These embeddings are designed to encapsulate image contexts and are later used +as soft prompt inputs in LLMs. Yet, this process is limited to the token count, +potentially curtailing the recognition of scenes with text-rich context. To +improve upon them, the present study introduces BLIVA: an augmented version of +InstructBLIP with Visual Assistant. BLIVA incorporates the query embeddings +from InstructBLIP and also directly projects encoded patch embeddings into the +LLM, a technique inspired by LLaVA. This approach assists the model to capture +intricate details potentially missed during the query decoding process. +Empirical evidence demonstrates that our model, BLIVA, significantly enhances +performance in processing text-rich VQA benchmarks (up to 17.76% in OCR-VQA +benchmark) and in undertaking general (not particularly text-rich) VQA +benchmarks (up to 7.9% in Visual Spatial Reasoning benchmark), comparing to our +baseline InstructBLIP. BLIVA demonstrates significant capability in decoding +real-world images, irrespective of text presence. To demonstrate the broad +industry applications enabled by BLIVA, we evaluate the model using a new +dataset comprising YouTube thumbnails paired with question-answer sets across +11 diverse categories. For researchers interested in further exploration, our +code and models are freely accessible at https://github.com/mlpc-ucsd/BLIVA. + +
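As a rough illustration of the architecture described above, the sketch below concatenates Q-Former-style query embeddings with directly projected patch embeddings into a single visual soft prompt that is prepended to the text embeddings of the LLM. All dimensions and module names are assumptions for illustration only; the authors' actual implementation is in the linked repository.

```python
import torch
import torch.nn as nn

# Assumed dimensions for illustration only (not the actual BLIVA configuration).
num_queries, d_qformer = 32, 768      # InstructBLIP-style query embeddings
num_patches, d_vit = 257, 1024        # ViT patch embeddings (incl. CLS token)
d_llm = 4096                          # LLM hidden size

query_embeds = torch.randn(1, num_queries, d_qformer)   # from the Q-Former
patch_embeds = torch.randn(1, num_patches, d_vit)       # from the vision encoder
text_embeds = torch.randn(1, 16, d_llm)                 # embedded text tokens

query_proj = nn.Linear(d_qformer, d_llm)   # InstructBLIP-style query projection
patch_proj = nn.Linear(d_vit, d_llm)       # extra LLaVA-style patch projection

# Visual soft prompt = projected query embeddings + projected patch embeddings,
# prepended to the text embeddings before feeding the LLM.
soft_prompt = torch.cat([query_proj(query_embeds), patch_proj(patch_embeds)], dim=1)
llm_inputs = torch.cat([soft_prompt, text_embeds], dim=1)
print(llm_inputs.shape)   # torch.Size([1, 32 + 257 + 16, 4096])
```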
+
+
+
+
+ + ♻ ☆ Enhancing Novel Object Detection via Cooperative Foundational Models + + +
+ In this work, we address the challenging and emergent problem of novel object +detection (NOD), focusing on the accurate detection of both known and novel +object categories during inference. Traditional object detection algorithms are +inherently closed-set, limiting their capability to handle NOD. We present a +novel approach to transform existing closed-set detectors into open-set +detectors. This transformation is achieved by leveraging the complementary +strengths of pre-trained foundational models, specifically CLIP and SAM, +through our cooperative mechanism. Furthermore, by integrating this mechanism +with state-of-the-art open-set detectors such as GDINO, we establish new +benchmarks in object detection performance. Our method achieves 17.42 mAP in +novel object detection and 42.08 mAP for known objects on the challenging LVIS +dataset. Adapting our approach to the COCO OVD split, we surpass the current +state-of-the-art by a margin of 7.2 $ \text{AP}_{50} $ for novel classes. Our +code is available at +https://github.com/rohit901/cooperative-foundational-models . + +
+
+ comment: Code: https://github.com/rohit901/cooperative-foundational-models +
+
+
+
+
+ + ♻ ☆ Unsupervised Disentangling of Facial Representations with 3D-aware + Latent Diffusion Models + + +
+ Unsupervised learning of facial representations has gained increasing +attention for face understanding ability without heavily relying on large-scale +annotated datasets. However, it remains unsolved due to the coupling of facial +identities, expressions, and external factors like pose and light. Prior +methods primarily focus on 2D factors and pixel-level consistency, leading to +incomplete disentangling and suboptimal performance in downstream tasks. In +this paper, we propose LatentFace, a novel unsupervised disentangling framework +for facial expression and identity representation. We suggest the disentangling +problem should be performed in latent space and propose the solution using a +3D-aware latent diffusion model. First, we introduce a 3D-aware autoencoder to +encode face images into 3D latent embeddings. Second, we propose a novel +representation diffusion model (RDM) to disentangle 3D latent into facial +identity and expression. Consequently, our method achieves state-of-the-art +performance in facial expression recognition and face verification among +unsupervised facial representation learning models. Codes are available at +\url{https://github.com/ryanhe312/LatentFace}. + +
+
+
+
+
+ + ♻ ☆ ChemScraper: Graphics Extraction, Molecular Diagram Parsing, and + Annotated Data Generation for PDF Images + + +
+ Existing visual parsers for molecule diagrams translate pixel-based raster +images such as PNGs to chemical structure representations (e.g., SMILES). +However, PDFs created by word processors including LaTeX and Word provide +explicit locations and shapes for characters, lines, and polygons. We extract +symbols from born-digital PDF molecule images and then apply simple graph +transformations to capture both visual and chemical structure in editable +ChemDraw files (CDXML). Our fast ( PDF $\rightarrow$ visual graph $\rightarrow$ +chemical graph ) pipeline does not require GPUs, Optical Character Recognition +(OCR) or vectorization. We evaluate on standard benchmarks using SMILES +strings, along with a novel evaluation that provides graph-based metrics and +error compilation using LgEval. The geometric information in born-digital PDFs +produces a highly accurate parser, motivating generating training data for +visual parsers that recognize from raster images, with extracted graphics, +visual structure, and chemical structure as annotations. To do this we render +SMILES strings in Indigo, parse molecule structure, and then validate +recognized structure to select correct files. + +
+
+ comment: 20 pages without references, 10 figures, 3 Tables, submitted to + International Journal on Document Analysis and Recognition (IJDAR) +
+
+
+
+
+ + ♻ ☆ ShaDDR: Interactive Example-Based Geometry and Texture Generation via 3D + Shape Detailization and Differentiable Rendering SIGGRAPH + + +
+ We present ShaDDR, an example-based deep generative neural network which +produces a high-resolution textured 3D shape through geometry detailization and +conditional texture generation applied to an input coarse voxel shape. Trained +on a small set of detailed and textured exemplar shapes, our method learns to +detailize the geometry via multi-resolution voxel upsampling and generate +textures on voxel surfaces via differentiable rendering against exemplar +texture images from a few views. The generation is interactive, taking less +than 1 second to produce a 3D model with voxel resolutions up to 512^3. The +generated shape preserves the overall structure of the input coarse voxel +model, while the style of the generated geometric details and textures can be +manipulated through learned latent codes. In the experiments, we show that our +method can generate higher-resolution shapes with plausible and improved +geometric details and clean textures compared to prior works. Furthermore, we +showcase the ability of our method to learn geometric details and textures from +shapes reconstructed from real-world photos. In addition, we have developed an +interactive modeling application to demonstrate the generalizability of our +method to various user inputs and the controllability it offers, allowing users +to interactively sculpt a coarse voxel shape to define the overall structure of +the detailized 3D shape. Code and data are available at +https://github.com/qiminchen/ShaDDR. + +
+
+ comment: Accepted to SIGGRAPH Asia 2023 conference track. Code: + https://github.com/qiminchen/ShaDDR +
+
+
+
+
+ + ♻ ☆ Early Detection of Late Blight Tomato Disease using Histogram Oriented + Gradient based Support Vector Machine + + +
+ The tomato is one of the most important fruits on earth. It plays an +important and useful role in the agricultural production of any country. This +research proposes a novel smart technique for early detection of late blight +disease in tomatoes. This work improves the dataset with an increase in images +from the field (the Plant Village dataset) and proposes a hybrid algorithm +composed of support vector machines (SVM) and histogram-oriented gradients +(HOG) for real-time detection of late blight tomato disease. The aims are to +propose a HOG-based SVM model for early detection of late blight tomato leaf +disease and to evaluate the proposed model in terms of MSE, accuracy, +precision, and recall in comparison with Decision Tree and KNN. The integration +of advanced technology in agriculture has the potential to revolutionize the +industry, making it more efficient, sustainable, and profitable. This research +work on the early detection of tomato diseases contributes to the growing +importance of smart farming, the need for climate-smart agriculture, the rising +need to utilize natural resources more efficiently, and the demand for higher +crop yields. The proposed hybrid algorithm of SVM and HOG has significant +potential for the early detection of late blight disease in tomato plants. The +performance of the proposed model is compared against the decision tree and KNN +algorithms, and the results may assist in selecting the best algorithm for +future applications. This research can help farmers make data-driven decisions +to optimize crop yield and quality while also reducing the environmental impact +of farming practices. +
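A minimal sketch of the HOG-plus-SVM pipeline described above, using scikit-image and scikit-learn. The random arrays stand in for preprocessed grayscale leaf images, and parameters such as the HOG cell size and the SVM kernel are illustrative assumptions rather than the paper's settings.

```python
import numpy as np
from skimage.feature import hog
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, mean_squared_error

def hog_features(images):
    # images: iterable of grayscale leaf images, e.g. 128x128 float arrays
    return np.array([
        hog(img, orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2))
        for img in images
    ])

# Placeholder data standing in for preprocessed Plant Village / field images.
rng = np.random.default_rng(0)
images = rng.random((200, 128, 128))
labels = rng.integers(0, 2, 200)          # 0 = healthy, 1 = late blight

X_train, X_test, y_train, y_test = train_test_split(
    hog_features(images), labels, test_size=0.2, random_state=0)

clf = SVC(kernel="rbf", C=1.0).fit(X_train, y_train)
pred = clf.predict(X_test)
print("accuracy:", accuracy_score(y_test, pred))
print("precision:", precision_score(y_test, pred, zero_division=0))
print("recall:", recall_score(y_test, pred, zero_division=0))
print("mse:", mean_squared_error(y_test, pred))
```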
+
+ comment: The article titled "Early Detection of Late Blight Tomato Disease + using Histogram Oriented Gradient based Support Vector Machine" needs to be + withdrawn; there are other contributors to the improvement of this article
+
+
+
+
+ + ♻ ☆ Applications of Large Scale Foundation Models for Autonomous Driving + + +
+ Since the DARPA Grand Challenges (rural) in 2004/05 and the Urban Challenges in +2007, autonomous driving has been the most active field of AI applications. +Recently, powered by large language models (LLMs), chat systems such as chatGPT +and PaLM have emerged and rapidly become a promising direction to achieve +artificial general intelligence (AGI) in natural language processing (NLP). It +is natural to think that these abilities could be employed to reformulate +autonomous driving. By combining LLMs with foundation models, it is possible to +utilize human knowledge, commonsense and reasoning to rebuild autonomous driving +systems out of the current long-tailed AI dilemma. In this paper, we investigate +the techniques of foundation models and LLMs applied to autonomous driving, +categorized into simulation, world models, data annotation, and planning or +end-to-end (E2E) solutions. +
+
+ comment: 23 pages. arXiv admin note: text overlap with arXiv:2304.03589, + arXiv:2111.05849, arXiv:2306.03000, arXiv:2301.02691, arXiv:2309.16292, + arXiv:2309.17080, arXiv:2309.10228, arXiv:2310.01415 by other authors +
+
+
+
+
+ + ♻ ☆ Pose-Graph Attentional Graph Neural Network for Lidar Place Recognition + + +
+ This paper proposes a pose-graph attentional graph neural network, called +P-GAT, which compares (key)nodes between sequential and non-sequential +sub-graphs for place recognition tasks, as opposed to the common frame-to-frame +retrieval problem formulation currently implemented in SOTA place recognition +methods. P-GAT uses the maximum spatial and temporal information between +neighbour cloud descriptors -- generated by an existing encoder -- utilising +the concept of pose-graph SLAM. Leveraging intra- and inter-attention and graph +neural networks, P-GAT relates point clouds captured in nearby locations in +Euclidean space and their embeddings in feature space. Experimental results on +large-scale publicly available datasets demonstrate the effectiveness of +our approach in scenes lacking distinct features and when training and testing +environments have different distributions (domain adaptation). Further, an +exhaustive comparison with the state-of-the-art shows consistent performance +gains. Code is available at +https://github.com/csiro-robotics/P-GAT. +
+
+ comment: 10 pages, 5 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Surgical Temporal Action-aware Network with Sequence Regularization for + Phase Recognition + + +
+ To assist surgeons in the operating theatre, surgical phase recognition is +critical for developing computer-assisted surgical systems, which requires +comprehensive understanding of surgical videos. Although existing studies made +great progress, there are still two significant limitations worthy of +improvement. First, due to the compromise of resource consumption, frame-wise +visual features are extracted by 2D networks and disregard spatial and temporal +knowledge of surgical actions, which hinders subsequent inter-frame modeling +for phase prediction. Second, these works simply utilize ordinary +classification loss with one-hot phase labels to optimize the phase +predictions, and cannot fully explore surgical videos under inadequate +supervision. To overcome these two limitations, we propose a Surgical Temporal +Action-aware Network with sequence Regularization, named STAR-Net, to recognize +surgical phases more accurately from input videos. Specifically, we propose an +efficient multi-scale surgical temporal action (MS-STA) module, which +integrates visual features with spatial and temporal knowledge of surgical +actions at the cost of 2D networks. Moreover, we devise the dual-classifier +sequence regularization (DSR) to facilitate the training of STAR-Net by the +sequence guidance of an auxiliary classifier with a smaller capacity. Our +STAR-Net with MS-STA and DSR can exploit visual features of surgical actions +with effective regularization, thereby leading to the superior performance of +surgical phase recognition. Extensive experiments on a large-scale gastrectomy +surgery dataset and the public Cholec80 benchmark prove that our STAR-Net +significantly outperforms state-of-the-arts of surgical phase recognition. + +
+
+ comment: Accepted by 2023 IEEE International Conference on Bioinformatics and + Biomedicine (BIBM 2023) +
+
+
+
+
+ + ♻ ☆ Mapping EEG Signals to Visual Stimuli: A Deep Learning Approach to Match + vs. Mismatch Classification + + +
+ Existing approaches to modeling associations between visual stimuli and brain +responses are facing difficulties in handling between-subject variance and +model generalization. Inspired by the recent progress in modeling speech-brain +response, we propose in this work a "match-vs-mismatch" deep learning model to +classify whether a video clip induces excitatory responses in recorded EEG +signals and learn associations between the visual content and corresponding +neural recordings. Using an exclusive experimental dataset, we demonstrate that +the proposed model is able to achieve the highest accuracy on unseen subjects +as compared to other baseline models. Furthermore, we analyze the inter-subject +noise using a subject-level silhouette score in the embedding space and show +that the developed model is able to mitigate inter-subject noise and +significantly reduce the silhouette score. Moreover, we examine the Grad-CAM +activation score and show that the brain regions associated with language +processing contribute most to the model predictions, followed by regions +associated with visual processing. These results have the potential to +facilitate the development of neural recording-based video reconstruction and +its related applications. + +
+
+
+
+
+
+
+
+ + Information Retrieval 7 + +
+
+
+ + ☆ Drilling Down into the Discourse Structure with LLMs for Long Document + Question Answering EMNLP 2023 + + +
+ We address the task of evidence retrieval for long document question +answering, which involves locating relevant paragraphs within a document to +answer a question. We aim to assess the applicability of large language models +(LLMs) in the task of zero-shot long document evidence retrieval, owing to +their unprecedented performance across various NLP tasks. However, currently +the LLMs can consume limited context lengths as input, thus providing document +chunks as inputs might overlook the global context while missing out on +capturing the inter-segment dependencies. Moreover, directly feeding the large +input sets can incur significant computational costs, particularly when +processing the entire document (and potentially incurring monetary expenses +with enterprise APIs like OpenAI's GPT variants). To address these challenges, +we propose a suite of techniques that exploit the discourse structure commonly +found in documents. By utilizing this structure, we create a condensed +representation of the document, enabling a more comprehensive understanding and +analysis of relationships between different parts. We retain $99.6\%$ of the +best zero-shot approach's performance, while processing only $26\%$ of the +total tokens used by the best approach in the information seeking evidence +retrieval setup. We also show how our approach can be combined with +\textit{self-ask} reasoning agent to achieve best zero-shot performance in +complex multi-hop question answering, just $\approx 4\%$ short of zero-shot +performance using gold evidence. + +
+
+ comment: Accepted to the Findings of EMNLP 2023 +
+
+
+
+
+ + ☆ LM-Cocktail: Resilient Tuning of Language Models via Model Merging + + +
+ Pre-trained language models are continually fine-tuned to better support +downstream applications. However, this operation may result in significant +performance degeneration on general tasks beyond the targeted domain. To +overcome this problem, we propose a novel method which enables the fine-tuned +model to stay resilient on general tasks. Our method is conducted in the +form of model merging (namely LM-Cocktail), where the fine-tuned language model +is merged with the pre-trained base model or the peer models from other domains +through a weighted average. Despite its simplicity, LM-Cocktail is surprisingly +effective: the resulting model is able to achieve strong empirical performance +across the whole scope of general tasks while preserving a superior capacity in +its targeted domain. We conduct comprehensive experiments with the LLama and BGE +models on popular benchmarks, including FLAN, MMLU, and MTEB; the results +validate the efficacy of our proposed method. The code and checkpoints are +available at +https://github.com/FlagOpen/FlagEmbedding. +
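The core operation, a weighted average of model parameters, can be sketched as follows. The toy linear models and the example weights are assumptions for illustration only, not the released LM-Cocktail code or its recommended weights.

```python
import torch

def lm_cocktail(state_dicts, weights):
    """Merge models by a weighted average of their parameters (LM-Cocktail style).
    All state dicts must share the same architecture and keys."""
    assert abs(sum(weights) - 1.0) < 1e-6
    merged = {}
    for key in state_dicts[0]:
        merged[key] = sum(w * sd[key].float() for w, sd in zip(weights, state_dicts))
    return merged

# Toy stand-ins for a fine-tuned model, its pre-trained base, and a peer model.
def toy_model():
    return torch.nn.Linear(8, 8)

fine_tuned, base, peer = toy_model(), toy_model(), toy_model()
merged_sd = lm_cocktail(
    [fine_tuned.state_dict(), base.state_dict(), peer.state_dict()],
    weights=[0.5, 0.3, 0.2],          # example weights, not the paper's values
)
merged_model = toy_model()
merged_model.load_state_dict(merged_sd)
```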
+
+
+
+
+ + ☆ A Comparative Analysis of Supportive Navigation on Movie Recommenders + + +
+ This literature review covers the research and thought process that went into +designing a solution for the infinite scrolling problem faced in streaming +services such as Netflix. Using the data collected, we have come to the +conclusion that an alternate layout can somewhat alleviate the problems users +face when navigating a list of movies. We also found through a comparative +analysis that some layouts, the circular one in particular, are advantageous in +certain settings, making them ideal candidates for a movie recommender system. +
+
+ comment: This was an extensive survey and prototyping effort we undertook to + propose an alternative user interface for movie recommender systems like + Netflix
+
+
+
+
+ + ☆ Fact-based Court Judgment Prediction + + +
+ This extended abstract extends the research presented in "ILDC for CJPE: +Indian Legal Documents Corpus for Court Judgment Prediction and Explanation" +\cite{malik-etal-2021-ildc}, focusing on fact-based judgment prediction within +the context of Indian legal documents. We introduce two distinct problem +variations: one based solely on facts, and another combining facts with rulings +from lower courts (RLC). Our research aims to enhance early-phase case outcome +prediction, offering significant benefits to legal professionals and the +general public. The results, however, indicated a performance decline compared +to the original ILDC for CJPE study, even after implementing various weightage +schemes in our DELSumm algorithm. Additionally, using only facts for legal +judgment prediction with different transformer models yielded results inferior +to the state-of-the-art outcomes reported in the "ILDC for CJPE" study. + +
+
+
+
+
+ + ☆ Hierarchical Matrix Factorization for Interpretable Collaborative + Filtering + + +
+ Matrix factorization (MF) is a simple collaborative filtering technique that +achieves superior recommendation accuracy by decomposing the user-item rating +matrix into user and item latent matrices. This approach relies on learning +from user-item interactions, which may not effectively capture the underlying +shared dependencies between users or items. Therefore, there is scope to +explicitly capture shared dependencies to further improve recommendation +accuracy and the interpretability of learning results by summarizing user-item +interactions. Based on these insights, we propose "Hierarchical Matrix +Factorization" (HMF), which incorporates clustering concepts to capture the +hierarchy, where leaf nodes and other nodes correspond to users/items and +clusters, respectively. Central to our approach, called hierarchical +embeddings, is the additional decomposition of the user and item latent +matrices (embeddings) into probabilistic connection matrices, which link the +hierarchy, and a root cluster latent matrix. Thus, each node is represented by +the weighted average of the embeddings of its parent clusters. The embeddings +are differentiable, allowing simultaneous learning of interactions and +clustering using a single gradient descent method. Furthermore, the obtained +cluster-specific interactions naturally summarize user-item interactions and +provide interpretability. Experimental results on rating and ranking +predictions demonstrated the competitiveness of HMF over vanilla and +hierarchical MF methods, especially its robustness in sparse interactions. +Additionally, it was confirmed that the clustering integration of HMF has the +potential for faster learning convergence and mitigation of overfitting +compared to MF, and also provides interpretability through a cluster-centered +case study. + +
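A hedged sketch of the central idea with a single level of clusters: each user/item embedding is a probability-weighted average of cluster embeddings, and everything stays differentiable so the interactions and the soft clustering can be learned jointly with plain gradient descent. The sizes and the softmax parameterization of the connection matrices are illustrative assumptions, not the paper's exact formulation.

```python
import torch
import torch.nn.functional as F

n_users, n_items, n_clusters, d = 100, 50, 8, 16

# Root cluster latent matrix and probabilistic connection logits (all learnable).
cluster_emb = torch.randn(n_clusters, d, requires_grad=True)
user_conn_logits = torch.randn(n_users, n_clusters, requires_grad=True)
item_conn_logits = torch.randn(n_items, n_clusters, requires_grad=True)

def node_embeddings(conn_logits):
    # Rows of the connection matrix are probabilities, so each user/item embedding
    # is a weighted average of its parent cluster embeddings.
    return F.softmax(conn_logits, dim=-1) @ cluster_emb

user_emb = node_embeddings(user_conn_logits)      # (n_users, d)
item_emb = node_embeddings(item_conn_logits)      # (n_items, d)
pred_ratings = user_emb @ item_emb.T              # (n_users, n_items)

# Everything above is differentiable, so interactions and the (soft) clustering
# can be trained together by minimizing a rating loss.
target = torch.randn(n_users, n_items)            # placeholder rating matrix
loss = F.mse_loss(pred_ratings, target)
loss.backward()
```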
+
+
+
+
+ + ☆ GENET: Unleashing the Power of Side Information for Recommendation via + Hypergraph Pre-training + + +
+ Recommendation with side information has drawn significant research interest +due to its potential to mitigate user feedback sparsity. However, existing +models struggle with generalization across diverse domains and types of side +information. In particular, three challenges have not been addressed, and they +are (1) the diverse formats of side information, including text sequences. (2) +The diverse semantics of side information that describes items and users from +multi-level in a context different from recommendation systems. (3) The diverse +correlations in side information to measure similarity over multiple objects +beyond pairwise relations. In this paper, we introduce GENET (Generalized +hypErgraph pretraiNing on sidE informaTion), which pre-trains user and item +representations on feedback-irrelevant side information and fine-tunes the +representations on user feedback data. GENET leverages pre-training as a means +to prevent side information from overshadowing critical ID features and +feedback signals. It employs a hypergraph framework to accommodate various +types of diverse side information. During pre-training, GENET integrates tasks +for hyperlink prediction and self-supervised contrast to capture fine-grained +semantics at both local and global levels. Additionally, it introduces a unique +strategy to enhance pre-training robustness by perturbing positive samples +while maintaining high-order relations. Extensive experiments demonstrate that +GENET exhibits strong generalization capabilities, outperforming the SOTA +method by up to 38% in TOP-N recommendation and Sequential recommendation tasks +on various datasets with different side information. + +
+
+
+
+
+ + ☆ Physics-driven generative adversarial networks empower single-pixel + infrared hyperspectral imaging + + +
+ A physics-driven generative adversarial network (GAN) was established here +for single-pixel hyperspectral imaging (HSI) in the infrared spectrum, to +eliminate the extensive data training work required by traditional data-driven +model. Within the GAN framework, the physical process of single-pixel imaging +(SPI) was integrated into the generator, and the actual and estimated +one-dimensional (1D) bucket signals were employed as constraints in the +objective function to update the network's parameters and optimize the +generator with the assistance of the discriminator. In comparison to +single-pixel infrared HSI methods based on compressed sensing and +physics-driven convolution neural networks, our physics-driven GAN-based +single-pixel infrared HSI can achieve higher imaging performance but with fewer +measurements. We believe that this physics-driven GAN will promote practical +applications of computational imaging, especially various SPI-based techniques. + +
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+
+
+
+ + Machine Learning 132 + +
+
+
+ + ☆ Visual In-Context Prompting + + +
+ In-context prompting in large language models (LLMs) has become a prevalent +approach to improve zero-shot capabilities, but this idea is less explored in +the vision domain. Existing visual prompting methods focus on referring +segmentation to segment the most relevant object, falling short of addressing +many generic vision tasks like open-set segmentation and detection. In this +paper, we introduce a universal visual in-context prompting framework for both +tasks. In particular, we build on top of an encoder-decoder architecture, and +develop a versatile prompt encoder to support a variety of prompts like +strokes, boxes, and points. We further enhance it to take an arbitrary number +of reference image segments as the context. Our extensive explorations show +that the proposed visual in-context prompting elicits extraordinary referring +and generic segmentation capabilities to refer and detect, yielding competitive +performance to close-set in-domain datasets and showing promising results on +many open-set segmentation datasets. By joint training on COCO and SA-1B, our +model achieves $57.7$ PQ on COCO and $23.2$ PQ on ADE20K. Code will be +available at https://github.com/UX-Decoder/DINOv. + +
+
+ comment: technical report +
+
+
+
+
+ + ☆ ZipLoRA: Any Subject in Any Style by Effectively Merging LoRAs + + +
+ Methods for finetuning generative models for concept-driven personalization +generally achieve strong results for subject-driven or style-driven generation. +Recently, low-rank adaptations (LoRA) have been proposed as a +parameter-efficient way of achieving concept-driven personalization. While +recent work explores the combination of separate LoRAs to achieve joint +generation of learned styles and subjects, existing techniques do not reliably +address the problem; they often compromise either subject fidelity or style +fidelity. We propose ZipLoRA, a method to cheaply and effectively merge +independently trained style and subject LoRAs in order to achieve generation of +any user-provided subject in any user-provided style. Experiments on a wide +range of subject and style combinations show that ZipLoRA can generate +compelling results with meaningful improvements over baselines in subject and +style fidelity while preserving the ability to recontextualize. Project page: +https://ziplora.github.io + +
+
+ comment: Project page: https://ziplora.github.io +
+
+
+
+
+ + ☆ Covariance alignment: from maximum likelihood estimation to + Gromov-Wasserstein + + +
+ Feature alignment methods are used in many scientific disciplines for data +pooling, annotation, and comparison. As an instance of a permutation learning +problem, feature alignment presents significant statistical and computational +challenges. In this work, we propose the covariance alignment model to study +and compare various alignment methods and establish a minimax lower bound for +covariance alignment that has a non-standard dimension scaling because of the +presence of a nuisance parameter. This lower bound is in fact minimax optimal +and is achieved by a natural quasi MLE. However, this estimator involves a +search over all permutations which is computationally infeasible even when the +problem has moderate size. To overcome this limitation, we show that the +celebrated Gromov-Wasserstein algorithm from optimal transport which is more +amenable to fast implementation even on large-scale problems is also minimax +optimal. These results give the first statistical justification for the +deployment of the Gromov-Wasserstein algorithm in practice. + +
+
+ comment: 41 pages, 2 figures +
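For intuition, the sketch below uses the POT library's Gromov-Wasserstein solver to align the features of two datasets through their covariance structure. The synthetic data, the uniform feature weights, and the way a hard alignment is read off from the coupling are illustrative assumptions and not the paper's experimental setup.

```python
import numpy as np
import ot  # POT: Python Optimal Transport

rng = np.random.default_rng(0)
d, n = 6, 500
X = rng.normal(size=(n, d))
perm = rng.permutation(d)
Y = (X + 0.05 * rng.normal(size=(n, d)))[:, perm]   # same features, permuted + noise

# Intra-dataset structure: sample covariance matrices of the features.
C1 = np.cov(X, rowvar=False)
C2 = np.cov(Y, rowvar=False)

p = np.full(d, 1 / d)   # uniform weights over the features of X
q = np.full(d, 1 / d)   # uniform weights over the features of Y

# Gromov-Wasserstein coupling between the two sets of features; a hard alignment
# can be read off row-wise and should approximately recover the permutation.
T = ot.gromov.gromov_wasserstein(C1, C2, p, q, "square_loss")
print("ground-truth map:", np.argsort(perm))   # feature i of X sits at this column of Y
print("recovered map   :", T.argmax(axis=1))
```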
+
+
+
+
+ + ☆ Labeling Neural Representations with Inverse Recognition + + +
+ Deep Neural Networks (DNNs) demonstrated remarkable capabilities in learning +complex hierarchical data representations, but the nature of these +representations remains largely unknown. Existing global explainability +methods, such as Network Dissection, face limitations such as reliance on +segmentation masks, lack of statistical significance testing, and high +computational demands. We propose Inverse Recognition (INVERT), a scalable +approach for connecting learned representations with human-understandable +concepts by leveraging their capacity to discriminate between these concepts. +In contrast to prior work, INVERT is capable of handling diverse types of +neurons, exhibits less computational complexity, and does not rely on the +availability of segmentation masks. Moreover, INVERT provides an interpretable +metric assessing the alignment between the representation and its corresponding +explanation and delivering a measure of statistical significance, emphasizing +its utility and credibility. We demonstrate the applicability of INVERT in +various scenarios, including the identification of representations affected by +spurious correlations, and the interpretation of the hierarchical structure of +decision-making within the models. + +
+
+ comment: 24 pages, 16 figures +
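One simple way to score how well a single neuron discriminates a concept, in the spirit described above, is the AUROC of its activations against binary concept labels: 0.5 means no alignment, values near 1 indicate strong alignment. The data below is a synthetic stand-in for a probing dataset, and the compositional search over concepts performed by INVERT is not reproduced.

```python
import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
n_images, n_neurons = 1000, 4

# Stand-ins: per-image activations of a few neurons and a binary concept label
# (e.g. "contains a dog"). In practice these come from a probing dataset.
concept = rng.integers(0, 2, n_images)
activations = rng.normal(size=(n_images, n_neurons))
activations[:, 2] += 1.5 * concept        # make neuron 2 genuinely concept-selective

# Score each neuron by how well its activation discriminates the concept.
scores = [roc_auc_score(concept, activations[:, j]) for j in range(n_neurons)]
best = int(np.argmax(scores))
print({f"neuron_{j}": round(s, 3) for j, s in enumerate(scores)}, "best:", best)
```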
+
+
+
+
+ + ☆ Risk-sensitive Markov Decision Process and Learning under General + Utility Functions + + +
+ Reinforcement Learning (RL) has gained substantial attention across diverse +application domains and theoretical investigations. Existing literature on RL +theory largely focuses on risk-neutral settings where the decision-maker learns +to maximize the expected cumulative reward. However, in practical scenarios +such as portfolio management and e-commerce recommendations, decision-makers +often hold heterogeneous risk preferences subject to outcome +uncertainties, which cannot be well captured by the risk-neutral framework. +Incorporating these preferences can be approached through utility theory, yet +the development of risk-sensitive RL under general utility functions remains an +open question for theoretical exploration. + In this paper, we consider a scenario where the decision-maker seeks to +optimize a general utility function of the cumulative reward in the framework +of a Markov decision process (MDP). To facilitate the Dynamic Programming +Principle and Bellman equation, we enlarge the state space with an additional +dimension that accounts for the cumulative reward. We propose a discretized +approximation scheme to the MDP under the enlarged state space, which is +tractable and key for algorithmic design. We then propose a modified value +iteration algorithm that employs an epsilon-covering over the space of +cumulative reward. When a simulator is accessible, our algorithm efficiently +learns a near-optimal policy with guaranteed sample complexity. In the absence +of a simulator, our algorithm, designed with an upper-confidence-bound +exploration approach, identifies a near-optimal policy while ensuring a +guaranteed regret bound. For both algorithms, we match the theoretical lower +bounds for the risk-neutral setting. +
+
+ comment: 36 pages +
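A simplified finite-horizon sketch of value iteration on the reward-augmented state space, with the cumulative reward discretized on a grid (an epsilon-covering). The paper's algorithms additionally handle sample-based learning, confidence-bound exploration, and regret analysis, all of which this sketch omits.

```python
import numpy as np

def risk_sensitive_value_iteration(P, R, utility, horizon, w_grid):
    """Finite-horizon value iteration on the augmented state (s, w), where w is
    the cumulative reward collected so far, discretized on `w_grid`.
    P has shape (S, A, S); R has shape (S, A)."""
    S, A, _ = P.shape
    W = len(w_grid)
    V = np.tile(utility(w_grid)[None, :], (S, 1))   # terminal value: utility of total reward
    for _ in range(horizon):
        Q = np.empty((S, W, A))
        for s in range(S):
            for wi, w in enumerate(w_grid):
                for a in range(A):
                    # snap the updated cumulative reward back onto the grid
                    w_next = np.clip(w + R[s, a], w_grid[0], w_grid[-1])
                    wj = int(np.abs(w_grid - w_next).argmin())
                    Q[s, wi, a] = P[s, a] @ V[:, wj]
        V = Q.max(axis=2)
    return V   # V[s, 0] approximates the optimal utility starting from s with w = 0

# Tiny example: 2 states, 2 actions, exponential-style (risk-averse) utility.
rng = np.random.default_rng(0)
P = rng.dirichlet(np.ones(2), size=(2, 2))          # transition kernel, shape (S, A, S)
R = rng.random((2, 2))
w_grid = np.linspace(0.0, 5.0, 51)
V = risk_sensitive_value_iteration(P, R, lambda w: 1 - np.exp(-w), horizon=5, w_grid=w_grid)
print(V[:, 0])
```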
+
+
+
+
+ + ☆ A Survey of Serverless Machine Learning Model Inference + + +
+ Recent developments in Generative AI, Computer Vision, and Natural Language +Processing have led to an increased integration of AI models into various +products. This widespread adoption of AI requires significant efforts in +deploying these models in production environments. When hosting machine +learning models for real-time predictions, it is important to meet defined +Service Level Objectives (SLOs), ensuring reliability, minimal downtime, and +optimizing operational costs of the underlying infrastructure. Large machine +learning models often demand GPU resources for efficient inference to meet +SLOs. In the context of these trends, there is growing interest in hosting AI +models in a serverless architecture while still providing GPU access for +inference tasks. This survey aims to summarize and categorize the emerging +challenges and optimization opportunities for large-scale deep learning serving +systems. By providing a novel taxonomy and summarizing recent trends, we hope +that this survey could shed light on new optimization perspectives and motivate +novel works in large-scale deep learning serving systems. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ On diffusion-based generative models and their error bounds: The + log-concave case with full convergence estimates + + +
+ We provide full theoretical guarantees for the convergence behaviour of +diffusion-based generative models under the assumption of strongly logconcave +data distributions, while our approximating class of functions used for score +estimation consists of Lipschitz continuous functions. We demonstrate the power +of our approach via a motivating example: sampling from a Gaussian distribution +with unknown mean. In this case, explicit estimates are provided +for the associated optimization problem, i.e. score approximation, while these +are combined with the corresponding sampling estimates. As a result, we obtain +the best known upper bound estimates in terms of key quantities of interest, +such as the dimension and rates of convergence, for the Wasserstein-2 distance +between the data distribution (Gaussian with unknown mean) and our sampling +algorithm. + Beyond the motivating example and in order to allow for the use of a diverse +range of stochastic optimizers, we present our results using an $L^2$-accurate +score estimation assumption, which crucially is formed under an expectation +with respect to the stochastic optimizer and our novel auxiliary process that +uses only known information. This approach yields the best known convergence +rate for our sampling algorithm. +
+
+
+
+
+ + ☆ Adaptive Sampling for Deep Learning via Efficient Nonparametric Proxies + + +
+ Data sampling is an effective method to improve the training speed of neural +networks, with recent results demonstrating that it can even break the neural +scaling laws. These results critically rely on high-quality scores to estimate +the importance of an input to the network. We observe that there are two +dominant strategies: static sampling, where the scores are determined before +training, and dynamic sampling, where the scores can depend on the model +weights. Static algorithms are computationally inexpensive but less effective +than their dynamic counterparts, which can cause end-to-end slowdown due to +their need to explicitly compute losses. To address this problem, we propose a +novel sampling distribution based on nonparametric kernel regression that +learns an effective importance score as the neural network trains. However, +nonparametric regression models are too computationally expensive to accelerate +end-to-end training. Therefore, we develop an efficient sketch-based +approximation to the Nadaraya-Watson estimator. Using recent techniques from +high-dimensional statistics and randomized algorithms, we prove that our +Nadaraya-Watson sketch approximates the estimator with exponential convergence +guarantees. Our sampling algorithm outperforms the baseline in terms of +wall-clock time and accuracy on four datasets. + +
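As a rough illustration of the idea, the sketch below uses a plain Nadaraya-Watson estimator with a Gaussian kernel to predict importance scores (estimated losses) for candidate examples and turn them into sampling probabilities. The paper's sketch-based approximation of the estimator is not reproduced, and all data here are placeholders.

```python
import numpy as np

def nadaraya_watson(x_query, X, y, bandwidth=1.0):
    """Plain Nadaraya-Watson kernel regression with a Gaussian kernel.
    Predicts a score (e.g. an estimated loss) for each query point."""
    d2 = ((x_query[:, None, :] - X[None, :, :]) ** 2).sum(-1)   # squared distances
    w = np.exp(-d2 / (2 * bandwidth ** 2))
    return (w @ y) / w.sum(axis=1).clip(min=1e-12)

rng = np.random.default_rng(0)
feats = rng.normal(size=(512, 16))          # embeddings of recently seen examples
losses = rng.random(512)                    # their observed training losses
candidates = rng.normal(size=(64, 16))      # pool of candidate examples to sample from

scores = nadaraya_watson(candidates, feats, losses)
probs = scores / scores.sum()               # turn estimated losses into sampling probabilities
batch = rng.choice(len(candidates), size=8, replace=False, p=probs)
print(batch)
```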
+
+
+
+
+ + ☆ $σ$-PCA: a unified neural model for linear and nonlinear principal + component analysis + + +
+ Linear principal component analysis (PCA), nonlinear PCA, and linear +independent component analysis (ICA) -- those are three methods with +single-layer autoencoder formulations for learning linear transformations from +data. Linear PCA learns orthogonal transformations (rotations) that orient axes +to maximise variance, but it suffers from a subspace rotational indeterminacy: +it fails to find a unique rotation for axes that share the same variance. Both +nonlinear PCA and linear ICA reduce the subspace indeterminacy from rotational +to permutational by maximising statistical independence under the assumption of +unit variance. The main difference between them is that nonlinear PCA only +learns rotations while linear ICA learns not just rotations but any linear +transformation with unit variance. The relationship between all three can be +understood by the singular value decomposition of the linear ICA transformation +into a sequence of rotation, scale, rotation. Linear PCA learns the first +rotation; nonlinear PCA learns the second. The scale is simply the inverse of +the standard deviations. The problem is that, in contrast to linear PCA, +conventional nonlinear PCA cannot be used directly on the data to learn the +first rotation, the first being special as it reduces dimensionality and orders +by variances. In this paper, we have identified the cause, and as a solution we +propose $\sigma$-PCA: a unified neural model for linear and nonlinear PCA as +single-layer autoencoders. One of its key ingredients: modelling not just the +rotation but also the scale -- the variances. This model bridges the disparity +between linear and nonlinear PCA. And so, like linear PCA, it can learn a +semi-orthogonal transformation that reduces dimensionality and orders by +variances, but, unlike linear PCA, it does not suffer from rotational +indeterminacy. + +
+
+
+
+
+ + ☆ A Unified Framework for Trace-induced Quantum Kernels + + +
+ Quantum kernel methods are promising candidates for achieving a practical +quantum advantage for certain machine learning tasks. Similar to classical +machine learning, an exact form of a quantum kernel is expected to have a great +impact on the model performance. In this work we combine all trace-induced +quantum kernels, including the commonly-used global fidelity and local +projected quantum kernels, into a common framework. We show how generalized +trace-induced quantum kernels can be constructed as combinations of the +fundamental building blocks we coin "Lego" kernels, which impose an inductive +bias on the resulting quantum models. We relate the expressive power and +generalization ability to the number of non-zero weight Lego kernels and +propose a systematic approach to increase the complexity of a quantum kernel +model, leading to a new form of the local projected kernels that require fewer +quantum resources in terms of the number of quantum gates and measurement +shots. We show numerically that models based on local projected kernels can +achieve comparable performance to the global fidelity quantum kernel. Our work +unifies existing quantum kernels and provides a systematic framework to compare +their properties. + +
+
+ comment: 12 + 15 pages, 5 figures +
+
+
+
+
+ + ☆ Efficient Numerical Integration in Reproducing Kernel Hilbert Spaces via + Leverage Scores Sampling + + +
+ In this work we consider the problem of numerical integration, i.e., +approximating integrals with respect to a target probability measure using only +pointwise evaluations of the integrand. We focus on the setting in which the +target distribution is only accessible through a set of $n$ i.i.d. +observations, and the integrand belongs to a reproducing kernel Hilbert space. +We propose an efficient procedure which exploits a small i.i.d. random subset +of $m +
+
+ comment: 46 pages, 5 figures. Submitted to JMLR +
+
+
+
+
+ + ☆ Linear Log-Normal Attention with Unbiased Concentration ICLR2024 + + +
+ Transformer models have achieved remarkable results in a wide range of +applications. However, their scalability is hampered by the quadratic time and +memory complexity of the self-attention mechanism concerning the sequence +length. This limitation poses a substantial obstacle when dealing with long +documents or high-resolution images. In this work, we study the self-attention +mechanism by analyzing the distribution of the attention matrix and its +concentration ability. Furthermore, we propose instruments to measure these +quantities and introduce a novel self-attention mechanism, Linear Log-Normal +Attention, designed to emulate the distribution and concentration behavior of +the original self-attention. Our experimental results on popular natural +language benchmarks reveal that our proposed Linear Log-Normal Attention +outperforms other linearized attention alternatives, offering a promising +avenue for enhancing the scalability of transformer models. Our code is +available in supplementary materials. + +
+
+ comment: 22 pages, 20 figures, 5 tables, submitted to ICLR2024 +
+
+
+
+
+ + ☆ Learned Nonlinear Predictor for Critically Sampled 3D Point Cloud + Attribute Compression + + +
+ We study 3D point cloud attribute compression via a volumetric approach: +assuming point cloud geometry is known at both encoder and decoder, parameters +$\theta$ of a continuous attribute function $f: \mathbb{R}^3 \mapsto +\mathbb{R}$ are quantized to $\hat{\theta}$ and encoded, so that discrete +samples $f_{\hat{\theta}}(\mathbf{x}_i)$ can be recovered at known 3D points +$\mathbf{x}_i \in \mathbb{R}^3$ at the decoder. Specifically, we consider a +nested sequences of function subspaces $\mathcal{F}^{(p)}_{l_0} \subseteq +\cdots \subseteq \mathcal{F}^{(p)}_L$, where $\mathcal{F}_l^{(p)}$ is a family +of functions spanned by B-spline basis functions of order $p$, $f_l^*$ is the +projection of $f$ on $\mathcal{F}_l^{(p)}$ and encoded as low-pass coefficients +$F_l^*$, and $g_l^*$ is the residual function in orthogonal subspace +$\mathcal{G}_l^{(p)}$ (where $\mathcal{G}_l^{(p)} \oplus \mathcal{F}_l^{(p)} = +\mathcal{F}_{l+1}^{(p)}$) and encoded as high-pass coefficients $G_l^*$. In +this paper, to improve coding performance over [1], we study predicting +$f_{l+1}^*$ at level $l+1$ given $f_l^*$ at level $l$ and encoding of $G_l^*$ +for the $p=1$ case (RAHT($1$)). For the prediction, we formalize RAHT(1) linear +prediction in MPEG-PCC in a theoretical framework, and propose a new nonlinear +predictor using a polynomial of bilateral filter. We derive equations to +efficiently compute the critically sampled high-pass coefficients $G_l^*$ +amenable to encoding. We optimize parameters in our resulting feed-forward +network on a large training set of point clouds by minimizing a rate-distortion +Lagrangian. Experimental results show that our improved framework outperformed +the MPEG G-PCC predictor by $11$ to $12\%$ in bit rate reduction. + +
+
+
+
+
+ + ☆ Speak Like a Native: Prompting Large Language Models in a Native Style + + +
+ Existing work has found that prompt engineering heavily influences the +performance of large language models (LLMs). Chain-of-thought (CoT), a +popular prompt engineering technique, prompts LLMs using in-context examples +with reasoning steps. In current studies, the few-shot examples of CoT are +generally handcrafted by humans. However, how the text style of in-context +examples influences the outputs of LLMs remains under-explored. This paper +presents a novel and effective approach, named \textbf{AlignCoT}, to improve +the reasoning capability of LLMs by aligning the in-context examples with the +native style of LLMs. ``Native'' refers to the inherent characteristic style of +LLMs, which can be probed in original zero-shot scenarios. AlignCoT is +orthogonal to other prompt engineering methods, making it easy to combine with +state-of-the-art techniques to further improve the LLMs' performance. We +conduct extensive and comprehensive experiments on several benchmarks. The +empirical results demonstrate that our AlignCoT significantly improves +performance over the carefully handcrafted in-context examples. For instance, +with GPT-3.5-turbo, we observed a +2.5\% improvement on GSM8K. Furthermore, our +AlignCoT consistently improves performance when combined with other +state-of-the-art prompt engineering methods. The source code and dataset will +be available at +\href{https://github.com/yangzhch6/AlignCoT}{https://github.com/yangzhch6/AlignCoT}.
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Leveraging CNNs and Ensemble Learning for Automated Disaster Image + Classification SC + + +
+ Natural disasters act as a serious threat globally, requiring effective and +efficient disaster management and recovery. This paper focuses on classifying +natural disaster images using Convolutional Neural Networks (CNNs). Multiple +CNN architectures were built and trained on a dataset containing images of +earthquakes, floods, wildfires, and volcanoes. A stacked CNN ensemble approach +proved to be the most effective, achieving 95% accuracy and an F1 score going +up to 0.96 for individual classes. Tuning hyperparameters of individual models +for optimization was critical to maximize the models' performance. The stacking +of CNNs with XGBoost acting as the meta-model utilizes the strengths of the CNN +and ResNet models to improve the overall accuracy of the classification. +Results obtained from the models illustrated the potency of CNN-based models +for automated disaster image classification. This lays the foundation for +expanding these techniques to build robust systems for disaster response, +damage assessment, and recovery management. + +
+
+ comment: 13 pages, 11 figures, 4 tables, ICSISCET 2023 Conference +
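The stacking step can be sketched as follows: class probabilities produced by the base CNNs become the input features of an XGBoost meta-model. The random arrays stand in for the actual CNN/ResNet out-of-fold predictions, and the hyperparameters are illustrative assumptions rather than the tuned values reported in the paper.

```python
import numpy as np
from xgboost import XGBClassifier

rng = np.random.default_rng(0)
n_val, n_classes = 400, 4     # earthquake, flood, wildfire, volcano

# Out-of-fold class-probability predictions from two base models (stand-ins here;
# in practice these come from the trained CNN and ResNet models).
cnn_probs = rng.dirichlet(np.ones(n_classes), size=n_val)
resnet_probs = rng.dirichlet(np.ones(n_classes), size=n_val)
y_val = rng.integers(0, n_classes, n_val)

# Stacking: concatenate base-model probabilities and fit XGBoost as the meta-model.
meta_X = np.hstack([cnn_probs, resnet_probs])
meta_model = XGBClassifier(n_estimators=100, max_depth=3, eval_metric="mlogloss")
meta_model.fit(meta_X, y_val)

# At inference time, run an image through both base models and feed their
# probabilities to the meta-model for the final class decision.
final_pred = meta_model.predict(meta_X[:5])
print(final_pred)
```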
+
+
+
+
+ + ☆ Naturalness of Attention: Revisiting Attention in Code Language Models ICSE + + +
+ Language models for code such as CodeBERT offer the capability to learn +advanced source code representation, but their opacity poses barriers to +understanding of captured properties. Recent attention analysis studies provide +initial interpretability insights by focusing solely on attention weights +rather than considering the wider context modeling of Transformers. This study +aims to shed some light on the previously ignored factors of the attention +mechanism beyond the attention weights. We conduct an initial empirical study +analyzing both attention distributions and transformed representations in +CodeBERT. Across two programming languages, Java and Python, we find that the +scaled transformation norms of the input better capture syntactic structure +compared to attention weights alone. Our analysis reveals characterization of +how CodeBERT embeds syntactic code properties. The findings demonstrate the +importance of incorporating factors beyond just attention weights for +rigorously understanding neural code models. This lays the groundwork for +developing more interpretable models and effective uses of attention mechanisms +in program analysis. + +
+
+ comment: Accepted at ICSE-NIER (2024) track +
+
+
+
+
+ + ☆ Applying Dimensionality Reduction as Precursor to LSTM-CNN Models for + Classifying Imagery and Motor Signals in ECoG-Based BCIs + + +
+ Motor impairments, frequently caused by neurological incidents like strokes +or traumatic brain injuries, present substantial obstacles in rehabilitation +therapy. This research aims to elevate the field by optimizing motor imagery +classification algorithms within Brain-Computer Interfaces (BCIs). By improving +the efficiency of BCIs, we offer a novel approach that holds significant +promise for enhancing motor rehabilitation outcomes. Utilizing unsupervised +techniques for dimensionality reduction, namely Uniform Manifold Approximation +and Projection (UMAP) coupled with K-Nearest Neighbors (KNN), we evaluate the +necessity of employing supervised methods such as Long Short-Term Memory (LSTM) +and Convolutional Neural Networks (CNNs) for classification tasks. Importantly, +participants who exhibited high KNN scores following UMAP dimensionality +reduction also achieved high accuracy in supervised deep learning (DL) models. +Due to individualized model requirements and massive neural training data, +dimensionality reduction becomes an effective preprocessing step that minimizes +the need for extensive data labeling and supervised deep learning techniques. +This approach has significant implications not only for targeted therapies in +motor dysfunction but also for addressing regulatory, safety, and reliability +concerns in the rapidly evolving BCI field. + +
+
+ comment: 10 Pages, 12 Figures. The dataset used in this paper can be found + here: https://osf.io/ksqv8/download, from the Miller 2010 paper. All code + used in this research can be found at + https://github.com/bafanaS/dim-reduction-with-cnn-lstm.git +
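A minimal sketch of the unsupervised pipeline described above using umap-learn and scikit-learn: reduce the neural features with UMAP, then check how well a simple KNN classifier separates the classes in the low-dimensional space. The array shapes, label semantics, and hyperparameters are illustrative assumptions rather than the study's actual preprocessing.

```python
import numpy as np
import umap                                   # pip install umap-learn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 1024))              # flattened ECoG features per trial (stand-in)
y = rng.integers(0, 2, 300)                   # motor vs. imagery label (stand-in)

# Unsupervised dimensionality reduction, then a simple KNN classifier; a high
# cross-validated KNN score suggests the low-dimensional structure is already
# class-separable, reducing the need for heavier supervised deep models.
embedding = umap.UMAP(n_neighbors=15, n_components=2, random_state=0).fit_transform(X)
scores = cross_val_score(KNeighborsClassifier(n_neighbors=5), embedding, y, cv=5)
print("KNN accuracy after UMAP:", scores.mean())
```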
+
+
+
+
+ + ☆ Bitformer: An efficient Transformer with bitwise operation-based + attention for Big Data Analytics at low-cost low-precision devices + + +
+ In the current landscape of large models, the Transformer stands as a +cornerstone, playing a pivotal role in shaping the trajectory of modern models. +However, its application encounters challenges attributed to the substantial +computational intricacies intrinsic to its attention mechanism. Moreover, its +reliance on high-precision floating-point operations presents specific hurdles, +particularly evident in computation-intensive scenarios such as edge computing +environments. These environments, characterized by resource-constrained devices +and a preference for lower precision, necessitate innovative solutions. + To tackle the exacting data processing demands posed by edge devices, we +introduce the Bitformer model, an inventive extension of the Transformer +paradigm. Central to this innovation is a novel attention mechanism that +adeptly replaces conventional floating-point matrix multiplication with bitwise +operations. This strategic substitution yields dual advantages. Not only does +it maintain the attention mechanism's prowess in capturing intricate long-range +information dependencies, but it also orchestrates a profound reduction in the +computational complexity inherent in the attention operation. The transition +from an $O(n^2d)$ complexity, typical of floating-point operations, to an +$O(n^2T)$ complexity characterizing bitwise operations, substantiates this +advantage. Notably, in this context, the parameter $T$ remains markedly smaller +than the conventional dimensionality parameter $d$. + The Bitformer model in essence endeavors to reconcile the indomitable +requirements of modern computing landscapes with the constraints posed by edge +computing scenarios. By forging this innovative path, we bridge the gap between +high-performing models and resource-scarce environments, thus unveiling a +promising trajectory for further advancements in the field. + +
+
+
+
+
+ + ☆ Current Topological and Machine Learning Applications for Bias Detection + in Text + + +
+ Institutional bias can impact patient outcomes, educational attainment, and +legal system navigation. Written records often reflect bias, and once bias is +identified, it is possible to refer individuals for training to reduce it. +Many machine learning tools exist to explore text data and create predictive +models that can search written records to identify real-time bias. However, few +previous studies investigate large language model embeddings and geometric +models of biased text data to understand geometry's impact on bias modeling +accuracy. To overcome this issue, this study utilizes the RedditBias database +to analyze textual biases. Four transformer models, including BERT and RoBERTa +variants, were explored. Post-embedding, t-SNE allowed two-dimensional +visualization of data. KNN classifiers differentiated bias types, with lower +k-values proving more effective. Findings suggest BERT, particularly mini BERT, +excels in bias classification, while multilingual models lag. The +recommendation emphasizes refining monolingual models and exploring +domain-specific biases. +
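A hedged sketch of the pipeline just described: transformer sentence embeddings, a two-dimensional t-SNE projection, and a low-k KNN classifier. The sentences are neutral placeholders for RedditBias text, and the mean pooling and hyperparameters are assumptions rather than the study's exact configuration.

```python
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier

sentences = [
    "placeholder sentence for bias type A",
    "another placeholder sentence for bias type A",
    "a third placeholder sentence for bias type A",
    "placeholder sentence for bias type B",
    "another placeholder sentence for bias type B",
    "a third placeholder sentence for bias type B",
]
labels = [0, 0, 0, 1, 1, 1]

name = "bert-base-uncased"                # one of the four explored transformer variants
tok = AutoTokenizer.from_pretrained(name)
model = AutoModel.from_pretrained(name)

with torch.no_grad():
    batch = tok(sentences, padding=True, truncation=True, return_tensors="pt")
    hidden = model(**batch).last_hidden_state          # (batch, seq, dim)
    embeddings = hidden.mean(dim=1).numpy()            # mean-pooled sentence embeddings

# Two-dimensional t-SNE visualisation of the embeddings, then a low-k KNN classifier.
coords = TSNE(n_components=2, perplexity=2).fit_transform(embeddings)
knn = KNeighborsClassifier(n_neighbors=1).fit(coords, labels)
print(knn.predict(coords))
```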
+
+
+
+
+ + ☆ Grad-Shafranov equilibria via data-free physics informed neural networks + + +
+ A large number of magnetohydrodynamic (MHD) equilibrium calculations are +often required for uncertainty quantification, optimization, and real-time +diagnostic information, making MHD equilibrium codes vital to the field of +plasma physics. In this paper, we explore a method for solving the +Grad-Shafranov equation by using Physics-Informed Neural Networks (PINNs). For +PINNs, we optimize neural networks by directly minimizing the residual of the +PDE as a loss function. We show that PINNs can accurately and effectively solve +the Grad-Shafranov equation with several different boundary conditions. We also +explore the parameter space by varying the size of the model, the learning +rate, and boundary conditions to map various trade-offs such as between +reconstruction error and computational speed. Additionally, we introduce a +parameterized PINN framework, expanding the input space to include variables +such as pressure, aspect ratio, elongation, and triangularity in order to +handle a broader range of plasma scenarios within a single network. +Parametrized PINNs could be used in future work to solve inverse problems such +as shape optimization. + +
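To make the approach concrete, here is a minimal PyTorch sketch of a PINN residual for the Grad-Shafranov operator with a simple Solov'ev-type right-hand side. The profile, the rectangular collocation domain, and the omission of the boundary-condition loss are simplifying assumptions for illustration, not the paper's setup.

```python
import torch

# PINN residual for the Grad-Shafranov equation
#   R d/dR( (1/R) dpsi/dR ) + d^2 psi / dZ^2 = RHS(R, Z, psi),
# here with an assumed Solov'ev-type right-hand side RHS = A*R^2 + B.
A, B = 1.0, 1.0
net = torch.nn.Sequential(
    torch.nn.Linear(2, 64), torch.nn.Tanh(),
    torch.nn.Linear(64, 64), torch.nn.Tanh(),
    torch.nn.Linear(64, 1),
)

def gs_residual(R, Z):
    psi = net(torch.stack([R, Z], dim=-1)).squeeze(-1)
    ones = torch.ones_like(psi)
    dR, dZ = torch.autograd.grad(psi, (R, Z), grad_outputs=ones, create_graph=True)
    dRR = torch.autograd.grad(dR, R, grad_outputs=torch.ones_like(dR), create_graph=True)[0]
    dZZ = torch.autograd.grad(dZ, Z, grad_outputs=torch.ones_like(dZ), create_graph=True)[0]
    # R d/dR((1/R) dpsi/dR) expands to d2psi/dR2 - (1/R) dpsi/dR
    return dRR - dR / R + dZZ - (A * R ** 2 + B)

opt = torch.optim.Adam(net.parameters(), lr=1e-3)
for step in range(200):                       # interior (PDE) loss only, for brevity
    R = torch.rand(256) * 1.0 + 0.5           # collocation points, R in [0.5, 1.5]
    Z = torch.rand(256) * 2.0 - 1.0           # Z in [-1, 1]
    R.requires_grad_(True)
    Z.requires_grad_(True)
    loss = gs_residual(R, Z).pow(2).mean()    # boundary-condition loss omitted here
    opt.zero_grad()
    loss.backward()
    opt.step()
```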
+
+
+
+
+ + ☆ Benchmarking Toxic Molecule Classification using Graph Neural Networks + and Few Shot Learning + + +
+ Traditional methods like Graph Convolutional Networks (GCNs) face challenges +with limited data and class imbalance, leading to suboptimal performance in +graph classification tasks during toxicity prediction of molecules as a whole. +To address these issues, we harness the power of Graph Isomorphic Networks, +Multi Headed Attention and Free Large-scale Adversarial Augmentation separately +on Graphs for precisely capturing the structural data of molecules and their +toxicological properties. Additionally, we incorporate Few-Shot Learning to +improve the model's generalization with limited annotated samples. Extensive +experiments on a diverse toxicology dataset demonstrate that our method +achieves an impressive state-of-the-art AUC-ROC value of 0.816, surpassing the +baseline GCN model by 11.4%. This highlights the significance of our proposed +methodology and Few Shot Learning in advancing Toxic Molecular Classification, +with the potential to enhance drug discovery and environmental risk assessment +processes. +
+
+
+
+
+ + ☆ Deep-learning-based acceleration of MRI for radiotherapy planning of + pediatric patients with brain tumors + + +
+ Magnetic Resonance Imaging (MRI) is a non-invasive diagnostic and +radiotherapy (RT) planning tool, offering detailed insights into the anatomy of +the human body. The extensive scan time is stressful for patients, who must +remain motionless in a prolonged imaging procedure that prioritizes reduction +of imaging artifacts. This is challenging for pediatric patients who may +require measures for managing voluntary motions such as anesthesia. Several +computational approaches reduce scan time (fast MRI), by recording fewer +measurements and digitally recovering full information via post-acquisition +reconstruction. However, most fast MRI approaches were developed for diagnostic +imaging, without addressing reconstruction challenges specific to RT planning. +In this work, we developed a deep learning-based method (DeepMRIRec) for MRI +reconstruction from undersampled data acquired with RT-specific receiver coil +arrangements. We evaluated our method against fully sampled data of T1-weighted +MR images acquired from 73 children with brain tumors/surgical beds using loop +and posterior coils (12 channels), with and without applying virtual +compression of coil elements. DeepMRIRec reduced scanning time by a factor of +four producing a structural similarity score surpassing the evaluated +state-of-the-art method (0.960 vs 0.896), thereby demonstrating its potential +for accelerating MRI scanning for RT planning. + +
+
+
+
+
+ + ☆ Machine Translation to Control Formality Features in the Target Language + + +
+ Formality plays a significant role in language communication, especially in +low-resource languages such as Hindi, Japanese and Korean. These languages +utilise formal and informal expressions to convey messages based on social +contexts and relationships. When a language translation technique is used to +translate from a source language that does not encode formality (e.g. +English) to a target language that does, the missing formality information +can be a challenge in producing an accurate outcome. This +research explores how this issue should be resolved when machine learning +methods are used to translate from English to languages with formality, using +Hindi as the example data. This was done by training a bilingual model in a +formality-controlled setting and comparing its performance with a pre-trained +multilingual model in a similar setting. Since there is not a lot of training +data with ground truth, automated annotation techniques were employed to +increase the data size. The primary modeling approach involved leveraging +transformer models, which have demonstrated effectiveness in various natural +language processing tasks. We evaluate the official formality accuracy (ACC) by +comparing the predicted masked tokens with the ground truth. This metric +provides a quantitative measure of how well the translations align with the +desired outputs. Our study showcases a versatile translation strategy that +considers the nuances of formality in the target language, catering to diverse +language communication needs and scenarios. +
+
+ comment: 9 pages, based on DCU MCM Practicum 2022/2023 +
+
+
+
+
+ + ☆ Comparative Analysis of Linear Regression, Gaussian Elimination, and LU + Decomposition for CT Real Estate Purchase Decisions + + +
+ This paper presents a comprehensive evaluation of three distinct
+computational algorithms applied to the decision-making process of real estate
+purchases. Specifically, we analyze the efficacy of Linear Regression from the
+Scikit-learn library, Gaussian Elimination with partial pivoting, and LU
+Decomposition in predicting the advisability of buying a house in the State of
+Connecticut based on a set of financial and market-related parameters. The
+algorithms' performances were compared using a dataset encompassing
+town-specific details, yearly data, interest rates, and median sale ratios. Our
+results demonstrate significant differences in predictive accuracy, with Linear
+Regression and LU Decomposition providing the most reliable recommendations and
+Gaussian Elimination showing limitations in stability and performance. The
+study's findings emphasize the importance of algorithm selection in predictive
+analytics and offer insights into the practical applications of computational
+methods in real estate investment strategies. By evaluating model efficacy
+through metrics such as R-squared scores and Mean Squared Error, we provide a
+nuanced understanding of each method's strengths and weaknesses, contributing
+valuable knowledge to the fields of real estate analysis and predictive
+modeling.
+
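+ An illustrative comparison of the three solvers named above on a synthetic
+regression problem (not the paper's Connecticut dataset), scored with MSE and
+R-squared:
+
+# Editor's sketch: least squares vs. Gaussian elimination vs. LU on synthetic data.
+import numpy as np
+from scipy.linalg import lu_factor, lu_solve
+
+rng = np.random.default_rng(0)
+n, d = 200, 4                                  # e.g. interest rate, sale ratio, ...
+X = np.column_stack([np.ones(n), rng.normal(size=(n, d))])
+true_w = np.array([1.0, 0.5, -2.0, 0.0, 3.0])
+y = X @ true_w + 0.1 * rng.normal(size=n)
+
+A, b = X.T @ X, X.T @ y                        # normal equations A w = b
+
+def gaussian_elimination(A, b):
+    """Plain Gaussian elimination with partial pivoting plus back substitution."""
+    A, b = A.astype(float).copy(), b.astype(float).copy()
+    n = len(b)
+    for k in range(n):
+        p = k + np.argmax(np.abs(A[k:, k]))    # partial pivot
+        A[[k, p]], b[[k, p]] = A[[p, k]], b[[p, k]]
+        for i in range(k + 1, n):
+            f = A[i, k] / A[k, k]
+            A[i, k:] -= f * A[k, k:]
+            b[i] -= f * b[k]
+    w = np.zeros(n)
+    for k in range(n - 1, -1, -1):
+        w[k] = (b[k] - A[k, k + 1:] @ w[k + 1:]) / A[k, k]
+    return w
+
+solutions = {
+    "least squares": np.linalg.lstsq(X, y, rcond=None)[0],
+    "gaussian elim": gaussian_elimination(A, b),
+    "lu decomposition": lu_solve(lu_factor(A), b),
+}
+for name, w in solutions.items():
+    resid = y - X @ w
+    mse = float(np.mean(resid ** 2))
+    r2 = 1.0 - resid.var() / y.var()
+    print(f"{name:16s}  MSE={mse:.4f}  R^2={r2:.4f}")
+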
+
+ comment: 5 pages, 7 figures +
+
+
+
+
+ + ☆ Span-Based Optimal Sample Complexity for Average Reward MDPs + + +
+ We study the sample complexity of learning an $\varepsilon$-optimal policy in +an average-reward Markov decision process (MDP) under a generative model. We +establish the complexity bound $\widetilde{O}\left(SA\frac{H}{\varepsilon^2} +\right)$, where $H$ is the span of the bias function of the optimal policy and +$SA$ is the cardinality of the state-action space. Our result is the first that +is minimax optimal (up to log factors) in all parameters $S,A,H$ and +$\varepsilon$, improving on existing work that either assumes uniformly bounded +mixing times for all policies or has suboptimal dependence on the parameters. + Our result is based on reducing the average-reward MDP to a discounted MDP. +To establish the optimality of this reduction, we develop improved bounds for +$\gamma$-discounted MDPs, showing that +$\widetilde{O}\left(SA\frac{H}{(1-\gamma)^2\varepsilon^2} \right)$ samples +suffice to learn a $\varepsilon$-optimal policy in weakly communicating MDPs +under the regime that $\gamma \geq 1 - \frac{1}{H}$, circumventing the +well-known lower bound of +$\widetilde{\Omega}\left(SA\frac{1}{(1-\gamma)^3\varepsilon^2} \right)$ for +general $\gamma$-discounted MDPs. Our analysis develops upper bounds on certain +instance-dependent variance parameters in terms of the span parameter. These +bounds are tighter than those based on the mixing time or diameter of the MDP +and may be of broader use. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ Accelerating Inference in Molecular Diffusion Models with Latent + Representations of Protein Structure NeurIPS 2023 + + +
+ Diffusion generative models have emerged as a powerful framework for +addressing problems in structural biology and structure-based drug design. +These models operate directly on 3D molecular structures. Due to the +unfavorable scaling of graph neural networks (GNNs) with graph size as well as +the relatively slow inference speeds inherent to diffusion models, many +existing molecular diffusion models rely on coarse-grained representations of +protein structure to make training and inference feasible. However, such +coarse-grained representations discard essential information for modeling +molecular interactions and impair the quality of generated structures. In this +work, we present a novel GNN-based architecture for learning latent +representations of molecular structure. When trained end-to-end with a +diffusion model for de novo ligand design, our model achieves comparable +performance to one with an all-atom protein representation while exhibiting a +3-fold reduction in inference time. + +
+
+ comment: This paper appeared as a spotlight paper at the NeurIPS 2023 + Generative AI and Biology Workshop +
+
+
+
+
+ + ☆ Multi-Objective Bayesian Optimization with Active Preference Learning + + +
+ Many real-world black-box optimization problems require optimizing multiple
+criteria simultaneously. However, in a multi-objective optimization (MOO)
+problem, identifying the whole Pareto front requires a prohibitive search cost,
+while in many practical scenarios the decision maker (DM) only needs a specific
+solution among the set of Pareto optimal solutions. We propose a Bayesian
+optimization (BO) approach to identifying the most preferred solution in MOO
+with expensive objective functions, in which a Bayesian preference model of the
+DM is adaptively estimated in an interactive manner based on two types of
+supervision, pairwise preferences and improvement requests. To explore the most
+preferred solution, we define an acquisition function that incorporates the
+uncertainty in both the objective functions and the DM preference. Further, to
+minimize the interaction cost with the DM, we also propose an active learning
+strategy for the preference estimation. We empirically demonstrate the
+effectiveness of our proposed method on benchmark function optimization and
+hyper-parameter optimization problems for machine learning models.
+
+
+
+
+
+ + ☆ The Tempered Hilbert Simplex Distance and Its Application To Non-linear + Embeddings of TEMs + + +
+ Tempered Exponential Measures (TEMs) are a parametric generalization of the +exponential family of distributions maximizing the tempered entropy function +among positive measures subject to a probability normalization of their power +densities. Calculus on TEMs relies on a deformed algebra of arithmetic +operators induced by the deformed logarithms used to define the tempered +entropy. In this work, we introduce three different parameterizations of finite +discrete TEMs via Legendre functions of the negative tempered entropy function. +In particular, we establish an isometry between such parameterizations in terms +of a generalization of the Hilbert log cross-ratio simplex distance to a +tempered Hilbert co-simplex distance. Similar to the Hilbert geometry, the +tempered Hilbert distance is characterized as a $t$-symmetrization of the +oriented tempered Funk distance. We motivate our construction by introducing +the notion of $t$-lengths of smooth curves in a tautological Finsler manifold. +We then demonstrate the properties of our generalized structure in different +settings and numerically examine the quality of its differentiable +approximations for optimization in machine learning settings. + +
+
+
+
+
+ + ☆ Explaining high-dimensional text classifiers NeurIPS 2023 + + +
+ Explainability has become a valuable tool in the last few years, helping +humans better understand AI-guided decisions. However, the classic +explainability tools are sometimes quite limited when considering +high-dimensional inputs and neural network classifiers. We present a new +explainability method using theoretically proven high-dimensional properties in +neural network classifiers. We present two usages of it: 1) On the classical +sentiment analysis task for the IMDB reviews dataset, and 2) our +Malware-Detection task for our PowerShell scripts dataset. + +
+
+ comment: Accepted to "XAI in Action" workshop @ NeurIPS 2023 +
+
+
+
+
+ + ☆ Differentially Private Non-Convex Optimization under the KL Condition + with Optimal Rates + + +
+ We study private empirical risk minimization (ERM) problem for losses +satisfying the $(\gamma,\kappa)$-Kurdyka-{\L}ojasiewicz (KL) condition. The +Polyak-{\L}ojasiewicz (PL) condition is a special case of this condition when +$\kappa=2$. Specifically, we study this problem under the constraint of $\rho$ +zero-concentrated differential privacy (zCDP). When $\kappa\in[1,2]$ and the +loss function is Lipschitz and smooth over a sufficiently large region, we +provide a new algorithm based on variance reduced gradient descent that +achieves the rate +$\tilde{O}\big(\big(\frac{\sqrt{d}}{n\sqrt{\rho}}\big)^\kappa\big)$ on the +excess empirical risk, where $n$ is the dataset size and $d$ is the dimension. +We further show that this rate is nearly optimal. When $\kappa \geq 2$ and the +loss is instead Lipschitz and weakly convex, we show it is possible to achieve +the rate $\tilde{O}\big(\big(\frac{\sqrt{d}}{n\sqrt{\rho}}\big)^\kappa\big)$ +with a private implementation of the proximal point method. When the KL +parameters are unknown, we provide a novel modification and analysis of the +noisy gradient descent algorithm and show that this algorithm achieves a rate +of +$\tilde{O}\big(\big(\frac{\sqrt{d}}{n\sqrt{\rho}}\big)^{\frac{2\kappa}{4-\kappa}}\big)$ +adaptively, which is nearly optimal when $\kappa = 2$. We further show that, +without assuming the KL condition, the same gradient descent algorithm can +achieve fast convergence to a stationary point when the gradient stays +sufficiently large during the run of the algorithm. Specifically, we show that +this algorithm can approximate stationary points of Lipschitz, smooth (and +possibly nonconvex) objectives with rate as fast as +$\tilde{O}\big(\frac{\sqrt{d}}{n\sqrt{\rho}}\big)$ and never worse than +$\tilde{O}\big(\big(\frac{\sqrt{d}}{n\sqrt{\rho}}\big)^{1/2}\big)$. The latter +rate matches the best known rate for methods that do not rely on variance +reduction. + +
+
+
+
+
+ + ☆ Transfer Attacks and Defenses for Large Language Models on Coding Tasks + + +
+ Modern large language models (LLMs), such as ChatGPT, have demonstrated +impressive capabilities for coding tasks including writing and reasoning about +code. They improve upon previous neural network models of code, such as +code2seq or seq2seq, that already demonstrated competitive results when +performing tasks such as code summarization and identifying code +vulnerabilities. However, these previous code models were shown vulnerable to +adversarial examples, i.e. small syntactic perturbations that do not change the +program's semantics, such as the inclusion of "dead code" through false +conditions or the addition of inconsequential print statements, designed to +"fool" the models. LLMs can also be vulnerable to the same adversarial +perturbations but a detailed study on this concern has been lacking so far. In +this paper we aim to investigate the effect of adversarial perturbations on +coding tasks with LLMs. In particular, we study the transferability of +adversarial examples, generated through white-box attacks on smaller code +models, to LLMs. Furthermore, to make the LLMs more robust against such +adversaries without incurring the cost of retraining, we propose prompt-based +defenses that involve modifying the prompt to include additional information +such as examples of adversarially perturbed code and explicit instructions for +reversing adversarial perturbations. Our experiments show that adversarial +examples obtained with a smaller code model are indeed transferable, weakening +the LLMs' performance. The proposed defenses show promise in improving the +model's resilience, paving the way to more robust defensive solutions for LLMs +in code-related applications. + +
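+ A small sketch of the kind of semantics-preserving perturbation described
+above, inserting unreachable "dead code" behind a false condition; the example
+function and marker are hypothetical, and the paper generates its perturbations
+with white-box attacks on smaller code models:
+
+# Editor's sketch: a dead-code perturbation that never changes program behaviour.
+def insert_dead_code(source: str, marker: str = "if False:") -> str:
+    """Append an unreachable branch to the end of a function body."""
+    dead_branch = f"\n    {marker}\n        unused_variable = 0\n"
+    return source.rstrip() + dead_branch
+
+original = (
+    "def add(a, b):\n"
+    "    return a + b\n"
+)
+perturbed = insert_dead_code(original)
+print(perturbed)
+# A robust model (or a prompt-based defense instructing the LLM to ignore
+# unreachable code) should summarise both versions identically.
+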
+
+
+
+
+ + ☆ Guided Flows for Generative Modeling and Decision Making + + +
+ Classifier-free guidance is a key component for improving the performance of
+conditional generative models for many downstream tasks. It drastically
+improves the quality of samples produced, but has so far only been used for
+diffusion models. Flow Matching (FM), an alternative simulation-free approach,
+trains Continuous Normalizing Flows (CNFs) based on regressing vector fields.
+It remains an open question whether classifier-free guidance can be performed
+for Flow Matching models, and to what extent it improves performance. In this
+paper, we explore the usage of Guided Flows for a variety of downstream
+applications involving conditional image generation, speech synthesis, and
+reinforcement learning. In particular, we are the first to apply flow models to
+the offline reinforcement learning setting. We also show that Guided Flows
+significantly improve sample quality in image generation and zero-shot
+text-to-speech synthesis, and can use drastically less computation without
+affecting the agent's overall performance.
+
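+ A minimal sketch of classifier-free guidance applied to a flow-matching
+vector field, using a toy velocity model and Euler integration; the guidance
+rule shown is the standard conditional/unconditional mixing, not necessarily
+the paper's exact formulation:
+
+# Editor's sketch: guided velocity field and Euler integration of dx/dt = v.
+import numpy as np
+
+def guided_velocity(v_cond, v_uncond, guidance_weight):
+    """v_guided = v_uncond + w * (v_cond - v_uncond); w = 1 recovers v_cond."""
+    return v_uncond + guidance_weight * (v_cond - v_uncond)
+
+def integrate_flow(x0, velocity_fn, cond, steps=50, guidance_weight=2.0):
+    x, dt = x0.copy(), 1.0 / steps
+    for i in range(steps):
+        t = i * dt
+        v_c = velocity_fn(x, t, cond)        # conditional prediction
+        v_u = velocity_fn(x, t, None)        # unconditional prediction
+        x = x + dt * guided_velocity(v_c, v_u, guidance_weight)
+    return x
+
+# Toy "model": drift towards the conditioning vector (or the origin if None).
+def toy_velocity(x, t, cond):
+    target = cond if cond is not None else np.zeros_like(x)
+    return target - x
+
+x1 = integrate_flow(np.zeros(2), toy_velocity, cond=np.array([1.0, -1.0]))
+print(x1)   # pulled beyond the unguided conditional endpoint because w > 1
+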
+
+
+
+
+ + ☆ Recurrent neural networks and transfer learning for elasto-plasticity in + woven composites + + +
+ As a surrogate for computationally intensive meso-scale simulation of woven +composites, this article presents Recurrent Neural Network (RNN) models. +Leveraging the power of transfer learning, the initialization challenges and +sparse data issues inherent in cyclic shear strain loads are addressed in the +RNN models. A mean-field model generates a comprehensive data set representing +elasto-plastic behavior. In simulations, arbitrary six-dimensional strain +histories are used to predict stresses under random walking as the source task +and cyclic loading conditions as the target task. Incorporating sub-scale +properties enhances RNN versatility. In order to achieve accurate predictions, +the model uses a grid search method to tune network architecture and +hyper-parameter configurations. The results of this study demonstrate that +transfer learning can be used to effectively adapt the RNN to varying strain +conditions, which establishes its potential as a useful tool for modeling +path-dependent responses in woven composites. + +
+
+ comment: There are 25 pages and 13 EPS images. The paper includes links to + supporting materials +
+
+
+
+
+ + ☆ Extracting individual variable information for their decoupling, direct + mutual information and multi-feature Granger causality + + +
+ When working with multiple variables, they usually contain complex
+dependencies that are difficult to control. This article proposes extracting
+their individual information, e.g. $\overline{X|Y}$ as a random variable
+containing the information from $X$ but with the information about $Y$ removed,
+by using the $(x,y) \leftrightarrow (\bar{x}=\textrm{CDF}_{X|Y=y}(x),y)$
+reversible normalization. One application is decoupling the individual
+information of variables: reversibly transform $(X_1,\ldots,X_n)\leftrightarrow(\tilde{X}_1,\ldots
+\tilde{X}_n)$ together containing the same information, but being independent:
+$\forall_{i\neq j} \tilde{X}_i\perp \tilde{X}_j, \tilde{X}_i\perp X_j$. This
+requires detailed models of complex conditional probability distributions,
+which is generally a difficult task, but here it can be done through multiple
+dependency-reducing iterations using imperfect methods (here HCR: Hierarchical
+Correlation Reconstruction). It can also be used for direct mutual information,
+evaluating direct information transfer without the use of intermediate
+variables. For the direction of causality, multi-feature Granger causality is
+discussed, e.g. to trace various types of individual information transfers
+between such decoupled variables, including propagation time (delay).
+
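+ A sketch of the reversible normalization $\bar{x}=\textrm{CDF}_{X|Y=y}(x)$
+using a simple linear-Gaussian model of $X|Y$ in place of HCR; purely
+illustrative:
+
+# Editor's sketch: remove the information about Y from X via the conditional CDF.
+import numpy as np
+from scipy.stats import norm
+
+rng = np.random.default_rng(0)
+y = rng.normal(size=5000)
+x = 0.8 * y + 0.6 * rng.normal(size=5000)      # X depends on Y
+
+# Fit a linear-Gaussian conditional model X | Y = y ~ N(a*y + b, sigma^2).
+a, b = np.polyfit(y, x, 1)
+sigma = np.std(x - (a * y + b))
+
+x_bar = norm.cdf(x, loc=a * y + b, scale=sigma)        # x_bar = CDF_{X|Y=y}(x)
+x_back = norm.ppf(x_bar, loc=a * y + b, scale=sigma)   # reversible: recover x
+
+print("max reconstruction error:", float(np.max(np.abs(x_back - x))))
+print("corr(x, y)     =", float(np.corrcoef(x, y)[0, 1]))
+print("corr(x_bar, y) =", float(np.corrcoef(x_bar, y)[0, 1]))  # ~0: Y removed
+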
+
+ comment: 3 pages, 1 figure +
+
+
+
+
+ + ☆ From Images to Connections: Can DQN with GNNs learn the Strategic Game + of Hex? + + +
+ The gameplay of strategic board games such as chess, Go and Hex is often
+characterized by combinatorial, relational structures -- capturing distinct
+interactions and non-local patterns -- and not just images. Nonetheless, most
+common self-play reinforcement learning (RL) approaches simply approximate
+policy and value functions using convolutional neural networks (CNN). A key
+feature of CNNs is their relational inductive bias towards locality and
+translational invariance. In contrast, graph neural networks (GNN) can encode
+more complicated and distinct relational structures. Hence, we investigate the
+crucial question: Can GNNs, with their ability to encode complex connections,
+replace CNNs in self-play reinforcement learning? To this end, we compare the
+two approaches on Hex -- an abstract yet strategically rich board game -- which
+serves as our experimental platform. Our findings reveal that GNNs excel at
+dealing with long-range dependencies in game states and are less prone to
+overfitting, but show reduced proficiency in discerning local patterns. This
+suggests a potential paradigm shift, signaling the use of game-specific
+structures to reshape self-play reinforcement learning.
+
+
+
+
+
+ + ☆ Bayesian inference of a new Mallows model for characterising symptom + sequences applied in primary progressive aphasia ML4H + + +
+ Machine learning models offer the potential to understand diverse datasets in +a data-driven way, powering insights into individual disease experiences and +ensuring equitable healthcare. In this study, we explore Bayesian inference for +characterising symptom sequences, and the associated modelling challenges. We +adapted the Mallows model to account for partial rankings and right-censored +data, employing custom MCMC fitting. Our evaluation, encompassing synthetic +data and a primary progressive aphasia dataset, highlights the model's efficacy +in revealing mean orderings and estimating ranking variance. This holds the +potential to enhance clinical comprehension of symptom occurrence. However, our +work encounters limitations concerning model scalability and small dataset +sizes. + +
+
+ comment: Extended Abstract presented at Machine Learning for Health (ML4H) + symposium 2023, December 10th, 2023, New Orleans, United States, 8 pages +
+
+
+
+
+ + ☆ Confidant: Customizing Transformer-based LLMs via Collaborative Edge + Training + + +
+ Transformer-based large language models (LLMs) have demonstrated impressive +capabilities in a variety of natural language processing (NLP) tasks. +Nonetheless, it is challenging to deploy and fine-tune LLMs on mobile edge +devices with limited computing, memory, and energy budgets. In this paper, we +propose Confidant, a multi-backend collaborative training framework for +customizing state-of-the-art LLMs on commodity mobile devices like smartphones. +Confidant partitions an LLM into several sub-models so that each fits into a +mobile device's memory. A pipeline parallel training mechanism is further +developed to ensure fast and efficient distributed training. In addition, we +propose a novel backend scheduler to allocate different attention heads to +heterogeneous compute hardware, including mobile CPU and GPUs, to maximize the +compute resource utilization on each edge device. Our preliminary experimental +results show that Confidant achieves at most 45.3% memory reduction and 8.03x +inference speedup in practical settings. + +
+
+ comment: 6 pages, 7 figures; Submitted to HotMobile 2024 +
+
+
+
+
+ + ☆ An Empirical Study of Uncertainty Estimation Techniques for Detecting + Drift in Data Streams NeurIPS 2023 + + +
+ In safety-critical domains such as autonomous driving and medical diagnosis, +the reliability of machine learning models is crucial. One significant +challenge to reliability is concept drift, which can cause model deterioration +over time. Traditionally, drift detectors rely on true labels, which are often +scarce and costly. This study conducts a comprehensive empirical evaluation of +using uncertainty values as substitutes for error rates in detecting drifts, +aiming to alleviate the reliance on labeled post-deployment data. We examine +five uncertainty estimation methods in conjunction with the ADWIN detector +across seven real-world datasets. Our results reveal that while the SWAG method +exhibits superior calibration, the overall accuracy in detecting drifts is not +notably impacted by the choice of uncertainty estimation method, with even the +most basic method demonstrating competitive performance. These findings offer +valuable insights into the practical applicability of uncertainty-based drift +detection in real-world, safety-critical applications. + +
+
+ comment: NeurIPS 2023: Workshop on Distribution Shifts +
+
+
+
+
+ + ☆ Unified Classification and Rejection: A One-versus-All Framework + + +
+ Open world pattern recognition involves classifying patterns of known classes
+and rejecting ambiguous and novel (also called out-of-distribution (OOD))
+inputs. Deep neural network models usually excel in closed-set classification
+while performing poorly in rejecting OOD inputs. To tackle this problem,
+numerous methods have been designed to perform open set recognition (OSR) or
+OOD rejection/detection tasks. Previous methods mostly rely on post-training
+score transformation or hybrid models to ensure low scores on OOD inputs while
+separating known classes. In this paper, we attempt to build a unified
+framework for open set classifiers that handles both classification and OOD
+rejection. We formulate open set recognition with $ K $ known classes as a
+$ (K + 1) $-class classification problem with a model trained on known-class
+samples only. By decomposing the $ K $-class problem into $ K $ one-versus-all
+(OVA) binary classification tasks and binding some parameters, we show that
+combining the scores of the OVA classifiers can give $ (K + 1) $-class posterior
+probabilities, which enables classification and OOD rejection in a unified
+framework. To maintain the closed-set classification accuracy of the
+OVA-trained classifier, we propose a hybrid training strategy combining the OVA
+loss and the multi-class cross-entropy loss. We implement the OVA framework and
+hybrid training strategy on the recently proposed convolutional prototype
+network. Experiments on popular OSR and OOD detection datasets demonstrate that
+the proposed framework, using a single multi-class classifier, yields
+competitive performance in closed-set classification, OOD detection, and
+misclassification detection.
+
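+ One plausible way to combine $ K $ one-versus-all sigmoid scores into
+$ (K + 1) $-class posteriors with a reject class, assuming the OVA outputs act
+as independent acceptance probabilities; the paper derives its own combination
+with parameter binding, so this is only an illustration of the idea:
+
+# Editor's sketch: K OVA sigmoid scores -> (K+1)-class probabilities with reject.
+import numpy as np
+
+def ova_to_k_plus_1(ova_probs):
+    """ova_probs: shape (K,), sigmoid outputs of the K OVA classifiers."""
+    p = np.asarray(ova_probs, dtype=float)
+    q = np.empty(len(p) + 1)
+    for k in range(len(p)):
+        q[k] = p[k] * np.prod(np.delete(1.0 - p, k))    # only class k accepts
+    q[-1] = np.prod(1.0 - p)                            # no class accepts: OOD
+    return q / q.sum()
+
+print(ova_to_k_plus_1([0.95, 0.05, 0.10]))   # confident in-distribution sample
+print(ova_to_k_plus_1([0.10, 0.05, 0.10]))   # likely OOD: mass on last entry
+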
+
+
+
+
+ + ☆ Fact-based Court Judgment Prediction + + +
+ This extended abstract extends the research presented in "ILDC for CJPE: +Indian Legal Documents Corpus for Court Judgment Prediction and Explanation" +\cite{malik-etal-2021-ildc}, focusing on fact-based judgment prediction within +the context of Indian legal documents. We introduce two distinct problem +variations: one based solely on facts, and another combining facts with rulings +from lower courts (RLC). Our research aims to enhance early-phase case outcome +prediction, offering significant benefits to legal professionals and the +general public. The results, however, indicated a performance decline compared +to the original ILDC for CJPE study, even after implementing various weightage +schemes in our DELSumm algorithm. Additionally, using only facts for legal +judgment prediction with different transformer models yielded results inferior +to the state-of-the-art outcomes reported in the "ILDC for CJPE" study. + +
+
+
+
+
+ + ☆ REDS: Resource-Efficient Deep Subnetworks for Dynamic Resource + Constraints + + +
+ Deep models deployed on edge devices frequently encounter resource +variability, which arises from fluctuating energy levels, timing constraints, +or prioritization of other critical tasks within the system. State-of-the-art +machine learning pipelines generate resource-agnostic models, not capable to +adapt at runtime. In this work we introduce Resource-Efficient Deep Subnetworks +(REDS) to tackle model adaptation to variable resources. In contrast to the +state-of-the-art, REDS use structured sparsity constructively by exploiting +permutation invariance of neurons, which allows for hardware-specific +optimizations. Specifically, REDS achieve computational efficiency by (1) +skipping sequential computational blocks identified by a novel iterative +knapsack optimizer, and (2) leveraging simple math to re-arrange the order of +operations in REDS computational graph to take advantage of the data cache. +REDS support conventional deep networks frequently deployed on the edge and +provide computational benefits even for small and simple networks. We evaluate +REDS on six benchmark architectures trained on the Google Speech Commands, +FMNIST and CIFAR10 datasets, and test on four off-the-shelf mobile and embedded +hardware platforms. We provide a theoretical result and empirical evidence for +REDS outstanding performance in terms of submodels' test set accuracy, and +demonstrate an adaptation time in response to dynamic resource constraints of +under 40$\mu$s, utilizing a 2-layer fully-connected network on Arduino Nano 33 +BLE Sense. + +
+
+
+
+
+ + ☆ MergeSFL: Split Federated Learning with Feature Merging and Batch Size + Regulation + + +
+ Recently, federated learning (FL) has emerged as a popular technique for edge +AI to mine valuable knowledge in edge computing (EC) systems. To mitigate the +computing/communication burden on resource-constrained workers and protect +model privacy, split federated learning (SFL) has been released by integrating +both data and model parallelism. Despite resource limitations, SFL still faces +two other critical challenges in EC, i.e., statistical heterogeneity and system +heterogeneity. To address these challenges, we propose a novel SFL framework, +termed MergeSFL, by incorporating feature merging and batch size regulation in +SFL. Concretely, feature merging aims to merge the features from workers into a +mixed feature sequence, which is approximately equivalent to the features +derived from IID data and is employed to promote model accuracy. While batch +size regulation aims to assign diverse and suitable batch sizes for +heterogeneous workers to improve training efficiency. Moreover, MergeSFL +explores to jointly optimize these two strategies upon their coupled +relationship to better enhance the performance of SFL. Extensive experiments +are conducted on a physical platform with 80 NVIDIA Jetson edge devices, and +the experimental results show that MergeSFL can improve the final model +accuracy by 5.82% to 26.22%, with a speedup by about 1.74x to 4.14x, compared +to the baselines. + +
+
+
+
+
+ + ☆ Learning principle and mathematical realization of the learning + mechanism in the brain + + +
+ While deep learning has achieved remarkable success, there is no clear +explanation about why it works so well. In order to discuss this question +quantitatively, we need a mathematical framework that explains what learning is +in the first place. After several considerations, we succeeded in constructing +a mathematical framework that can provide a unified understanding of all types +of learning, including deep learning and learning in the brain. We call it +learning principle, and it follows that all learning is equivalent to +estimating the probability of input data. We not only derived this principle, +but also mentioned its application to actual machine learning models. For +example, we found that conventional supervised learning is equivalent to +estimating conditional probabilities, and succeeded in making supervised +learning more effective and generalized. We also proposed a new method of +defining the values of estimated probability using differentiation, and showed +that unsupervised learning can be performed on arbitrary dataset without any +prior knowledge. Namely, this method is a general-purpose machine learning in +the true sense. Moreover, we succeeded in describing the learning mechanism in +the brain by considering the time evolution of a fully or partially connected +model and applying this new method. The learning principle provides solutions +to many unsolved problems in deep learning and cognitive neuroscience. + +
+
+ comment: 31 pages, 14 figures +
+
+
+
+
+ + ☆ Curriculum Learning and Imitation Learning for Model-free Control on + Financial Time-series + + +
+ Curriculum learning and imitation learning have been leveraged extensively in +the robotics domain. However, minimal research has been done on leveraging +these ideas on control tasks over highly stochastic time-series data. Here, we +theoretically and empirically explore these approaches in a representative +control task over complex time-series data. We implement the fundamental ideas +of curriculum learning via data augmentation, while imitation learning is +implemented via policy distillation from an oracle. Our findings reveal that +curriculum learning should be considered a novel direction in improving +control-task performance over complex time-series. Our ample random-seed +out-sample empirics and ablation studies are highly encouraging for curriculum +learning for time-series control. These findings are especially encouraging as +we tune all overlapping hyperparameters on the baseline -- giving an advantage +to the baseline. On the other hand, we find that imitation learning should be +used with caution. + +
+
+ comment: preprint +
+
+
+
+
+ + ☆ Revisiting Supervision for Continual Representation Learning + + +
+ In the field of continual learning, models are designed to learn tasks one +after the other. While most research has centered on supervised continual +learning, recent studies have highlighted the strengths of self-supervised +continual representation learning. The improved transferability of +representations built with self-supervised methods is often associated with the +role played by the multi-layer perceptron projector. In this work, we depart +from this observation and reexamine the role of supervision in continual +representation learning. We reckon that additional information, such as human +annotations, should not deteriorate the quality of representations. Our +findings show that supervised models when enhanced with a multi-layer +perceptron head, can outperform self-supervised models in continual +representation learning. + +
+
+
+
+
+ + ☆ Deep Learning for Vascular Segmentation and Applications in Phase + Contrast Tomography Imaging + + +
+ Automated blood vessel segmentation is vital for biomedical imaging, as
+vessel changes indicate many pathologies. Still, precise segmentation is
+difficult due to the complexity of vascular structures, anatomical variations
+across patients, the scarcity of annotated public datasets, and the quality of
+images. We present a thorough literature review, highlighting the state of
+machine learning techniques across diverse organs. Our goal is to provide a
+foundation on the topic and identify a robust baseline model for application to
+vascular segmentation in a new imaging modality, Hierarchical Phase Contrast
+Tomography (HiP CT). Introduced in 2020 at the European Synchrotron Radiation
+Facility, HiP CT enables 3D imaging of complete organs at an unprecedented
+resolution of ca. 20 µm per voxel, with the capability for localized zooms in
+selected regions down to 1 µm per voxel without sectioning. We have created a
+training dataset with double-annotator-validated vascular data from three
+kidneys imaged with HiP CT in the context of the Human Organ Atlas Project.
+Finally, utilising the nnU-Net model, we conduct experiments to assess the
+model's performance on both familiar and unseen samples, employing
+vessel-specific metrics. Our results show that while segmentations yielded
+reasonably high scores such as clDice values ranging from 0.82 to 0.88, certain
+errors persisted. Large vessels that collapsed due to the lack of hydrostatic
+pressure (HiP CT is an ex vivo technique) were segmented poorly. Moreover,
+decreased connectivity in finer vessels and higher segmentation errors at
+vessel boundaries were observed. Such errors obstruct the understanding of the
+structures by interrupting vascular tree connectivity. Through our review and
+outputs, we aim to set a benchmark for subsequent model evaluations using
+various modalities, especially with the HiP CT imaging database.
+
+
+
+
+
+ + ☆ Probabilistic Inference in Reinforcement Learning Done Right NeurIPS 2023 + + +
+ A popular perspective in Reinforcement learning (RL) casts the problem as +probabilistic inference on a graphical model of the Markov decision process +(MDP). The core object of study is the probability of each state-action pair +being visited under the optimal policy. Previous approaches to approximate this +quantity can be arbitrarily poor, leading to algorithms that do not implement +genuine statistical inference and consequently do not perform well in +challenging problems. In this work, we undertake a rigorous Bayesian treatment +of the posterior probability of state-action optimality and clarify how it +flows through the MDP. We first reveal that this quantity can indeed be used to +generate a policy that explores efficiently, as measured by regret. +Unfortunately, computing it is intractable, so we derive a new variational +Bayesian approximation yielding a tractable convex optimization problem and +establish that the resulting policy also explores efficiently. We call our +approach VAPOR and show that it has strong connections to Thompson sampling, +K-learning, and maximum entropy exploration. We conclude with some experiments +demonstrating the performance advantage of a deep RL version of VAPOR. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ The Influence of Neural Networks on Hydropower Plant Management in + Agriculture: Addressing Challenges and Exploring Untapped Opportunities + + +
+ Hydropower plants are crucial for stable renewable energy and serve as vital +water sources for sustainable agriculture. However, it is essential to assess +the current water management practices associated with hydropower plant +management software. A key concern is the potential conflict between +electricity generation and agricultural water needs. Prioritising water for +electricity generation can reduce irrigation availability in agriculture during +crucial periods like droughts, impacting crop yields and regional food +security. Coordination between electricity and agricultural water allocation is +necessary to ensure optimal and environmentally sound practices. Neural +networks have become valuable tools for hydropower plant management, but their +black-box nature raises concerns about transparency in decision making. +Additionally, current approaches often do not take advantage of their potential +to create a system that effectively balances water allocation. + This work is a call for attention and highlights the potential risks of +deploying neural network-based hydropower plant management software without +proper scrutiny and control. To address these concerns, we propose the adoption +of the Agriculture Conscious Hydropower Plant Management framework, aiming to +maximise electricity production while prioritising stable irrigation for +agriculture. We also advocate reevaluating government-imposed minimum water +guidelines for irrigation to ensure flexibility and effective water allocation. +Additionally, we suggest a set of regulatory measures to promote model +transparency and robustness, certifying software that makes conscious and +intelligent water allocation decisions, ultimately safeguarding agriculture +from undue strain during droughts. + +
+
+
+
+
+ + ☆ Improving performance of heart rate time series classification by + grouping subjects + + +
+ Unlike the more commonly analyzed ECG or PPG data for activity +classification, heart rate time series data is less detailed, often noisier and +can contain missing data points. Using the BigIdeasLab_STEP dataset, which +includes heart rate time series annotated with specific tasks performed by +individuals, we sought to determine if general classification was achievable. +Our analyses showed that the accuracy is sensitive to the choice of +window/stride size. Moreover, we found variable classification performances +between subjects due to differences in the physical structure of their hearts. +Various techniques were used to minimize this variability. First of all, +normalization proved to be a crucial step and significantly improved the +performance. Secondly, grouping subjects and performing classification inside a +group helped to improve performance and decrease inter-subject variability. +Finally, we show that including handcrafted features as input to a deep +learning (DL) network improves the classification performance further. +Together, these findings indicate that heart rate time series can be utilized +for classification tasks like predicting activity. However, normalization or +grouping techniques need to be chosen carefully to minimize the issue of +subject variability. + +
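+ A sketch of the windowing and per-subject normalization steps the abstract
+reports as critical; the window and stride values are hypothetical:
+
+# Editor's sketch: per-subject z-scoring and overlapping window segmentation.
+import numpy as np
+
+def normalise_per_subject(series):
+    """Z-score a subject's series to reduce inter-subject variability."""
+    return (series - series.mean()) / (series.std() + 1e-8)
+
+def make_windows(series, window=60, stride=30):
+    """Slice a 1-D heart-rate series into overlapping windows."""
+    return np.stack([series[s:s + window]
+                     for s in range(0, len(series) - window + 1, stride)])
+
+rng = np.random.default_rng(0)
+heart_rate = 70 + 5 * rng.normal(size=600)      # toy 1 Hz heart-rate trace
+windows = make_windows(normalise_per_subject(heart_rate))
+print(windows.shape)                            # (num_windows, 60)
+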
+
+
+
+
+ + ☆ Comprehensive Evaluation of GNN Training Systems: A Data Management + Perspective + + +
+ Many Graph Neural Network (GNN) training systems have emerged recently to +support efficient GNN training. Since GNNs embody complex data dependencies +between training samples, the training of GNNs should address distinct +challenges different from DNN training in data management, such as data +partitioning, batch preparation for mini-batch training, and data transferring +between CPUs and GPUs. These factors, which take up a large proportion of +training time, make data management in GNN training more significant. This +paper reviews GNN training from a data management perspective and provides a +comprehensive analysis and evaluation of the representative approaches. We +conduct extensive experiments on various benchmark datasets and show many +interesting and valuable results. We also provide some practical tips learned +from these experiments, which are helpful for designing GNN training systems in +the future. + +
+
+ comment: 12 pages, 17 figures +
+
+
+
+
+ + ☆ FedFN: Feature Normalization for Alleviating Data Heterogeneity Problem + in Federated Learning NeurIPS + + +
+ Federated Learning (FL) is a collaborative method for training models while +preserving data privacy in decentralized settings. However, FL encounters +challenges related to data heterogeneity, which can result in performance +degradation. In our study, we observe that as data heterogeneity increases, +feature representation in the FedAVG model deteriorates more significantly +compared to classifier weight. Additionally, we observe that as data +heterogeneity increases, the gap between higher feature norms for observed +classes, obtained from local models, and feature norms of unobserved classes +widens, in contrast to the behavior of classifier weight norms. This widening +gap extends to encompass the feature norm disparities between local and the +global models. To address these issues, we introduce Federated Averaging with +Feature Normalization Update (FedFN), a straightforward learning method. We +demonstrate the superior performance of FedFN through extensive experiments, +even when applied to pretrained ResNet18. Subsequently, we confirm the +applicability of FedFN to foundation models. + +
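+ A sketch of the feature-normalization idea: L2-normalize the representation
+before the classifier so feature-norm gaps between observed and unobserved
+classes cannot grow. This illustrates the principle rather than the exact
+FedFN update:
+
+# Editor's sketch: normalise features before a linear classifier head.
+import numpy as np
+
+def l2_normalize(features, eps=1e-12):
+    norms = np.linalg.norm(features, axis=1, keepdims=True)
+    return features / np.maximum(norms, eps)
+
+def logits(features, classifier_weights):
+    """Plain linear classifier on top of normalised features."""
+    return l2_normalize(features) @ classifier_weights.T
+
+rng = np.random.default_rng(0)
+feats = rng.normal(size=(4, 32)) * np.array([[1.0], [1.0], [10.0], [10.0]])
+W = rng.normal(size=(5, 32))                 # 5-class linear head
+
+print(np.linalg.norm(feats, axis=1))                 # widely different norms
+print(np.linalg.norm(l2_normalize(feats), axis=1))   # all 1 after normalisation
+print(logits(feats, W).shape)                        # (4, 5)
+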
+
+ comment: NeurIPS Workshop: "Federated Learning in the Age of Foundation + Models" 2023 +
+
+
+
+
+ + ☆ Improved identification accuracy in equation learning via comprehensive + $\boldsymbol{R^2}$-elimination and Bayesian model selection + + +
+ In the field of equation learning, exhaustively considering all possible +equations derived from a basis function dictionary is infeasible. Sparse +regression and greedy algorithms have emerged as popular approaches to tackle +this challenge. However, the presence of multicollinearity poses difficulties +for sparse regression techniques, and greedy steps may inadvertently exclude +terms of the true equation, leading to reduced identification accuracy. In this +article, we present an approach that strikes a balance between +comprehensiveness and efficiency in equation learning. Inspired by stepwise +regression, our approach combines the coefficient of determination, $R^2$, and +the Bayesian model evidence, $p(\boldsymbol y|\mathcal M)$, in a novel way. Our +procedure is characterized by a comprehensive search with just a minor +reduction of the model space at each iteration step. With two flavors of our +approach and the adoption of $p(\boldsymbol y|\mathcal M)$ for bi-directional +stepwise regression, we present a total of three new avenues for equation +learning. Through three extensive numerical experiments involving random +polynomials and dynamical systems, we compare our approach against four +state-of-the-art methods and two standard approaches. The results demonstrate +that our comprehensive search approach surpasses all other methods in terms of +identification accuracy. In particular, the second flavor of our approach +establishes an efficient overfitting penalty solely based on $R^2$, which +achieves highest rates of exact equation recovery. + +
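+ A sketch of R^2-guided backward elimination over a small term dictionary; the
+paper additionally uses the Bayesian model evidence and a more comprehensive
+search, so this shows only the R^2 ingredient, with a made-up stopping
+threshold:
+
+# Editor's sketch: drop the term whose removal hurts R^2 the least, until it hurts.
+import numpy as np
+
+def r2(X, y):
+    w, *_ = np.linalg.lstsq(X, y, rcond=None)
+    resid = y - X @ w
+    return 1.0 - resid.var() / y.var()
+
+rng = np.random.default_rng(0)
+x = rng.uniform(-1, 1, size=300)
+terms = {"1": np.ones_like(x), "x": x, "x^2": x**2, "x^3": x**3, "x^4": x**4}
+y = 2.0 * x - 1.5 * x**3 + 0.05 * rng.normal(size=x.size)   # true equation
+
+active = list(terms)
+while len(active) > 1:
+    X = np.column_stack([terms[t] for t in active])
+    full = r2(X, y)
+    # Score each candidate removal by the R^2 of the reduced model.
+    scores = {t: r2(np.column_stack([terms[s] for s in active if s != t]), y)
+              for t in active}
+    best = max(scores, key=scores.get)
+    if full - scores[best] > 1e-3:      # removing anything now hurts: stop
+        break
+    active.remove(best)
+
+print("selected terms:", active)        # expected: ['x', 'x^3']
+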
+
+ comment: 12 pages main text and 11 pages appendix, accepted in Transactions on + Machine Learning Research (TMLR) +
+
+
+
+
+ + ☆ Immunohistochemistry guided segmentation of benign epithelial cells, in + situ lesions, and invasive epithelial cells in breast cancer slides + + +
+ Digital pathology enables automatic analysis of histopathological sections +using artificial intelligence (AI). Automatic evaluation could improve +diagnostic efficiency and help find associations between morphological features +and clinical outcome. For development of such prediction models, identifying +invasive epithelial cells, and separating these from benign epithelial cells +and in situ lesions would be the first step. In this study, we aimed to develop +an AI model for segmentation of epithelial cells in sections from breast +cancer. We generated epithelial ground truth masks by restaining hematoxylin +and eosin (HE) sections with cytokeratin (CK) AE1/AE3, and by pathologists' +annotations. HE/CK image pairs were used to train a convolutional neural +network, and data augmentation was used to make the model more robust. Tissue +microarrays (TMAs) from 839 patients, and whole slide images from two patients +were used for training and evaluation of the models. The sections were derived +from four cohorts of breast cancer patients. TMAs from 21 patients from a fifth +cohort was used as a second test set. In quantitative evaluation, a mean Dice +score of 0.70, 0.79, and 0.75 for invasive epithelial cells, benign epithelial +cells, and in situ lesions, respectively, were achieved. In qualitative scoring +(0-5) by pathologists, results were best for all epithelium and invasive +epithelium, with scores of 4.7 and 4.4. Scores for benign epithelium and in +situ lesions were 3.7 and 2.0. The proposed model segmented epithelial cells in +HE stained breast cancer slides well, but further work is needed for accurate +division between the classes. Immunohistochemistry, together with pathologists' +annotations, enabled the creation of accurate ground truths. The model is made +freely available in FastPathology and the code is available at +https://github.com/AICAN-Research/breast-epithelium-segmentation + +
+
+ comment: 19 pages, 6 figures. Submitted to a scientific journal +
+
+
+
+
+ + ☆ ViStruct: Visual Structural Knowledge Extraction via Curriculum Guided + Code-Vision Representation EMNLP 2023 + + +
+ State-of-the-art vision-language models (VLMs) still have limited performance +in structural knowledge extraction, such as relations between objects. In this +work, we present ViStruct, a training framework to learn VLMs for effective +visual structural knowledge extraction. Two novel designs are incorporated. +First, we propose to leverage the inherent structure of programming language to +depict visual structural information. This approach enables explicit and +consistent representation of visual structural information of multiple +granularities, such as concepts, relations, and events, in a well-organized +structured format. Second, we introduce curriculum-based learning for VLMs to +progressively comprehend visual structures, from fundamental visual concepts to +intricate event structures. Our intuition is that lower-level knowledge may +contribute to complex visual structure understanding. Furthermore, we compile +and release a collection of datasets tailored for visual structural knowledge +extraction. We adopt a weakly-supervised approach to directly generate visual +event structures from captions for ViStruct training, capitalizing on abundant +image-caption pairs from the web. In experiments, we evaluate ViStruct on +visual structure prediction tasks, demonstrating its effectiveness in improving +the understanding of visual structures. The code is public at +\url{https://github.com/Yangyi-Chen/vi-struct}. + +
+
+ comment: Accepted to EMNLP 2023 +
+
+
+
+
+ + ☆ Towards Hetero-Client Federated Multi-Task Learning + + +
+ Federated Learning (FL) enables joint training across distributed clients +using their local data privately. Federated Multi-Task Learning (FMTL) builds +on FL to handle multiple tasks, assuming model congruity that identical model +architecture is deployed in each client. To relax this assumption and thus +extend real-world applicability, we introduce a novel problem setting, +Hetero-Client Federated Multi-Task Learning (HC-FMTL), to accommodate diverse +task setups. The main challenge of HC-FMTL is the model incongruity issue that +invalidates conventional aggregation methods. It also escalates the +difficulties in accurate model aggregation to deal with data and task +heterogeneity inherent in FMTL. To address these challenges, we propose the +FedHCA$^2$ framework, which allows for federated training of personalized +models by modeling relationships among heterogeneous clients. Drawing on our +theoretical insights into the difference between multi-task and federated +optimization, we propose the Hyper Conflict-Averse Aggregation scheme to +mitigate conflicts during encoder updates. Additionally, inspired by task +interaction in MTL, the Hyper Cross Attention Aggregation scheme uses +layer-wise cross attention to enhance decoder interactions while alleviating +model incongruity. Moreover, we employ learnable Hyper Aggregation Weights for +each client to customize personalized parameter updates. Extensive experiments +demonstrate the superior performance of FedHCA$^2$ in various HC-FMTL scenarios +compared to representative methods. Our code will be made publicly available. + +
+
+
+
+
+ + ☆ Hard Label Black Box Node Injection Attack on Graph Neural Networks + + +
+ While graph neural networks have achieved state-of-the-art performance in
+many real-world tasks, including graph classification and node classification,
+recent works have demonstrated that they are also extremely vulnerable to
+adversarial attacks. Most previous works have focused on attacking node
+classification networks under impractical white-box scenarios. In this work, we
+propose a non-targeted Hard Label Black Box Node Injection Attack on Graph
+Neural Networks, which, to the best of our knowledge, is the first of its kind.
+Under this setting, more real-world tasks can be studied because our attack
+assumes no prior knowledge about (1) the model architecture of the GNN we are
+attacking, (2) the model's gradients, or (3) the output logits of the target
+GNN model. Our attack builds on an existing edge perturbation attack, whose
+optimization process we restrict to formulate a node injection attack. We
+evaluate the performance of the attack on three datasets: COIL-DEL,
+IMDB-BINARY, and NCI1.
+
+
+
+
+
+ + ☆ Using Human Feedback to Fine-tune Diffusion Models without Any Reward + Model + + +
+ Using reinforcement learning with human feedback (RLHF) has shown significant +promise in fine-tuning diffusion models. Previous methods start by training a +reward model that aligns with human preferences, then leverage RL techniques to +fine-tune the underlying models. However, crafting an efficient reward model +demands extensive datasets, optimal architecture, and manual hyperparameter +tuning, making the process both time and cost-intensive. The direct preference +optimization (DPO) method, effective in fine-tuning large language models, +eliminates the necessity for a reward model. However, the extensive GPU memory +requirement of the diffusion model's denoising process hinders the direct +application of the DPO method. To address this issue, we introduce the Direct +Preference for Denoising Diffusion Policy Optimization (D3PO) method to +directly fine-tune diffusion models. The theoretical analysis demonstrates that +although D3PO omits training a reward model, it effectively functions as the +optimal reward model trained using human feedback data to guide the learning +process. This approach requires no training of a reward model, proving to be +more direct, cost-effective, and minimizing computational overhead. In +experiments, our method uses the relative scale of objectives as a proxy for +human preference, delivering comparable results to methods using ground-truth +rewards. Moreover, D3PO demonstrates the ability to reduce image distortion +rates and generate safer images, overcoming challenges lacking robust reward +models. + +
+
+
+
+
+ + ☆ NeutronOrch: Rethinking Sample-based GNN Training under CPU-GPU + Heterogeneous Environments + + +
+ Graph Neural Networks (GNNs) have demonstrated outstanding performance in +various applications. Existing frameworks utilize CPU-GPU heterogeneous +environments to train GNN models and integrate mini-batch and sampling +techniques to overcome the GPU memory limitation. In CPU-GPU heterogeneous +environments, we can divide sample-based GNN training into three steps: sample, +gather, and train. Existing GNN systems use different task orchestrating +methods to employ each step on CPU or GPU. After extensive experiments and +analysis, we find that existing task orchestrating methods fail to fully +utilize the heterogeneous resources, limited by inefficient CPU processing or +GPU resource contention. In this paper, we propose NeutronOrch, a system for +sample-based GNN training that incorporates a layer-based task orchestrating +method and ensures balanced utilization of the CPU and GPU. NeutronOrch +decouples the training process by layer and pushes down the training task of +the bottom layer to the CPU. This significantly reduces the computational load +and memory footprint of GPU training. To avoid inefficient CPU processing, +NeutronOrch only offloads the training of frequently accessed vertices to the +CPU and lets GPU reuse their embeddings with bounded staleness. Furthermore, +NeutronOrch provides a fine-grained pipeline design for the layer-based task +orchestrating method, fully overlapping different tasks on heterogeneous +resources while strictly guaranteeing bounded staleness. The experimental +results show that compared with the state-of-the-art GNN systems, NeutronOrch +can achieve up to 4.61x performance speedup. + +
+
+
+
+
+ + ☆ Cracking the Code of Negative Transfer: A Cooperative Game Theoretic + Approach for Cross-Domain Sequential Recommendation CIKM 2023 + + +
+ This paper investigates Cross-Domain Sequential Recommendation (CDSR), a +promising method that uses information from multiple domains (more than three) +to generate accurate and diverse recommendations, and takes into account the +sequential nature of user interactions. The effectiveness of these systems +often depends on the complex interplay among the multiple domains. In this +dynamic landscape, the problem of negative transfer arises, where heterogeneous +knowledge between dissimilar domains leads to performance degradation due to +differences in user preferences across these domains. As a remedy, we propose a +new CDSR framework that addresses the problem of negative transfer by assessing +the extent of negative transfer from one domain to another and adaptively +assigning low weight values to the corresponding prediction losses. To this +end, the amount of negative transfer is estimated by measuring the marginal +contribution of each domain to model performance based on a cooperative game +theory. In addition, a hierarchical contrastive learning approach that +incorporates information from the sequence of coarse-level categories into that +of fine-level categories (e.g., item level) when implementing contrastive +learning was developed to mitigate negative transfer. Despite the potentially +low relevance between domains at the fine-level, there may be higher relevance +at the category level due to its generalised and broader preferences. We show +that our model is superior to prior works in terms of model performance on two +real-world datasets across ten different domains. + +
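+ A sketch of estimating each domain's marginal contribution in the
+cooperative-game sense (a Shapley value computed over a made-up performance
+table); the paper's estimator is more involved, and the domain names and
+numbers below are purely hypothetical:
+
+# Editor's sketch: Shapley-style marginal contribution of each domain.
+from itertools import permutations
+
+def shapley(domains, performance):
+    """Average marginal contribution of each domain over all join orders."""
+    contrib = {d: 0.0 for d in domains}
+    orders = list(permutations(domains))
+    for order in orders:
+        coalition = set()
+        for d in order:
+            before = performance(frozenset(coalition))
+            coalition.add(d)
+            contrib[d] += performance(frozenset(coalition)) - before
+    return {d: c / len(orders) for d, c in contrib.items()}
+
+# Hypothetical performance table: "books" transfers well, "games" adds little.
+perf = {
+    frozenset(): 0.00,
+    frozenset({"movies"}): 0.60,
+    frozenset({"books"}): 0.55,
+    frozenset({"games"}): 0.40,
+    frozenset({"movies", "books"}): 0.70,
+    frozenset({"movies", "games"}): 0.58,   # games drags movies down
+    frozenset({"books", "games"}): 0.52,
+    frozenset({"movies", "books", "games"}): 0.66,
+}
+values = shapley(["movies", "books", "games"], perf.__getitem__)
+print(values)   # a low value flags a likely source of negative transfer
+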
+
+ comment: Accepted at 32nd ACM International Conference on Information and + Knowledge Management (CIKM 2023) +
+
+
+
+
+ + ☆ AS-LLM: When Algorithm Selection Meets Large Language Model + + +
+ Algorithm selection aims to identify the most suitable algorithm for solving +a specific problem before execution, which has become a critical process of the +AutoML. Current mainstream algorithm selection techniques rely heavily on +feature representations of various problems and employ the performance of each +algorithm as supervised information. However, there is a significant research +gap concerning the consideration of algorithm features. This gap is primarily +attributed to the inherent complexity of algorithms, making it particularly +challenging to find a universally effective feature extraction method that is +applicable across a diverse range of algorithms. Unfortunately, neglecting this +aspect undoubtedly impacts the accuracy of algorithm selection and indirectly +necessitates an increased volume of problem data for training purposes. This +paper takes a significant stride towards addressing this gap by proposing an +approach that integrates algorithm representation into the algorithm selection +process. Specifically, our proposed model employs distinct modules to extract +representations of both problems and algorithms, where the algorithm +representation leverages the capabilities of pre-trained LLMs in the realm of +code comprehension. Following the extraction of embedding vectors for both +algorithms and problems, the most suitable algorithm is determined through +calculations of matching degrees. Our experiments not only validate the +effectiveness of the proposed model but also showcase the performance of +different embedded pre-trained LLMs, which suggests that the proposed algorithm +selection framework holds the potential to serve as a baseline task for +evaluating the code representation capabilities of LLMs. + +
+
+
+
+
+ + ☆ Provably Efficient High-Dimensional Bandit Learning with Batched + Feedbacks + + +
+ We study high-dimensional multi-armed contextual bandits with batched +feedback where the $T$ steps of online interactions are divided into $L$ +batches. In specific, each batch collects data according to a policy that +depends on previous batches and the rewards are revealed only at the end of the +batch. Such a feedback structure is popular in applications such as +personalized medicine and online advertisement, where the online data often do +not arrive in a fully serial manner. We consider high-dimensional and linear +settings where the reward function of the bandit model admits either a sparse +or low-rank structure and ask how small a number of batches are needed for a +comparable performance with fully dynamic data in which $L = T$. For these +settings, we design a provably sample-efficient algorithm which achieves a $ +\mathcal{\tilde O}(s_0^2 \log^2 T)$ regret in the sparse case and $ +\mathcal{\tilde O} ( r ^2 \log^2 T)$ regret in the low-rank case, using only $L += \mathcal{O}( \log T)$ batches. Here $s_0$ and $r$ are the sparsity and rank +of the reward parameter in sparse and low-rank cases, respectively, and $ +\mathcal{\tilde O}(\cdot)$ omits logarithmic factors involving the feature +dimensions. In other words, our algorithm achieves regret bounds comparable to +those in fully sequential setting with only $\mathcal{O}( \log T)$ batches. Our +algorithm features a novel batch allocation method that adjusts the batch sizes +according to the estimation accuracy within each batch and cumulative regret. +Furthermore, we also conduct experiments with synthetic and real-world data to +validate our theory. + +
+
+
+
+
+ + ☆ SecureCut: Federated Gradient Boosting Decision Trees with Efficient + Machine Unlearning + + +
+ In response to legislation mandating companies to honor the right to be
+forgotten by erasing user data, it has become imperative to enable data removal
+in Vertical Federated Learning (VFL), where multiple parties provide private
+features for model training. In VFL, data removal, i.e., machine unlearning,
+often requires removing specific features across all samples under the privacy
+guarantees of federated learning. To address this challenge, we propose
+SecureCut, a novel Gradient Boosting Decision Tree (GBDT) framework that
+effectively enables both instance unlearning and feature unlearning without the
+need for retraining from scratch. Leveraging a robust GBDT structure, we enable
+effective data deletion while reducing degradation of model performance.
+Extensive experimental results on popular datasets demonstrate that our method
+achieves superior model utility and forgetfulness compared to state-of-the-art
+methods. To the best of our knowledge, this is the first work that investigates
+machine unlearning in VFL scenarios.
+
+
+
+
+
+ + ☆ ComPEFT: Compression for Communicating Parameter Efficient Updates via + Sparsification and Quantization + + +
+ Parameter-efficient fine-tuning (PEFT) techniques make it possible to
+efficiently adapt a language model to create "expert" models that specialize to
+new tasks or domains. Recent techniques in model merging and compositional
+generalization leverage these expert models by dynamically composing modules to
+improve zero/few-shot generalization. Despite the efficiency of PEFT methods,
+the size of expert models can make it onerous to retrieve expert models per
+query over high-latency networks like the Internet or serve multiple experts on
+a single GPU. To address these issues, we present ComPEFT, a novel method for
+compressing fine-tuning residuals (task vectors) of PEFT-based models. ComPEFT
+employs sparsification and ternary quantization to reduce the size of the PEFT
+module without performing any additional retraining while preserving or
+enhancing model performance. In extensive evaluations across T5, T0, and
+LLaMA-based models with 200M - 65B parameters, ComPEFT achieves compression
+ratios of 8x - 50x. In particular, we show that ComPEFT improves with scale -
+stronger models exhibit higher compressibility and better performance. For
+example, we show that ComPEFT applied to LLaMA outperforms QLoRA by 4.16% on
+MMLU with a storage size reduction of up to 26x. In addition, we show that the
+compressed experts produced by ComPEFT maintain few-shot compositional
+generalization capabilities, facilitate efficient communication and
+computation, and exhibit enhanced performance when merged. Lastly, we provide
+an analysis of different method components, compare ComPEFT with other PEFT
+methods, and test its efficacy for compressing the residual of full
+finetuning. Our code is available at https://github.com/prateeky2806/compeft.
+
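A rough sketch of the sparsify-then-ternarize step on a single task vector (fine-tuning residual). The density and the per-tensor rescaling rule used here (mean magnitude of the kept entries) are illustrative assumptions, not necessarily ComPEFT's exact choices.

```python
import torch

def compress_task_vector(delta: torch.Tensor, density: float = 0.05):
    """Keep the top `density` fraction of entries by magnitude, ternarize, and rescale."""
    flat = delta.flatten()
    k = max(1, int(density * flat.numel()))
    idx = flat.abs().topk(k).indices
    mask = torch.zeros_like(flat)
    mask[idx] = 1.0
    ternary = flat.sign() * mask                 # entries in {-1, 0, +1}
    scale = flat[idx].abs().mean()               # one scalar per tensor (illustrative rule)
    return (scale * ternary).view_as(delta), scale

delta = torch.randn(4, 256)                      # stand-in for W_finetuned - W_base
compressed, scale = compress_task_vector(delta)
print(int(compressed.count_nonzero()), round(float(scale), 3))
```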
+
+ comment: 25 Pages, 6 Figures, 16 Tables +
+
+
+
+
+ + ☆ SiGeo: Sub-One-Shot NAS via Information Theory and Geometry of Loss + Landscape + + +
+ Neural Architecture Search (NAS) has become a widely used tool for automating +neural network design. While one-shot NAS methods have successfully reduced +computational requirements, they often require extensive training. On the other +hand, zero-shot NAS utilizes training-free proxies to evaluate a candidate +architecture's test performance but has two limitations: (1) inability to use +the information gained as a network improves with training and (2) unreliable +performance, particularly in complex domains like RecSys, due to the +multi-modal data inputs and complex architecture configurations. To synthesize +the benefits of both methods, we introduce a "sub-one-shot" paradigm that +serves as a bridge between zero-shot and one-shot NAS. In sub-one-shot NAS, the +supernet is trained using only a small subset of the training data, a phase we +refer to as "warm-up." Within this framework, we present SiGeo, a proxy founded +on a novel theoretical framework that connects the supernet warm-up with the +efficacy of the proxy. Extensive experiments have shown that SiGeo, with the +benefit of warm-up, consistently outperforms state-of-the-art NAS proxies on +various established NAS benchmarks. When a supernet is warmed up, it can +achieve comparable performance to weight-sharing one-shot NAS methods, but with +a significant reduction ($\sim 60$\%) in computational costs. + +
+
+ comment: 24 pages, 7 figures +
+
+
+
+
+ + ☆ AdaptiveFL: Adaptive Heterogeneous Federated Learning for + Resource-Constrained AIoT Systems + + +
+ Although Federated Learning (FL) is a promising way to enable collaborative
+learning among Artificial Intelligence of Things (AIoT) devices, it suffers
+from low classification performance due to various device heterogeneity factors
+(e.g., computing capacity, memory size) and uncertain operating environments.
+To address these issues, this paper introduces an effective FL approach named
+AdaptiveFL based on a novel fine-grained width-wise model pruning strategy,
+which can generate various heterogeneous local models for heterogeneous AIoT
+devices. By using our proposed reinforcement learning-based device selection
+mechanism, AdaptiveFL can adaptively dispatch suitable heterogeneous models to
+corresponding AIoT devices on the fly based on their available resources for
+local training. Experimental results show that, compared to state-of-the-art
+methods, AdaptiveFL can achieve up to 16.83% inference improvements for both
+IID and non-IID scenarios.
+
+
+
+
+
+ + ☆ Have Your Cake and Eat It Too: Toward Efficient and Accurate Split + Federated Learning + + +
+ Due to its advantages in resource-constrained scenarios, Split Federated
+Learning (SFL) is promising in AIoT systems. However, due to data heterogeneity
+and stragglers, SFL suffers from the challenges of low inference accuracy and
+low efficiency. To address these issues, this paper presents a novel SFL
+approach, named Sliding Split Federated Learning (S$^2$FL), which adopts an
+adaptive sliding model split strategy and a data balance-based training
+mechanism. By dynamically dispatching different model portions to AIoT devices
+according to their computing capability, S$^2$FL can alleviate the low training
+efficiency caused by stragglers. By combining features uploaded by devices with
+different data distributions to generate multiple larger batches with a uniform
+distribution for back-propagation, S$^2$FL can alleviate the performance
+degradation caused by data heterogeneity. Experimental results demonstrate
+that, compared to conventional SFL, S$^2$FL can achieve up to 16.5\% inference
+accuracy improvement and 3.54X training acceleration.
+
+
+
+
+
+ + ☆ Multi-Objective Optimization via Wasserstein-Fisher-Rao Gradient Flow + + +
+ Multi-objective optimization (MOO) aims to optimize multiple, possibly +conflicting objectives with widespread applications. We introduce a novel +interacting particle method for MOO inspired by molecular dynamics simulations. +Our approach combines overdamped Langevin and birth-death dynamics, +incorporating a "dominance potential" to steer particles toward global Pareto +optimality. In contrast to previous methods, our method is able to relocate +dominated particles, making it particularly adept at managing Pareto fronts of +complicated geometries. Our method is also theoretically grounded as a +Wasserstein-Fisher-Rao gradient flow with convergence guarantees. Extensive +experiments confirm that our approach outperforms state-of-the-art methods on +challenging synthetic and real-world datasets. + +
+
+
+
+
+ + ☆ Testing Closeness of Multivariate Distributions via Ramsey Theory + + +
+ We investigate the statistical task of closeness (or equivalence) testing for +multidimensional distributions. Specifically, given sample access to two +unknown distributions $\mathbf p, \mathbf q$ on $\mathbb R^d$, we want to +distinguish between the case that $\mathbf p=\mathbf q$ versus $\|\mathbf +p-\mathbf q\|_{A_k} > \epsilon$, where $\|\mathbf p-\mathbf q\|_{A_k}$ denotes +the generalized ${A}_k$ distance between $\mathbf p$ and $\mathbf q$ -- +measuring the maximum discrepancy between the distributions over any collection +of $k$ disjoint, axis-aligned rectangles. Our main result is the first +closeness tester for this problem with {\em sub-learning} sample complexity in +any fixed dimension and a nearly-matching sample complexity lower bound. + In more detail, we provide a computationally efficient closeness tester with +sample complexity $O\left((k^{6/7}/ \mathrm{poly}_d(\epsilon)) +\log^d(k)\right)$. On the lower bound side, we establish a qualitatively +matching sample complexity lower bound of +$\Omega(k^{6/7}/\mathrm{poly}(\epsilon))$, even for $d=2$. These sample +complexity bounds are surprising because the sample complexity of the problem +in the univariate setting is $\Theta(k^{4/5}/\mathrm{poly}(\epsilon))$. This +has the interesting consequence that the jump from one to two dimensions leads +to a substantial increase in sample complexity, while increases beyond that do +not. + As a corollary of our general $A_k$ tester, we obtain $d_{\mathrm +TV}$-closeness testers for pairs of $k$-histograms on $\mathbb R^d$ over a +common unknown partition, and pairs of uniform distributions supported on the +union of $k$ unknown disjoint axis-aligned rectangles. + Both our algorithm and our lower bound make essential use of tools from +Ramsey theory. + +
+
+
+
+
+ + ☆ Optimal Transport with Cyclic Symmetry + + +
+ We propose novel fast algorithms for optimal transport (OT) utilizing a +cyclic symmetry structure of input data. Such OT with cyclic symmetry appears +universally in various real-world examples: image processing, urban planning, +and graph processing. Our main idea is to reduce OT to a small optimization +problem that has significantly fewer variables by utilizing cyclic symmetry and +various optimization techniques. On the basis of this reduction, our algorithms +solve the small optimization problem instead of the original OT. As a result, +our algorithms obtain the optimal solution and the objective function value of +the original OT faster than solving the original OT directly. In this paper, +our focus is on two crucial OT formulations: the linear programming OT (LOT) +and the strongly convex-regularized OT, which includes the well-known +entropy-regularized OT (EROT). Experiments show the effectiveness of our +algorithms for LOT and EROT in synthetic/real-world data that has a +strict/approximate cyclic symmetry structure. Through theoretical and +experimental results, this paper successfully introduces the concept of +symmetry into the OT research field for the first time. + +
+
+
+
+
+ + ☆ LIMIT: Less Is More for Instruction Tuning Across Evaluation Paradigms NeurIPS 2023 + + +
+ Large Language Models are traditionally finetuned on large instruction
+datasets. However, recent studies suggest that small, high-quality datasets can
+suffice for general-purpose instruction following. This lack of consensus
+surrounding finetuning best practices is in part due to rapidly diverging
+approaches to LLM evaluation. In this study, we ask whether a small number of
+diverse finetuning samples can improve performance on both traditional
+perplexity-based NLP benchmarks and on open-ended, model-based evaluation. We
+finetune open-source MPT-7B and MPT-30B models on instruction finetuning
+datasets of various sizes ranging from 1k to 60k samples. We find that subsets
+of 1k-6k instruction finetuning samples are sufficient to achieve good
+performance on both (1) traditional NLP benchmarks and (2) model-based
+evaluation. Finally, we show that mixing textbook-style and open-ended QA
+finetuning datasets optimizes performance on both evaluation paradigms.
+
+
+ comment: 36 pages, 12 figures, NeurIPS 2023 Workshop on Instruction Tuning and + Instruction Following +
+
+
+
+
+ + ☆ Combatting Human Trafficking in the Cyberspace: A Natural Language + Processing-Based Methodology to Analyze the Language in Online Advertisements + + +
+ This project tackles the pressing issue of human trafficking in online C2C +marketplaces through advanced Natural Language Processing (NLP) techniques. We +introduce a novel methodology for generating pseudo-labeled datasets with +minimal supervision, serving as a rich resource for training state-of-the-art +NLP models. Focusing on tasks like Human Trafficking Risk Prediction (HTRP) and +Organized Activity Detection (OAD), we employ cutting-edge Transformer models +for analysis. A key contribution is the implementation of an interpretability +framework using Integrated Gradients, providing explainable insights crucial +for law enforcement. This work not only fills a critical gap in the literature +but also offers a scalable, machine learning-driven approach to combat human +exploitation online. It serves as a foundation for future research and +practical applications, emphasizing the role of machine learning in addressing +complex social issues. + +
+
+
+
+
+ + ☆ White-Box Transformers via Sparse Rate Reduction: Compression Is All + There Is? + + +
+ In this paper, we contend that a natural objective of representation learning +is to compress and transform the distribution of the data, say sets of tokens, +towards a low-dimensional Gaussian mixture supported on incoherent subspaces. +The goodness of such a representation can be evaluated by a principled measure, +called sparse rate reduction, that simultaneously maximizes the intrinsic +information gain and extrinsic sparsity of the learned representation. From +this perspective, popular deep network architectures, including transformers, +can be viewed as realizing iterative schemes to optimize this measure. +Particularly, we derive a transformer block from alternating optimization on +parts of this objective: the multi-head self-attention operator compresses the +representation by implementing an approximate gradient descent step on the +coding rate of the features, and the subsequent multi-layer perceptron +sparsifies the features. This leads to a family of white-box transformer-like +deep network architectures, named CRATE, which are mathematically fully +interpretable. We show, by way of a novel connection between denoising and +compression, that the inverse to the aforementioned compressive encoding can be +realized by the same class of CRATE architectures. Thus, the so-derived +white-box architectures are universal to both encoders and decoders. +Experiments show that these networks, despite their simplicity, indeed learn to +compress and sparsify representations of large-scale real-world image and text +datasets, and achieve performance very close to highly engineered +transformer-based models: ViT, MAE, DINO, BERT, and GPT2. We believe the +proposed computational framework demonstrates great potential in bridging the +gap between theory and practice of deep learning, from a unified perspective of +data compression. Code is available at: https://ma-lab-berkeley.github.io/CRATE . + +
+
+ comment: This paper integrates the works arXiv:2306.01129 and + arXiv:2308.16271, as well as this under-review work: + https://openreview.net/forum?id=PvyOYleymy into a complete story. In this + paper, we improve the writing and organization, and also add conceptual, + empirical, and theoretical improvements over the previous work +
+
+
+
+
+ + ☆ Detecting out-of-distribution text using topological features of + transformer-based language models + + +
+ We attempt to detect out-of-distribution (OOD) text samples by applying
+Topological Data Analysis (TDA) to attention maps in transformer-based language
+models. We evaluate our proposed TDA-based approach for out-of-distribution
+detection on BERT, a transformer-based language model, and compare it to a more
+traditional OOD approach based on BERT CLS embeddings. We find that our
+TDA approach outperforms the CLS embedding approach at distinguishing
+in-distribution data (politics and entertainment news articles from HuffPost)
+from far out-of-domain samples (IMDB reviews), but its effectiveness
+deteriorates with near out-of-domain (CNN/Dailymail) or same-domain (business
+news articles from HuffPost) datasets.
+
+
+ comment: 12 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ PIE-NeRF: Physics-based Interactive Elastodynamics with NeRF + + +
+ We show that physics-based simulations can be seamlessly integrated with NeRF +to generate high-quality elastodynamics of real-world objects. Unlike existing +methods, we discretize nonlinear hyperelasticity in a meshless way, obviating +the necessity for intermediate auxiliary shape proxies like a tetrahedral mesh +or voxel grid. A quadratic generalized moving least square (Q-GMLS) is employed +to capture nonlinear dynamics and large deformation on the implicit model. Such +meshless integration enables versatile simulations of complex and codimensional +shapes. We adaptively place the least-square kernels according to the NeRF +density field to significantly reduce the complexity of the nonlinear +simulation. As a result, physically realistic animations can be conveniently +synthesized using our method for a wide range of hyperelastic materials at an +interactive rate. For more information, please visit our project page at +https://fytalon.github.io/pienerf/. + +
+
+
+
+
+ + ☆ Newton-CG methods for nonconvex unconstrained optimization with Hölder + continuous Hessian + + +
+ In this paper, we consider a nonconvex unconstrained optimization problem
+minimizing a twice differentiable objective function with H\"older continuous
+Hessian. Specifically, we first propose a Newton-conjugate gradient (Newton-CG)
+method for finding an approximate first-order stationary point (FOSP) of this
+problem, assuming the associated H\"older parameters are explicitly known.
+Then we develop a parameter-free Newton-CG method without requiring any prior
+knowledge of these parameters. To the best of our knowledge, this method is the
+first parameter-free second-order method achieving the best-known iteration and
+operation complexity for finding an approximate FOSP of this problem.
+Furthermore, we propose a Newton-CG method for finding an approximate
+second-order stationary point (SOSP) of the considered problem with high
+probability and establish its iteration and operation complexity. Finally, we
+present preliminary numerical results to demonstrate the superior practical
+performance of our parameter-free Newton-CG method over a well-known
+regularized Newton method.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2301.03139 +
+
+
+
+
+ + ☆ Stable Unlearnable Example: Enhancing the Robustness of Unlearnable + Examples via Stable Error-Minimizing Noise + + +
+ The open sourcing of large amounts of image data promotes the development of
+deep learning techniques. Along with this comes the privacy risk of these
+open-source image datasets being exploited by unauthorized third parties to
+train deep learning models for commercial or illegal purposes. To avoid the
+abuse of public data, a poisoning-based technique, the unlearnable example, has
+been proposed to significantly degrade the generalization performance of models
+by adding a kind of imperceptible noise to the data. To further enhance its
+robustness against adversarial training, existing works leverage iterative
+adversarial training on both the defensive noise and the surrogate model.
+However, it remains unknown whether the robustness of unlearnable examples
+primarily comes from the effect of enhancement in the surrogate model or the
+defensive noise. Observing that simply removing the adversarial noise in the
+training process of the defensive noise can improve the performance of robust
+unlearnable examples, we identify that it is solely the surrogate model's
+robustness that contributes to this performance. Furthermore, we find that a
+negative correlation exists between the robustness of the defensive noise and
+the protection performance, indicating an instability issue with the defensive
+noise. Motivated by this, to further boost robust unlearnable examples, we
+introduce stable error-minimizing noise (SEM), which trains the defensive noise
+against random perturbation instead of the time-consuming adversarial
+perturbation to improve the stability of the defensive noise. Through extensive
+experiments, we demonstrate that SEM achieves a new state-of-the-art
+performance on CIFAR-10, CIFAR-100, and ImageNet Subset in terms of both
+effectiveness and efficiency. The code is available at
+https://github.com/liuyixin-louis/Stable-Unlearnable-Example.
+
+
+ comment: 14 pages, 11 figures, 13 tables +
+
+
+
+
+ + ☆ Predict-Then-Optimize by Proxy: Learning Joint Models of Prediction and + Optimization + + +
+ Many real-world decision processes are modeled by optimization problems whose +defining parameters are unknown and must be inferred from observable data. The +Predict-Then-Optimize framework uses machine learning models to predict unknown +parameters of an optimization problem from features before solving. Recent +works show that decision quality can be improved in this setting by solving and +differentiating the optimization problem in the training loop, enabling +end-to-end training with loss functions defined directly on the resulting +decisions. However, this approach can be inefficient and requires handcrafted, +problem-specific rules for backpropagation through the optimization step. This +paper proposes an alternative method, in which optimal solutions are learned +directly from the observable features by predictive models. The approach is +generic, and based on an adaptation of the Learning-to-Optimize paradigm, from +which a rich variety of existing techniques can be employed. Experimental +evaluations show the ability of several Learning-to-Optimize methods to provide +efficient, accurate, and flexible solutions to an array of challenging +Predict-Then-Optimize problems. + +
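A toy illustration of the proxy idea: solve training instances offline with any solver, then fit a model that maps features directly to solutions, so no solver (or backpropagation through one) is needed in the training loop. The `solve` routine and the MLP below are hypothetical stand-ins, not the Learning-to-Optimize architectures studied in the paper.

```python
import numpy as np
from sklearn.neural_network import MLPRegressor

def solve(c):
    """Toy combinatorial problem: choose the single cheapest item under cost vector c."""
    x = np.zeros_like(c)
    x[np.argmin(c)] = 1.0
    return x

rng = np.random.default_rng(0)
features = rng.normal(size=(1000, 8))
true_params = features @ rng.normal(size=(8, 5))        # unknown problem parameters
solutions = np.stack([solve(c) for c in true_params])   # offline supervision = decisions

proxy = MLPRegressor(hidden_layer_sizes=(64, 64), max_iter=500, random_state=0)
proxy.fit(features, solutions)                          # features -> solution, end to end
print(proxy.predict(features[:2]).round(2))
```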
+
+
+
+
+ + ☆ Learning to Fly in Seconds + + +
+ Learning-based methods, particularly Reinforcement Learning (RL), hold great +promise for streamlining deployment, enhancing performance, and achieving +generalization in the control of autonomous multirotor aerial vehicles. Deep RL +has been able to control complex systems with impressive fidelity and agility +in simulation but the simulation-to-reality transfer often brings a +hard-to-bridge reality gap. Moreover, RL is commonly plagued by prohibitively +long training times. In this work, we propose a novel asymmetric +actor-critic-based architecture coupled with a highly reliable RL-based +training paradigm for end-to-end quadrotor control. We show how curriculum +learning and a highly optimized simulator enhance sample complexity and lead to +fast training times. To precisely discuss the challenges related to +low-level/end-to-end multirotor control, we also introduce a taxonomy that +classifies the existing levels of control abstractions as well as +non-linearities and domain parameters. Our framework enables +Simulation-to-Reality (Sim2Real) transfer for direct RPM control after only 18 +seconds of training on a consumer-grade laptop as well as its deployment on +microcontrollers to control a multirotor under real-time guarantees. Finally, +our solution exhibits competitive performance in trajectory tracking, as +demonstrated through various experimental comparisons with existing +state-of-the-art control solutions using a real Crazyflie nano quadrotor. We +open source the code including a very fast multirotor dynamics simulator that +can simulate about 5 months of flight per second on a laptop GPU. The fast +training times and deployment to a cheap, off-the-shelf quadrotor lower the +barriers to entry and help democratize the research and development of these +systems. + +
+
+
+
+
+ + ☆ FusionFrames: Efficient Architectural Aspects for Text-to-Video + Generation Pipeline + + +
+ Multimedia generation approaches occupy a prominent place in artificial
+intelligence research. Text-to-image models have achieved high-quality results
+over the last few years. However, video synthesis methods have only recently
+begun to develop. This paper presents a new two-stage latent diffusion
+text-to-video generation architecture based on the text-to-image diffusion
+model. The first stage concerns keyframe synthesis to outline the storyline of
+a video, while the second one is devoted to interpolation frame generation to
+make the movements of the scene and objects smooth. We compare several temporal
+conditioning approaches for keyframe generation. The results show the advantage
+of using separate temporal blocks over temporal layers in terms of metrics
+reflecting video generation quality aspects and human preference. The design of
+our interpolation model significantly reduces computational costs compared to
+other masked frame interpolation approaches. Furthermore, we evaluate different
+configurations of the MoVQ-based video decoding scheme to improve consistency
+and achieve better PSNR, SSIM, MSE, and LPIPS scores. Finally, we compare our
+pipeline with existing solutions and achieve top-2 scores overall and top-1
+among open-source solutions: CLIPSIM = 0.2976 and FVD = 433.054. Project page:
+https://ai-forever.github.io/kandinsky-video/
+
+
+ comment: Project page: https://ai-forever.github.io/kandinsky-video/ +
+
+
+
+
+ + ☆ Grad-Shafranov equilibria via data-free physics informed neural networks + + +
+ A large number of magnetohydrodynamic (MHD) equilibrium calculations are +often required for uncertainty quantification, optimization, and real-time +diagnostic information, making MHD equilibrium codes vital to the field of +plasma physics. In this paper, we explore a method for solving the +Grad-Shafranov equation by using Physics-Informed Neural Networks (PINNs). For +PINNs, we optimize neural networks by directly minimizing the residual of the +PDE as a loss function. We show that PINNs can accurately and effectively solve +the Grad-Shafranov equation with several different boundary conditions. We also +explore the parameter space by varying the size of the model, the learning +rate, and boundary conditions to map various trade-offs such as between +reconstruction error and computational speed. Additionally, we introduce a +parameterized PINN framework, expanding the input space to include variables +such as pressure, aspect ratio, elongation, and triangularity in order to +handle a broader range of plasma scenarios within a single network. +Parametrized PINNs could be used in future work to solve inverse problems such +as shape optimization. + +
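A minimal residual-loss sketch for a Grad-Shafranov PINN in PyTorch, assuming a Solov'ev-type right-hand side with arbitrarily chosen constants and a small rectangular collocation domain; the boundary-condition loss terms and the parameterized profiles used in the paper are omitted.

```python
import torch

# Residual of  R d/dR((1/R) dpsi/dR) + d2psi/dZ2 = A*R**2 + B  (Solov'ev-type RHS).
net = torch.nn.Sequential(
    torch.nn.Linear(2, 64), torch.nn.Tanh(),
    torch.nn.Linear(64, 64), torch.nn.Tanh(),
    torch.nn.Linear(64, 1),
)
A, B = 1.0, -0.5                                     # arbitrary illustrative constants

def residual(R, Z):
    R = R.requires_grad_(True)
    Z = Z.requires_grad_(True)
    psi = net(torch.stack([R, Z], dim=-1)).squeeze(-1)
    dR = torch.autograd.grad(psi.sum(), R, create_graph=True)[0]
    dZ = torch.autograd.grad(psi.sum(), Z, create_graph=True)[0]
    d2R = torch.autograd.grad(dR.sum(), R, create_graph=True)[0]
    d2Z = torch.autograd.grad(dZ.sum(), Z, create_graph=True)[0]
    lhs = d2R - dR / R + d2Z                         # R d/dR((1/R) dpsi/dR) expanded
    return lhs - (A * R**2 + B)

opt = torch.optim.Adam(net.parameters(), lr=1e-3)
for _ in range(100):                                 # interior collocation points only
    R = torch.rand(256) * 0.8 + 0.6                  # R in [0.6, 1.4]
    Z = torch.rand(256) - 0.5                        # Z in [-0.5, 0.5]
    loss = residual(R, Z).pow(2).mean()              # boundary loss intentionally omitted
    opt.zero_grad(); loss.backward(); opt.step()
print(float(loss))
```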
+
+
+
+
+ + ♻ ☆ Prediction of Effective Elastic Moduli of Rocks using Graph Neural + Networks + + +
+ This study presents a Graph Neural Networks (GNNs)-based approach for +predicting the effective elastic moduli of rocks from their digital CT-scan +images. We use the Mapper algorithm to transform 3D digital rock images into +graph datasets, encapsulating essential geometrical information. These graphs, +after training, prove effective in predicting elastic moduli. Our GNN model +shows robust predictive capabilities across various graph sizes derived from +various subcube dimensions. Not only does it perform well on the test dataset, +but it also maintains high prediction accuracy for unseen rocks and unexplored +subcube sizes. Comparative analysis with Convolutional Neural Networks (CNNs) +reveals the superior performance of GNNs in predicting unseen rock properties. +Moreover, the graph representation of microstructures significantly reduces GPU +memory requirements (compared to the grid representation for CNNs), enabling +greater flexibility in the batch size selection. This work demonstrates the +potential of GNN models in enhancing the prediction accuracy of rock properties +and boosting the efficiency of digital rock analysis. + +
+
+
+
+
+ + ♻ ☆ Edge2Node: Reducing Edge Prediction to Node Classification + + +
+ Despite the success of graph neural network models in node classification,
+edge prediction (the task of predicting missing or potential links between
+nodes in a graph) remains a challenging problem for these models. A common
+approach for edge prediction is to first obtain the embeddings of two nodes and
+then use a predefined scoring function to predict the existence of an edge
+between them. Here, we introduce a preliminary idea called Edge2Node, which
+directly obtains an embedding for each edge, without the need for a scoring
+function. The idea is to create a new graph H based on the graph G given for
+the edge prediction task, and then to reduce the edge prediction task on G to a
+node classification task on H. We anticipate that this introductory method
+could stimulate further investigations of the edge prediction task.
+
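One natural reading of the reduction, sketched with networkx: every edge of G becomes a node of H (a line-graph construction), so an edge label on G becomes a node label on H. The paper's exact construction of H, and how candidate or negative edges and their features are attached, may differ; this only illustrates the "edge in G -> node in H" step.

```python
import networkx as nx

G = nx.karate_club_graph()
H = nx.line_graph(G)                       # each edge of G is a node of H
labels_on_H = {n: 1 for n in H.nodes()}    # observed-edge labels become node labels
print(G.number_of_edges(), H.number_of_nodes())   # identical counts
```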
+
+
+
+
+ + ♻ ☆ A Good Feature Extractor Is All You Need for Weakly Supervised Learning + in Histopathology + + +
+ Deep learning is revolutionising pathology, offering novel opportunities in +disease prognosis and personalised treatment. Historically, stain normalisation +has been a crucial preprocessing step in computational pathology pipelines, and +persists into the deep learning era. Yet, with the emergence of feature +extractors trained using self-supervised learning (SSL) on diverse pathology +datasets, we call this practice into question. In an empirical evaluation of +publicly available feature extractors, we find that omitting stain +normalisation and image augmentations does not compromise downstream +performance, while incurring substantial savings in memory and compute. +Further, we show that the top-performing feature extractors are remarkably +robust to variations in stain and augmentations like rotation in their latent +space. Contrary to previous patch-level benchmarking studies, our approach +emphasises clinical relevance by focusing on slide-level prediction tasks in a +weakly supervised setting with external validation cohorts. This work +represents the most comprehensive robustness evaluation of public pathology SSL +feature extractors to date, involving more than 6,000 training runs across nine +tasks, five datasets, three downstream architectures, and various preprocessing +setups. Our findings stand to streamline digital pathology workflows by +minimising preprocessing needs and informing the selection of feature +extractors. + +
+
+
+
+
+ + ♻ ☆ GraphCFC: A Directed Graph Based Cross-Modal Feature Complementation + Approach for Multimodal Conversational Emotion Recognition + + +
+ Emotion Recognition in Conversation (ERC) plays a significant part in +Human-Computer Interaction (HCI) systems since it can provide empathetic +services. Multimodal ERC can mitigate the drawbacks of uni-modal approaches. +Recently, Graph Neural Networks (GNNs) have been widely used in a variety of +fields due to their superior performance in relation modeling. In multimodal +ERC, GNNs are capable of extracting both long-distance contextual information +and inter-modal interactive information. Unfortunately, since existing methods +such as MMGCN directly fuse multiple modalities, redundant information may be +generated and diverse information may be lost. In this work, we present a +directed Graph based Cross-modal Feature Complementation (GraphCFC) module that +can efficiently model contextual and interactive information. GraphCFC +alleviates the problem of heterogeneity gap in multimodal fusion by utilizing +multiple subspace extractors and Pair-wise Cross-modal Complementary (PairCC) +strategy. We extract various types of edges from the constructed graph for +encoding, thus enabling GNNs to extract crucial contextual and interactive +information more accurately when performing message passing. Furthermore, we +design a GNN structure called GAT-MLP, which can provide a new unified network +framework for multimodal learning. The experimental results on two benchmark +datasets show that our GraphCFC outperforms the state-of-the-art (SOTA) +approaches. + +
+
+ comment: Accepted by IEEE Transactions on Multimedia (TMM) +
+
+
+
+
+ + ♻ ☆ Tensor Train for Global Optimization Problems in Robotics + + +
+ The convergence of many numerical optimization techniques is highly dependent +on the initial guess given to the solver. To address this issue, we propose a +novel approach that utilizes tensor methods to initialize existing optimization +solvers near global optima. Our method does not require access to a database of +good solutions. We first transform the cost function, which depends on both +task parameters and optimization variables, into a probability density +function. Unlike existing approaches, the joint probability distribution of the +task parameters and optimization variables is approximated using the Tensor +Train model, which enables efficient conditioning and sampling. We treat the +task parameters as random variables, and for a given task, we generate samples +for decision variables from the conditional distribution to initialize the +optimization solver. Our method can produce multiple solutions (when they +exist) faster than existing methods. We first evaluate the approach on +benchmark functions for numerical optimization that are hard to solve using +gradient-based optimization solvers with a naive initialization. The results +show that the proposed method can generate samples close to global optima and +from multiple modes. We then demonstrate the generality and relevance of our +framework to robotics by applying it to inverse kinematics with obstacles and +motion planning problems with a 7-DoF manipulator. + +
+
+ comment: 25 pages, 21 figures +
+
+
+
+
+ + ♻ ☆ Integrating Pre-trained Language Model into Neural Machine Translation + + +
+ Neural Machine Translation (NMT) has become a significant technology in +natural language processing through extensive research and development. +However, the deficiency of high-quality bilingual language pair data still +poses a major challenge to improving NMT performance. Recent studies have been +exploring the use of contextual information from pre-trained language model +(PLM) to address this problem. Yet, the issue of incompatibility between PLM +and NMT model remains unresolved. This study proposes PLM-integrated NMT +(PiNMT) model to overcome the identified problems. PiNMT model consists of +three critical components, PLM Multi Layer Converter, Embedding Fusion, and +Cosine Alignment, each playing a vital role in providing effective PLM +information to NMT. Furthermore, two training strategies, Separate Learning +Rates and Dual Step Training, are also introduced in this paper. By +implementing the proposed PiNMT model and training strategy, we achieve +state-of-the-art performance on the IWSLT'14 En$\leftrightarrow$De dataset. +This study's outcomes are noteworthy as they demonstrate a novel approach for +efficiently integrating PLM with NMT to overcome incompatibility and enhance +performance. + +
+
+
+
+
+ + ♻ ☆ DNA-TEQ: An Adaptive Exponential Quantization of Tensors for DNN + Inference + + +
+ Quantization is commonly used in Deep Neural Networks (DNNs) to reduce the +storage and computational complexity by decreasing the arithmetical precision +of activations and weights, a.k.a. tensors. Efficient hardware architectures +employ linear quantization to enable the deployment of recent DNNs onto +embedded systems and mobile devices. However, linear uniform quantization +cannot usually reduce the numerical precision to less than 8 bits without +sacrificing high performance in terms of model accuracy. The performance loss +is due to the fact that tensors do not follow uniform distributions. In this +paper, we show that a significant amount of tensors fit into an exponential +distribution. Then, we propose DNA-TEQ to exponentially quantize DNN tensors +with an adaptive scheme that achieves the best trade-off between numerical +precision and accuracy loss. The experimental results show that DNA-TEQ +provides a much lower quantization bit-width compared to previous proposals, +resulting in an average compression ratio of 40% over the linear INT8 baseline, +with negligible accuracy loss and without retraining the DNNs. Besides, DNA-TEQ +leads the way in performing dot-product operations in the exponential domain, +which saves 66% of energy consumption on average for a set of widely used DNNs. + +
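A toy exponential quantizer in the spirit described above, representing |x| roughly as alpha * base**(-q) with a small unsigned integer exponent q plus a sign. The fixed base and the choice of alpha as the tensor maximum are purely illustrative; DNA-TEQ adapts these parameters per tensor to trade off precision and accuracy.

```python
import numpy as np

def exp_quantize(x, bits=4, base=1.5):
    """Quantize magnitudes onto an exponential grid: |x| ~= alpha * base**(-q)."""
    alpha = np.abs(x).max()
    q_max = 2 ** bits - 1
    with np.errstate(divide="ignore"):
        q = np.round(-np.log(np.abs(x) / alpha) / np.log(base))
    q = np.clip(np.nan_to_num(q, posinf=q_max), 0, q_max)   # exact zeros map to q_max
    dequant = np.sign(x) * alpha * base ** (-q)
    return q.astype(np.uint8), np.sign(x).astype(np.int8), alpha, dequant

x = np.random.default_rng(0).normal(size=1000).astype(np.float32)
q, s, alpha, xq = exp_quantize(x)
print("mean abs error:", float(np.mean(np.abs(x - xq))))
```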
+
+ comment: 10 pages, 8 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ A principled deep learning approach for geological facies generation + + +
+ The simulation of geological facies in an unobservable volume is essential in +various geoscience applications. Given the complexity of the problem, deep +generative learning is a promising approach to overcome the limitations of +traditional geostatistical simulation models, in particular their lack of +physical realism. This research aims to investigate the application of +generative adversarial networks and deep variational inference for +conditionally simulating meandering channels in underground volumes. In this +paper, we review the generative deep learning approaches, in particular the +adversarial ones and the stabilization techniques that aim to facilitate their +training. The proposed approach is tested on 2D and 3D simulations generated by +the stochastic process-based model Flumy. Morphological metrics are utilized to +compare our proposed method with earlier iterations of generative adversarial +networks. The results indicate that by utilizing recent stabilization +techniques, generative adversarial networks can efficiently sample from target +data distributions. Moreover, we demonstrate the ability to simulate +conditioned simulations through the latent variable model property of the +proposed approach. + +
+
+
+
+
+ + ♻ ☆ Leveraging Different Learning Styles for Improved Knowledge Distillation + in Biomedical Imaging + + +
+ Learning style refers to a type of training mechanism adopted by an +individual to gain new knowledge. As suggested by the VARK model, humans have +different learning preferences, like Visual (V), Auditory (A), Read/Write (R), +and Kinesthetic (K), for acquiring and effectively processing information. Our +work endeavors to leverage this concept of knowledge diversification to improve +the performance of model compression techniques like Knowledge Distillation +(KD) and Mutual Learning (ML). Consequently, we use a single-teacher and +two-student network in a unified framework that not only allows for the +transfer of knowledge from teacher to students (KD) but also encourages +collaborative learning between students (ML). Unlike the conventional approach, +where the teacher shares the same knowledge in the form of predictions or +feature representations with the student network, our proposed approach employs +a more diversified strategy by training one student with predictions and the +other with feature maps from the teacher. We further extend this knowledge +diversification by facilitating the exchange of predictions and feature maps +between the two student networks, enriching their learning experiences. We have +conducted comprehensive experiments with three benchmark datasets for both +classification and segmentation tasks using two different network architecture +combinations. These experimental results demonstrate that knowledge +diversification in a combined KD and ML framework outperforms conventional KD +or ML techniques (with similar network configuration) that only use predictions +with an average improvement of 2%. Furthermore, consistent improvement in +performance across different tasks, with various network architectures, and +over state-of-the-art techniques establishes the robustness and +generalizability of the proposed model + +
+
+ comment: Accepted in Computers in Biology and Medicine +
+
+
+
+
+ + ♻ ☆ From Principle to Practice: Vertical Data Minimization for Machine + Learning + + +
+ Aiming to train and deploy predictive models, organizations collect large +amounts of detailed client data, risking the exposure of private information in +the event of a breach. To mitigate this, policymakers increasingly demand +compliance with the data minimization (DM) principle, restricting data +collection to only that data which is relevant and necessary for the task. +Despite regulatory pressure, the problem of deploying machine learning models +that obey DM has so far received little attention. In this work, we address +this challenge in a comprehensive manner. We propose a novel vertical DM (vDM) +workflow based on data generalization, which by design ensures that no +full-resolution client data is collected during training and deployment of +models, benefiting client privacy by reducing the attack surface in case of a +breach. We formalize and study the corresponding problem of finding +generalizations that both maximize data utility and minimize empirical privacy +risk, which we quantify by introducing a diverse set of policy-aligned +adversarial scenarios. Finally, we propose a range of baseline vDM algorithms, +as well as Privacy-aware Tree (PAT), an especially effective vDM algorithm that +outperforms all baselines across several settings. We plan to release our code +as a publicly available library, helping advance the standardization of DM for +machine learning. Overall, we believe our work can help lay the foundation for +further exploration and adoption of DM principles in real-world applications. + +
+
+ comment: Accepted at IEEE S&P 2024 +
+
+
+
+
+ + ♻ ☆ Confident Naturalness Explanation (CNE): A Framework to Explain and + Assess Patterns Forming Naturalness + + +
+ Protected natural areas are regions that have been minimally affected by +human activities such as urbanization, agriculture, and other human +interventions. To better understand and map the naturalness of these areas, +machine learning models can be used to analyze satellite imagery. Specifically, +explainable machine learning methods show promise in uncovering patterns that +contribute to the concept of naturalness within these protected environments. +Additionally, addressing the uncertainty inherent in machine learning models is +crucial for a comprehensive understanding of this concept. However, existing +approaches have limitations. They either fail to provide explanations that are +both valid and objective or struggle to offer a quantitative metric that +accurately measures the contribution of specific patterns to naturalness, along +with the associated confidence. In this paper, we propose a novel framework +called the Confident Naturalness Explanation (CNE) framework. This framework +combines explainable machine learning and uncertainty quantification to assess +and explain naturalness. We introduce a new quantitative metric that describes +the confident contribution of patterns to the concept of naturalness. +Furthermore, we generate an uncertainty-aware segmentation mask for each input +sample, highlighting areas where the model lacks knowledge. To demonstrate the +effectiveness of our framework, we apply it to a study site in Fennoscandia +using two open-source satellite datasets. + +
+
+
+
+
+ + ♻ ☆ The effect of speech pathology on automatic speaker verification -- a + large-scale study + + +
+ Navigating the challenges of data-driven speech processing, one of the +primary hurdles is accessing reliable pathological speech data. While public +datasets appear to offer solutions, they come with inherent risks of potential +unintended exposure of patient health information via re-identification +attacks. Using a comprehensive real-world pathological speech corpus, with over +n=3,800 test subjects spanning various age groups and speech disorders, we +employed a deep-learning-driven automatic speaker verification (ASV) approach. +This resulted in a notable mean equal error rate (EER) of 0.89% with a standard +deviation of 0.06%, outstripping traditional benchmarks. Our comprehensive +assessments demonstrate that pathological speech overall faces heightened +privacy breach risks compared to healthy speech. Specifically, adults with +dysphonia are at heightened re-identification risks, whereas conditions like +dysarthria yield results comparable to those of healthy speakers. Crucially, +speech intelligibility does not influence the ASV system's performance metrics. +In pediatric cases, particularly those with cleft lip and palate, the recording +environment plays a decisive role in re-identification. Merging data across +pathological types led to a marked EER decrease, suggesting the potential +benefits of pathological diversity in ASV, accompanied by a logarithmic boost +in ASV effectiveness. In essence, this research sheds light on the dynamics +between pathological speech and speaker verification, emphasizing its crucial +role in safeguarding patient confidentiality in our increasingly digitized +healthcare era. + +
+
+ comment: Published in Scientific Reports +
+
+
+
+
+ + ♻ ☆ Hinge-Wasserstein: Mitigating Overconfidence in Regression by + Classification + + +
+ Computer vision systems that are deployed in safety-critical applications +need to quantify their output uncertainty. We study regression from images to +parameter values and here it is common to detect uncertainty by predicting +probability distributions. In this context, we investigate the +regression-by-classification paradigm which can represent multimodal +distributions, without a prior assumption on the number of modes. Through +experiments on a specifically designed synthetic dataset, we demonstrate that +traditional loss functions lead to poor probability distribution estimates and +severe overconfidence, in the absence of full ground truth distributions. In +order to alleviate these issues, we propose hinge-Wasserstein -- a simple +improvement of the Wasserstein loss that reduces the penalty for weak secondary +modes during training. This enables prediction of complex distributions with +multiple modes, and allows training on datasets where full ground truth +distributions are not available. In extensive experiments, we show that the +proposed loss leads to substantially better uncertainty estimation on two +challenging computer vision tasks: horizon line detection and stereo disparity +estimation. + +
+
+
+
+
+ + ♻ ☆ Droplets of Good Representations: Grokking as a First Order Phase + Transition in Two Layer Networks + + +
+ A key property of deep neural networks (DNNs) is their ability to learn new +features during training. This intriguing aspect of deep learning stands out +most clearly in recently reported Grokking phenomena. While mainly reflected as +a sudden increase in test accuracy, Grokking is also believed to be a beyond +lazy-learning/Gaussian Process (GP) phenomenon involving feature learning. Here +we apply a recent development in the theory of feature learning, the adaptive +kernel approach, to two teacher-student models with cubic-polynomial and +modular addition teachers. We provide analytical predictions on feature +learning and Grokking properties of these models and demonstrate a mapping +between Grokking and the theory of phase transitions. We show that after +Grokking, the state of the DNN is analogous to the mixed phase following a +first-order phase transition. In this mixed phase, the DNN generates useful +internal representations of the teacher that are sharply distinct from those +before the transition. + +
+
+
+
+
+ + ♻ ☆ Efficient Vision Transformer for Human Pose Estimation via Patch + Selection BMVC 2023 + + +
+ While Convolutional Neural Networks (CNNs) have been widely successful in 2D +human pose estimation, Vision Transformers (ViTs) have emerged as a promising +alternative to CNNs, boosting state-of-the-art performance. However, the +quadratic computational complexity of ViTs has limited their applicability for +processing high-resolution images. In this paper, we propose three methods for +reducing ViT's computational complexity, which are based on selecting and +processing a small number of most informative patches while disregarding +others. The first two methods leverage a lightweight pose estimation network to +guide the patch selection process, while the third method utilizes a set of +learnable joint tokens to ensure that the selected patches contain the most +important information about body joints. Experiments across six benchmarks show +that our proposed methods achieve a significant reduction in computational +complexity, ranging from 30% to 44%, with only a minimal drop in accuracy +between 0% and 3.5%. + +
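The step shared by the three proposed methods, keeping only the highest-scoring patches, can be sketched as a top-k gather. The scores would come from the lightweight pose network or the learnable joint tokens described above; here they are arbitrary placeholders.

```python
import torch

def select_patches(patch_tokens, scores, keep_ratio=0.6):
    """Keep the top `keep_ratio` fraction of patch tokens ranked by an informativeness score."""
    B, N, D = patch_tokens.shape
    k = max(1, int(keep_ratio * N))
    idx = scores.topk(k, dim=1).indices                      # (B, k)
    return patch_tokens.gather(1, idx.unsqueeze(-1).expand(B, k, D))

tokens = torch.randn(2, 196, 768)            # 14x14 patches of a 224x224 image
scores = torch.rand(2, 196)                  # placeholder per-patch scores
print(select_patches(tokens, scores).shape)  # torch.Size([2, 117, 768])
```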
+
+ comment: BMVC 2023 Oral Paper: https://proceedings.bmvc2023.org/167/ +
+
+
+
+
+ + ♻ ☆ Predict, Refine, Synthesize: Self-Guiding Diffusion Models for + Probabilistic Time Series Forecasting + + +
+ Diffusion models have achieved state-of-the-art performance in generative +modeling tasks across various domains. Prior works on time series diffusion +models have primarily focused on developing conditional models tailored to +specific forecasting or imputation tasks. In this work, we explore the +potential of task-agnostic, unconditional diffusion models for several time +series applications. We propose TSDiff, an unconditionally-trained diffusion +model for time series. Our proposed self-guidance mechanism enables +conditioning TSDiff for downstream tasks during inference, without requiring +auxiliary networks or altering the training procedure. We demonstrate the +effectiveness of our method on three different time series tasks: forecasting, +refinement, and synthetic data generation. First, we show that TSDiff is +competitive with several task-specific conditional forecasting methods +(predict). Second, we leverage the learned implicit probability density of +TSDiff to iteratively refine the predictions of base forecasters with reduced +computational overhead over reverse diffusion (refine). Notably, the generative +performance of the model remains intact -- downstream forecasters trained on +synthetic samples from TSDiff outperform forecasters that are trained on +samples from other state-of-the-art generative time series models, occasionally +even outperforming models trained on real data (synthesize). + +
+
+ comment: Code available at + https://github.com/amazon-science/unconditional-time-series-diffusion +
+
+
+
+
+ + ♻ ☆ Looking at the posterior: accuracy and uncertainty of neural-network + predictions + + +
+ Bayesian inference can quantify uncertainty in the predictions of neural +networks using posterior distributions for model parameters and network output. +By looking at these posterior distributions, one can separate the origin of +uncertainty into aleatoric and epistemic contributions. One goal of uncertainty +quantification is to inform on prediction accuracy. Here we show that +prediction accuracy depends on both epistemic and aleatoric uncertainty in an +intricate fashion that cannot be understood in terms of marginalized +uncertainty distributions alone. How the accuracy relates to epistemic and +aleatoric uncertainties depends not only on the model architecture, but also on +the properties of the dataset. We discuss the significance of these results for +active learning and introduce a novel acquisition function that outperforms +common uncertainty-based methods. To arrive at our results, we approximated the +posteriors using deep ensembles, for fully-connected, convolutional and +attention-based neural networks. + +
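For the classification case, a common way to split the predictive uncertainty of a deep ensemble into aleatoric and epistemic parts is the entropy decomposition below; this is a standard recipe, not necessarily the exact posterior quantities analysed in the paper.

```python
import numpy as np

def ensemble_uncertainty(probs, eps=1e-12):
    """probs: (n_members, n_classes) softmax outputs of the ensemble for one input."""
    mean = probs.mean(axis=0)
    total = -np.sum(mean * np.log(mean + eps))                         # predictive entropy
    aleatoric = -np.mean(np.sum(probs * np.log(probs + eps), axis=1))  # expected entropy
    epistemic = total - aleatoric                                      # mutual information
    return total, aleatoric, epistemic

probs = np.array([[0.7, 0.2, 0.1],
                  [0.2, 0.7, 0.1],
                  [0.6, 0.3, 0.1]])   # three ensemble members disagreeing on one input
print(ensemble_uncertainty(probs))
```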
+
+ comment: 26 pages, 10 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Counterfactual Explanation for Regression via Disentanglement in Latent + Space ICDM 2023 + + +
+ Counterfactual Explanations (CEs) help address the question: How can the
+factors that influence the prediction of a predictive model be changed to
+achieve a more favorable outcome from a user's perspective? Thus, they bear the
+potential to guide the user's interaction with AI systems since they represent
+easy-to-understand explanations. To be applicable, CEs need to be realistic and
+actionable. In the literature, various methods have been proposed to generate
+CEs. However, the majority of research on CEs focuses on classification
+problems where questions like "What should I do to get my rejected loan
+approved?" are raised. In practice, answering questions like "What should I do
+to increase my salary?" is more naturally framed as a regression problem. In
+this paper, we introduce a novel method to generate CEs for a pre-trained
+regressor by first disentangling the label-relevant from the label-irrelevant
+dimensions in the latent space. CEs are then generated by combining the
+label-irrelevant dimensions and the predefined output. The intuition behind
+this approach is that the ideal counterfactual search should focus on the
+label-irrelevant characteristics of the input and suggest changes toward
+target-relevant characteristics. Searching in the latent space could help
+achieve this goal. We show that our method maintains the characteristics of the
+query sample during the counterfactual search. In various experiments, we
+demonstrate that the proposed method is competitive based on different quality
+measures on image and tabular datasets in regression problem settings. It
+efficiently returns results closer to the original data manifold compared to
+three state-of-the-art methods, which is essential for realistic
+high-dimensional machine learning applications. Our code will be made available
+as an open-source package upon the publication of this work.
+
+
+ comment: CXAI workshop @ ICDM 2023. arXiv admin note: text overlap with + arXiv:2307.13390 +
+
+
+
+
+ + ♻ ☆ pSTarC: Pseudo Source Guided Target Clustering for Fully Test-Time + Adaptation WACV 2024 + + +
+ Test Time Adaptation (TTA) is a pivotal concept in machine learning, enabling +models to perform well in real-world scenarios, where test data distribution +differs from training. In this work, we propose a novel approach called pseudo +Source guided Target Clustering (pSTarC) addressing the relatively unexplored +area of TTA under real-world domain shifts. This method draws inspiration from +target clustering techniques and exploits the source classifier for generating +pseudo-source samples. The test samples are strategically aligned with these +pseudo-source samples, facilitating their clustering and thereby enhancing TTA +performance. pSTarC operates solely within the fully test-time adaptation +protocol, removing the need for actual source data. Experimental validation on +a variety of domain shift datasets, namely VisDA, Office-Home, DomainNet-126, +CIFAR-100C verifies pSTarC's effectiveness. This method exhibits significant +improvements in prediction accuracy along with efficient computational +requirements. Furthermore, we also demonstrate the universality of the pSTarC +framework by showing its effectiveness for the continuous TTA framework. The +source code for our method is available at https://manogna-s.github.io/pstarc + +
+
+ comment: Accepted in WACV 2024 +
+
+
+
+
+ + ♻ ☆ Goal Space Abstraction in Hierarchical Reinforcement Learning via + Set-Based Reachability Analysis + + +
+ Open-ended learning benefits immensely from the use of symbolic methods for +goal representation as they offer ways to structure knowledge for efficient and +transferable learning. However, the existing Hierarchical Reinforcement +Learning (HRL) approaches relying on symbolic reasoning are often limited as +they require a manual goal representation. The challenge in autonomously +discovering a symbolic goal representation is that it must preserve critical +information, such as the environment dynamics. In this paper, we propose a +developmental mechanism for goal discovery via an emergent representation that +abstracts (i.e., groups together) sets of environment states that have similar +roles in the task. We introduce a Feudal HRL algorithm that concurrently learns +both the goal representation and a hierarchical policy. The algorithm uses +symbolic reachability analysis for neural networks to approximate the +transition relation among sets of states and to refine the goal representation. +We evaluate our approach on complex navigation tasks, showing the learned +representation is interpretable, transferrable and results in data efficient +learning. + +
+
+
+
+
+ + ♻ ☆ Exploring Practitioner Perspectives On Training Data Attribution + Explanations NeurIPS + + +
+ Explainable AI (XAI) aims to provide insight into opaque model reasoning to +humans and as such is an interdisciplinary field by nature. In this paper, we +interviewed 10 practitioners to understand the possible usability of training +data attribution (TDA) explanations and to explore the design space of such an +approach. We confirmed that training data quality is often the most important +factor for high model performance in practice and model developers mainly rely +on their own experience to curate data. End-users expect explanations to +enhance their interaction with the model and do not necessarily prioritise but +are open to training data as a means of explanation. Within our participants, +we found that TDA explanations are not well-known and therefore not used. We +urge the community to focus on the utility of TDA techniques from the +human-machine collaboration perspective and broaden the TDA evaluation to +reflect common use cases in practice. + +
+
+ comment: Accepted to NeurIPS XAI in Action workshop 2023 +
+
+
+
+
+ + ♻ ☆ Explainable Anomaly Detection using Masked Latent Generative Modeling + + +
+ We present a novel time series anomaly detection method that achieves +excellent detection accuracy while offering a superior level of explainability. +Our proposed method, TimeVQVAE-AD, leverages masked generative modeling adapted +from the cutting-edge time series generation method known as TimeVQVAE. The +prior model is trained on the discrete latent space of a time-frequency domain. +Notably, the dimensional semantics of the time-frequency domain are preserved +in the latent space, enabling us to compute anomaly scores across different +frequency bands, which provides a better insight into the detected anomalies. +Additionally, the generative nature of the prior model allows for sampling +likely normal states for detected anomalies, enhancing the explainability of +the detected anomalies through counterfactuals. Our experimental evaluation on +the UCR Time Series Anomaly archive demonstrates that TimeVQVAE-AD +significantly surpasses the existing methods in terms of detection accuracy and +explainability. + +
+
+
+
+
+ + ♻ ☆ Neural Network Pruning by Gradient Descent + + +
+ The rapid increase in the parameters of deep learning models has led to +significant costs, challenging computational efficiency and model +interpretability. In this paper, we introduce a novel and straightforward +neural network pruning framework that incorporates the Gumbel-Softmax +technique. This framework enables the simultaneous optimization of a network's +weights and topology in an end-to-end process using stochastic gradient +descent. Empirical results demonstrate its exceptional compression capability, +maintaining high accuracy on the MNIST dataset with only 0.15\% of the original +network parameters. Moreover, our framework enhances neural network +interpretability, not only by allowing easy extraction of feature importance +directly from the pruned network but also by enabling visualization of feature +symmetry and the pathways of information propagation from features to outcomes. +Although the pruning strategy is learned through deep learning, it is +surprisingly intuitive and understandable, focusing on selecting key +representative features and exploiting data patterns to achieve extreme sparse +pruning. We believe our method opens a promising new avenue for deep learning +pruning and the creation of interpretable machine learning systems. + +
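To make the mechanism concrete, here is a minimal, hypothetical sketch of how a keep/drop mask can be learned jointly with the weights via the straight-through Gumbel-Softmax estimator; the layer, the two-logit parameterization, and the sparsity penalty are illustrative choices, not the paper's exact architecture.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class GumbelPrunedLinear(nn.Module):
    """Linear layer whose weights are masked by a learned Gumbel-Softmax gate."""

    def __init__(self, in_features, out_features, tau=1.0):
        super().__init__()
        self.weight = nn.Parameter(torch.randn(out_features, in_features) * 0.01)
        self.bias = nn.Parameter(torch.zeros(out_features))
        # Two logits per weight: index 0 = "drop", index 1 = "keep".
        self.mask_logits = nn.Parameter(torch.zeros(out_features, in_features, 2))
        self.tau = tau

    def forward(self, x):
        # Straight-through Gumbel-Softmax: hard 0/1 mask in the forward pass,
        # differentiable soft probabilities in the backward pass.
        mask = F.gumbel_softmax(self.mask_logits, tau=self.tau, hard=True)[..., 1]
        return F.linear(x, self.weight * mask, self.bias)

layer = GumbelPrunedLinear(784, 10)
x = torch.randn(32, 784)
logits = layer(x)
# A penalty on the keep-probabilities encourages sparsity alongside the task loss.
keep_prob = F.softmax(layer.mask_logits, dim=-1)[..., 1]
sparsity_penalty = keep_prob.mean()
```

In a full training loop, the task loss plus a weighted sparsity penalty would be minimized with ordinary stochastic gradient descent, which is the end-to-end weight-and-topology optimization the abstract describes.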
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Byzantine-Robust Learning on Heterogeneous Datasets via Bucketing ICLR 2022 + + +
+ In Byzantine robust distributed or federated learning, a central server wants +to train a machine learning model over data distributed across multiple +workers. However, a fraction of these workers may deviate from the prescribed +algorithm and send arbitrary messages. While this problem has received +significant attention recently, most current defenses assume that the workers +have identical data. For realistic cases when the data across workers are +heterogeneous (non-iid), we design new attacks which circumvent current +defenses, leading to significant loss of performance. We then propose a simple +bucketing scheme that adapts existing robust algorithms to heterogeneous +datasets at a negligible computational cost. We also theoretically and +experimentally validate our approach, showing that combining bucketing with +existing robust algorithms is effective against challenging attacks. Our work +is the first to establish guaranteed convergence for the non-iid Byzantine +robust problem under realistic assumptions. + +
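A toy sketch of the bucketing idea, under the assumption that worker updates are plain vectors and that coordinate-wise median stands in for whichever robust aggregator is used; the bucket size and the toy attack are placeholders.

```python
import numpy as np

def bucketing_aggregate(updates, bucket_size, rng, robust_agg=None):
    """Shuffle worker updates, average them within buckets, then apply a
    robust aggregation rule (here coordinate-wise median) to the bucket means."""
    if robust_agg is None:
        robust_agg = lambda u: np.median(u, axis=0)
    updates = np.asarray(updates, dtype=float)              # (n_workers, dim)
    perm = rng.permutation(len(updates))
    buckets = np.array_split(updates[perm], max(1, len(updates) // bucket_size))
    bucket_means = np.stack([b.mean(axis=0) for b in buckets])
    return robust_agg(bucket_means)

rng = np.random.default_rng(0)
honest = rng.normal(0.0, 0.1, size=(18, 5)) + 1.0           # honest gradients near 1
byzantine = np.full((2, 5), -50.0)                          # arbitrary bad messages
agg = bucketing_aggregate(np.vstack([honest, byzantine]), bucket_size=2, rng=rng)
print(agg)   # close to the honest mean despite the Byzantine workers
```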
+
+ comment: v5 is the camera-ready version of this paper on ICLR 2022 +
+
+
+
+
+ + ♻ ☆ On sampling determinantal and Pfaffian point processes on a quantum + computer + + +
+    DPPs were introduced by Macchi as a model in quantum optics in the 1970s. Since then, they have been widely used as models and subsampling tools in statistics and computer science. Most applications require sampling from a DPP, and given their quantum origin, it is natural to wonder whether sampling a DPP on a quantum computer is easier than on a classical one. We focus here on DPPs over a finite state space, which are distributions over the subsets of $\{1,\dots,N\}$ parametrized by an $N\times N$ Hermitian kernel matrix. Vanilla sampling consists of two steps, of respective costs $\mathcal{O}(N^3)$ and $\mathcal{O}(Nr^2)$ operations on a classical computer, where $r$ is the rank of the kernel matrix. A large first part of the current paper explains why the state-of-the-art in quantum simulation of fermionic systems already yields quantum DPP sampling algorithms. We then modify existing quantum circuits, and discuss their insertion in a full DPP sampling pipeline that starts from practical kernel specifications. The bottom line is that, with $P$ (classical) parallel processors, we can divide the preprocessing cost by $P$ and build a quantum circuit with $\mathcal{O}(Nr)$ gates that samples a given DPP, with depth varying from $\mathcal{O}(N)$ to $\mathcal{O}(r\log N)$ depending on qubit-communication constraints on the target machine. We also connect existing work on the simulation of superconductors to Pfaffian point processes, which generalize DPPs and would be a natural addition to the machine learner's toolbox. In particular, we describe "projective" Pfaffian point processes, the cardinality of which has constant parity, almost surely. Finally, the circuits are empirically validated on a classical simulator and on 5-qubit IBM machines.
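For reference, a compact sketch of the classical two-step sampler whose costs the abstract quotes (an $\mathcal{O}(N^3)$ spectral step followed by sequential sampling from a projection DPP); this is a generic textbook-style implementation, not the quantum circuit construction of the paper.

```python
import numpy as np

def sample_dpp(K, rng):
    """Classical two-step DPP sampler for a Hermitian correlation kernel K with
    eigenvalues in [0, 1]. Step 1 is the O(N^3) eigendecomposition; step 2
    samples from the induced projection DPP in roughly O(N r^2) operations."""
    lam, V = np.linalg.eigh(K)
    V = V[:, rng.random(len(lam)) < lam]      # keep eigenvector i w.p. lambda_i
    sample = []
    while V.shape[1] > 0:
        # With orthonormal columns, the squared row norms give a probability vector.
        probs = np.sum(V ** 2, axis=1) / V.shape[1]
        i = rng.choice(len(probs), p=probs / probs.sum())
        sample.append(int(i))
        # Project the remaining columns onto the subspace with coordinate i = 0.
        j = np.argmax(np.abs(V[i, :]))
        V = V - np.outer(V[:, j] / V[i, j], V[i, :])
        V = np.delete(V, j, axis=1)
        if V.shape[1] > 0:
            V, _ = np.linalg.qr(V)            # re-orthonormalize for stability
    return sorted(sample)

rng = np.random.default_rng(0)
U, _ = np.linalg.qr(rng.normal(size=(8, 8)))
K = U @ np.diag(rng.random(8)) @ U.T          # a random valid correlation kernel
print(sample_dpp(K, rng))
```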
+
+ comment: 53 pages, 9 figures. Additional results about parity of cardinality + of PfPP samples. Minor corrections in Section 5 and slight generalization of + Lemma 5.4. Extra example and derivations in appendix +
+
+
+
+
+ + ♻ ☆ FedSN: A General Federated Learning Framework over LEO Satellite + Networks + + +
+    Recently, a large number of Low Earth Orbit (LEO) satellites have been launched and deployed successfully in space by commercial companies, such as SpaceX. Equipped with multimodal sensors, LEO satellites serve not only communication but also various machine learning applications, such as space modulation recognition, remote sensing image classification, etc. However, the ground station (GS) may be incapable of downloading such a large volume of raw sensing data for centralized model training due to the limited contact time with LEO satellites (e.g., 5 minutes). Therefore, federated learning (FL) has emerged as a promising solution to address this problem via on-device training. Unfortunately, to enable FL on LEO satellites, we still face three critical challenges: i) heterogeneous computing and memory capabilities, ii) limited uplink rate, and iii) model staleness. To this end, we propose FedSN as a general FL framework to tackle the above challenges, and fully explore data diversity on LEO satellites. Specifically, we first present a novel sub-structure scheme to enable heterogeneous local model training considering different computing, memory, and communication constraints on LEO satellites. Additionally, we propose a pseudo-synchronous model aggregation strategy to dynamically schedule model aggregation for compensating model staleness. To further demonstrate the effectiveness of FedSN, we evaluate it using space modulation recognition and remote sensing image classification tasks by leveraging data from real-world satellite networks. Extensive experimental results demonstrate that the FedSN framework achieves higher accuracy and lower computing and communication overhead than state-of-the-art benchmarks, and confirm the effectiveness of each component in FedSN.
+
+ comment: 14 pages, 17 figures +
+
+
+
+
+ + ♻ ☆ In-Context Learning Functions with Varying Number of Minima + + +
+    Large Language Models (LLMs) have proven effective at In-Context Learning (ICL), an ability that allows them to create predictors from labeled examples. Few studies have explored the interplay between ICL and specific properties of the functions it attempts to approximate. In our study, we use a formal framework to explore ICL and propose a new task of approximating functions with a varying number of minima. We implement a method that allows for producing functions with given inputs as minima. We find that increasing the number of minima degrades ICL performance. At the same time, our evaluation shows that ICL outperforms a 2-layer Neural Network (2NN) model. Furthermore, ICL learns faster than 2NN in all settings. We validate the findings through a set of few-shot experiments across various hyperparameter configurations.
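As a concrete illustration of "functions with given inputs as minima" (the paper's own construction may differ), one simple recipe is f(x) = prod_i (x - m_i)^2, which is nonnegative and attains its minima exactly at the chosen points.

```python
import numpy as np

def make_function_with_minima(minima):
    """Return f(x) = prod_i (x - m_i)^2, whose (global) minima, all with value 0,
    are exactly the chosen points m_i."""
    minima = np.asarray(minima, dtype=float)
    def f(x):
        x = np.asarray(x, dtype=float)[..., None]   # broadcast over the minima
        return np.prod((x - minima) ** 2, axis=-1)
    return f

f = make_function_with_minima([-2.0, 0.5, 3.0])
print(f([-2.0, 0.5, 3.0]))   # -> [0. 0. 0.], the three prescribed minima
```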
+
+
+
+
+ + ♻ ☆ Degree-Preserving Randomized Response for Graph Neural Networks under + Local Differential Privacy + + +
+    Differentially private GNNs (Graph Neural Networks) have been recently studied to provide high accuracy in various tasks on graph data while strongly protecting user privacy. In particular, a recent study proposes an algorithm to protect each user's feature vector in an attributed graph with LDP (Local Differential Privacy), a strong privacy notion without a trusted third party. However, this algorithm does not protect edges (friendships) in a social graph, hence cannot protect user privacy in unattributed graphs. How to provide strong privacy with high accuracy in unattributed graphs remains open.
+    In this paper, we propose a novel LDP algorithm called the DPRR (Degree-Preserving Randomized Response) to provide LDP for edges in GNNs. Our DPRR preserves each user's degree, and hence the graph structure, while providing edge LDP. Technically, our DPRR uses Warner's RR (Randomized Response) and strategic edge sampling, where each user's sampling probability is automatically tuned using the Laplacian mechanism to preserve the degree information under edge LDP. We also propose a privacy budget allocation method to make the noise in both Warner's RR and the Laplacian mechanism small. We focus on graph classification as a task of GNNs and evaluate the DPRR using three social graph datasets. Our experimental results show that the DPRR significantly outperforms three baselines and provides accuracy close to a non-private algorithm in all datasets with a reasonable privacy budget, e.g., epsilon=1.
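A toy, illustrative sketch of the two ingredients named above, Warner's randomized response on each potential edge followed by degree-preserving subsampling of the noisy neighbor list; the exact sampling-probability tuning and privacy accounting in the paper are more careful than this placeholder.

```python
import numpy as np

def dprr_report_toy(neighbors, n_users, eps_edge, eps_degree, rng):
    """Toy degree-preserving randomized response for one user's neighbor list."""
    # Warner's RR: keep each adjacency bit unchanged with prob e^eps / (1 + e^eps).
    p_keep = np.exp(eps_edge) / (1.0 + np.exp(eps_edge))
    adj = np.zeros(n_users, dtype=bool)
    adj[neighbors] = True
    noisy = adj ^ (rng.random(n_users) < (1.0 - p_keep))

    # Noisy degree via the Laplace mechanism, then a sampling probability chosen
    # so the expected reported degree roughly matches it (placeholder estimator).
    noisy_degree = max(rng.laplace(loc=len(neighbors), scale=1.0 / eps_degree), 0.0)
    expected_reported = p_keep * noisy_degree + (1 - p_keep) * (n_users - noisy_degree)
    q = min(1.0, noisy_degree / max(expected_reported, 1e-12))
    return np.flatnonzero(noisy & (rng.random(n_users) < q))

rng = np.random.default_rng(0)
reported = dprr_report_toy([1, 2, 3, 4, 5], n_users=1000,
                           eps_edge=1.0, eps_degree=1.0, rng=rng)
print(len(reported))   # on the order of the true degree, not of n_users
```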
+
+
+
+
+ + ♻ ☆ ProlificDreamer: High-Fidelity and Diverse Text-to-3D Generation with + Variational Score Distillation NeurIPS 2023 + + +
+ Score distillation sampling (SDS) has shown great promise in text-to-3D +generation by distilling pretrained large-scale text-to-image diffusion models, +but suffers from over-saturation, over-smoothing, and low-diversity problems. +In this work, we propose to model the 3D parameter as a random variable instead +of a constant as in SDS and present variational score distillation (VSD), a +principled particle-based variational framework to explain and address the +aforementioned issues in text-to-3D generation. We show that SDS is a special +case of VSD and leads to poor samples with both small and large CFG weights. In +comparison, VSD works well with various CFG weights as ancestral sampling from +diffusion models and simultaneously improves the diversity and sample quality +with a common CFG weight (i.e., $7.5$). We further present various improvements +in the design space for text-to-3D such as distillation time schedule and +density initialization, which are orthogonal to the distillation algorithm yet +not well explored. Our overall approach, dubbed ProlificDreamer, can generate +high rendering resolution (i.e., $512\times512$) and high-fidelity NeRF with +rich structure and complex effects (e.g., smoke and drops). Further, +initialized from NeRF, meshes fine-tuned by VSD are meticulously detailed and +photo-realistic. Project page and codes: +https://ml.cs.tsinghua.edu.cn/prolificdreamer/ + +
+
+ comment: NeurIPS 2023 (Spotlight) +
+
+
+
+
+ + ♻ ☆ Patch-Mix Contrastive Learning with Audio Spectrogram Transformer on + Respiratory Sound Classification INTERSPEECH 2023 + + +
+    Respiratory sound contains crucial information for the early diagnosis of fatal lung diseases. Since the COVID-19 pandemic, there has been a growing interest in contact-free medical care based on electronic stethoscopes. To this end, cutting-edge deep learning models have been developed to diagnose lung diseases; however, the task remains challenging due to the scarcity of medical data. In this study, we demonstrate that a model pretrained on large-scale visual and audio datasets can be generalized to the respiratory sound classification task. In addition, we introduce a straightforward Patch-Mix augmentation, which randomly mixes patches between different samples, with the Audio Spectrogram Transformer (AST). We further propose a novel and effective Patch-Mix Contrastive Learning to distinguish the mixed representations in the latent space. Our method achieves state-of-the-art performance on the ICBHI dataset, outperforming the prior leading score by 4.08%.
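A minimal sketch of the Patch-Mix operation as described (randomly swapping a subset of patch embeddings between samples in a batch); the tensor shapes and mixing ratio are illustrative, and the contrastive loss built on top of the mix mask is omitted.

```python
import torch

def patch_mix(patch_embeddings, mix_ratio=0.5, generator=None):
    """Randomly replace a fraction of each sample's patch embeddings with the
    corresponding patches of another (randomly permuted) sample in the batch.

    patch_embeddings: (batch, num_patches, dim) tensor, e.g. AST patch tokens.
    Returns the mixed batch, the permutation, and the per-patch mix mask.
    """
    b, n, _ = patch_embeddings.shape
    perm = torch.randperm(b, generator=generator)
    mask = torch.rand(b, n, 1, generator=generator) < mix_ratio
    mixed = torch.where(mask, patch_embeddings[perm], patch_embeddings)
    return mixed, perm, mask.squeeze(-1)

x = torch.randn(4, 101, 768)            # batch of AST-style patch tokens
mixed, perm, mask = patch_mix(x, mix_ratio=0.3)
lam = 1.0 - mask.float().mean(dim=1)    # fraction of original patches kept per sample
```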
+
+ comment: INTERSPEECH 2023, Code URL: + https://github.com/raymin0223/patch-mix_contrastive_learning +
+
+
+
+
+ + ♻ ☆ PhysGaussian: Physics-Integrated 3D Gaussians for Generative Dynamics + + +
+ We introduce PhysGaussian, a new method that seamlessly integrates physically +grounded Newtonian dynamics within 3D Gaussians to achieve high-quality novel +motion synthesis. Employing a custom Material Point Method (MPM), our approach +enriches 3D Gaussian kernels with physically meaningful kinematic deformation +and mechanical stress attributes, all evolved in line with continuum mechanics +principles. A defining characteristic of our method is the seamless integration +between physical simulation and visual rendering: both components utilize the +same 3D Gaussian kernels as their discrete representations. This negates the +necessity for triangle/tetrahedron meshing, marching cubes, "cage meshes," or +any other geometry embedding, highlighting the principle of "what you see is +what you simulate (WS$^2$)." Our method demonstrates exceptional versatility +across a wide variety of materials--including elastic entities, metals, +non-Newtonian fluids, and granular materials--showcasing its strong +capabilities in creating diverse visual content with novel viewpoints and +movements. Our project page is at: https://xpandora.github.io/PhysGaussian/ + +
+
+
+
+
+ + ♻ ☆ EdgeFM: Leveraging Foundation Model for Open-set Learning on the Edge + + +
+    Deep Learning (DL) models have been widely deployed on IoT devices with the help of advancements in DL algorithms and chips. However, the limited resources of edge devices make these on-device DL models hard to generalize to diverse environments and tasks. Although the recently emerged foundation models (FMs) show impressive generalization power, how to effectively leverage the rich knowledge of FMs on resource-limited edge devices remains unexplored. In this paper, we propose EdgeFM, a novel edge-cloud cooperative system with open-set recognition capability. EdgeFM selectively uploads unlabeled data to query the FM on the cloud and customizes the specific knowledge and architectures for edge models. Meanwhile, EdgeFM conducts dynamic model switching at run-time, taking into account both data uncertainty and dynamic network variations, which keeps the accuracy close to that of the original FM. We implement EdgeFM using two FMs on two edge platforms. We evaluate EdgeFM on three public datasets and two self-collected datasets. Results show that EdgeFM can reduce the end-to-end latency by up to 3.2x and achieve a 34.3% accuracy increase compared with the baseline.
+
+ comment: Accepted to the 21th ACM Conference on Embedded Networked Sensor + Systems (SenSys 2023) +
+
+
+
+
+ + ♻ ☆ Sensor Fault Detection and Isolation in Autonomous Nonlinear Systems + Using Neural Network-Based Observers + + +
+    This paper presents a novel observer-based approach to detect and isolate faulty sensors in nonlinear systems. The proposed sensor fault detection and isolation (s-FDI) method applies to a general class of nonlinear systems. Our focus is on s-FDI for two types of faults: complete failure and sensor degradation. The key aspect of this approach lies in the utilization of a neural network-based Kazantzis-Kravaris/Luenberger (KKL) observer. The neural network is trained to learn the dynamics of the observer, enabling accurate output predictions of the system. Sensor faults are detected by comparing the actual output measurements with the predicted values. If the difference surpasses a theoretical threshold, a sensor fault is detected. To identify and isolate which sensor is faulty, we compare the numerical difference of each sensor measurement with an empirically derived threshold. We derive both theoretical and empirical thresholds for detection and isolation, respectively. Notably, the proposed approach is robust to measurement noise and system uncertainties. Its effectiveness is demonstrated through numerical simulations of sensor faults in a network of Kuramoto oscillators.
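Schematically, the detection and isolation logic reduces to residual thresholding against the observer's predictions; the sketch below treats the learned KKL observer as a black-box predictor and uses placeholder thresholds.

```python
import numpy as np

def detect_and_isolate(y_measured, y_predicted, detection_threshold, isolation_thresholds):
    """Residual-based sensor fault detection and isolation.

    y_measured, y_predicted: arrays of shape (n_sensors,) at the current time.
    Returns (fault_detected, indices_of_suspected_sensors).
    """
    residuals = np.abs(y_measured - y_predicted)
    fault_detected = np.linalg.norm(residuals) > detection_threshold
    suspects = (np.flatnonzero(residuals > isolation_thresholds)
                if fault_detected else np.array([], dtype=int))
    return fault_detected, suspects

y_hat = np.array([0.10, -0.32, 0.05])   # observer (e.g. a learned KKL observer) prediction
y = np.array([0.12, -0.30, 1.90])       # sensor 2 has degraded
print(detect_and_isolate(y, y_hat, detection_threshold=0.5,
                         isolation_thresholds=np.array([0.2, 0.2, 0.2])))
```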
+
+
+
+
+ + ♻ ☆ LASER: A Neuro-Symbolic Framework for Learning Spatial-Temporal Scene + Graphs with Weak Supervision + + +
+ We propose LASER, a neuro-symbolic approach to learn semantic video +representations that capture rich spatial and temporal properties in video data +by leveraging high-level logic specifications. In particular, we formulate the +problem in terms of alignment between raw videos and spatio-temporal logic +specifications. The alignment algorithm leverages a differentiable symbolic +reasoner and a combination of contrastive, temporal, and semantics losses. It +effectively and efficiently trains low-level perception models to extract +fine-grained video representation in the form of a spatio-temporal scene graph +that conforms to the desired high-level specification. In doing so, we explore +a novel methodology that weakly supervises the learning of video semantic +representations through logic specifications. We evaluate our method on two +datasets with rich spatial and temporal specifications: +20BN-Something-Something and MUGEN. We demonstrate that our method learns +better fine-grained video semantics than existing baselines. + +
+
+
+
+
+ + ♻ ☆ Self supervised convolutional kernel based handcrafted feature + harmonization: Enhanced left ventricle hypertension disease phenotyping on + echocardiography + + +
+    Radiomics, a medical imaging technique, extracts quantitative handcrafted features from images to predict diseases. Harmonization of those features ensures consistent feature extraction across various imaging devices and protocols. Methods for harmonization include standardized imaging protocols, statistical adjustments, and evaluating feature robustness. Myocardial diseases such as Left Ventricular Hypertrophy (LVH) and Hypertensive Heart Disease (HHD) are diagnosed via echocardiography, but variable imaging settings pose challenges. Harmonization techniques are crucial for applying handcrafted features to disease diagnosis in such scenarios. Self-supervised learning (SSL) enhances data understanding within limited datasets and adapts to diverse data settings. ConvNeXt-V2 integrates convolutional layers into SSL, displaying superior performance in various tasks. This study focuses on convolutional filters within SSL, using them as preprocessing to convert images into feature maps for handcrafted feature harmonization. Our proposed method excelled in harmonization evaluation and exhibited superior LVH classification performance compared to existing methods.
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Learning continuous models for continuous physics + + +
+ Dynamical systems that evolve continuously over time are ubiquitous +throughout science and engineering. Machine learning (ML) provides data-driven +approaches to model and predict the dynamics of such systems. A core issue with +this approach is that ML models are typically trained on discrete data, using +ML methodologies that are not aware of underlying continuity properties. This +results in models that often do not capture any underlying continuous dynamics +-- either of the system of interest, or indeed of any related system. To +address this challenge, we develop a convergence test based on numerical +analysis theory. Our test verifies whether a model has learned a function that +accurately approximates an underlying continuous dynamics. Models that fail +this test fail to capture relevant dynamics, rendering them of limited utility +for many scientific prediction tasks; while models that pass this test enable +both better interpolation and better extrapolation in multiple ways. Our +results illustrate how principled numerical analysis methods can be coupled +with existing ML training/testing methodologies to validate models for science +and engineering applications. + +
+
+ comment: 39 pages +
+
+
+
+
+ + ♻ ☆ On the Tradeoff between Privacy Preservation and Byzantine-Robustness in + Decentralized Learning + + +
+ This paper jointly considers privacy preservation and Byzantine-robustness in +decentralized learning. In a decentralized network, honest-but-curious agents +faithfully follow the prescribed algorithm, but expect to infer their +neighbors' private data from messages received during the learning process, +while dishonest-and-Byzantine agents disobey the prescribed algorithm, and +deliberately disseminate wrong messages to their neighbors so as to bias the +learning process. For this novel setting, we investigate a generic +privacy-preserving and Byzantine-robust decentralized stochastic gradient +descent (SGD) framework, in which Gaussian noise is injected to preserve +privacy and robust aggregation rules are adopted to counteract Byzantine +attacks. We analyze its learning error and privacy guarantee, discovering an +essential tradeoff between privacy preservation and Byzantine-robustness in +decentralized learning -- the learning error caused by defending against +Byzantine attacks is exacerbated by the Gaussian noise added to preserve +privacy. For a class of state-of-the-art robust aggregation rules, we give +unified analysis of the "mixing abilities". Building upon this analysis, we +reveal how the "mixing abilities" affect the tradeoff between privacy +preservation and Byzantine-robustness. The theoretical results provide +guidelines for achieving a favorable tradeoff with proper design of robust +aggregation rules. Numerical experiments are conducted and corroborate our +theoretical findings. + +
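A minimal sketch of the generic framework being analyzed: Gaussian noise is added to each honest agent's message before sharing, and received messages are combined with a robust aggregation rule (coordinate-wise trimmed mean here, as one example); the noise scale and trimming fraction are placeholders. The tradeoff discussed above is visible in that both the injected noise and the trimming contribute error.

```python
import numpy as np

def privatize(grad, noise_std, rng):
    """Gaussian-mechanism style perturbation of a local gradient before sharing."""
    return grad + rng.normal(0.0, noise_std, size=grad.shape)

def trimmed_mean(messages, trim_frac=0.2):
    """Coordinate-wise trimmed mean: drop the largest/smallest values per coordinate."""
    msgs = np.sort(np.asarray(messages), axis=0)
    k = int(len(msgs) * trim_frac)
    return msgs[k:len(msgs) - k].mean(axis=0)

rng = np.random.default_rng(0)
honest = [privatize(np.ones(4), noise_std=0.3, rng=rng) for _ in range(8)]
byzantine = [np.full(4, 100.0) for _ in range(2)]        # arbitrary wrong messages
print(trimmed_mean(honest + byzantine, trim_frac=0.2))   # near 1, perturbed by noise
```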
+
+
+
+
+ + ♻ ☆ BLIVA: A Simple Multimodal LLM for Better Handling of Text-Rich Visual + Questions + + +
+ Vision Language Models (VLMs), which extend Large Language Models (LLM) by +incorporating visual understanding capability, have demonstrated significant +advancements in addressing open-ended visual question-answering (VQA) tasks. +However, these models cannot accurately interpret images infused with text, a +common occurrence in real-world scenarios. Standard procedures for extracting +information from images often involve learning a fixed set of query embeddings. +These embeddings are designed to encapsulate image contexts and are later used +as soft prompt inputs in LLMs. Yet, this process is limited to the token count, +potentially curtailing the recognition of scenes with text-rich context. To +improve upon them, the present study introduces BLIVA: an augmented version of +InstructBLIP with Visual Assistant. BLIVA incorporates the query embeddings +from InstructBLIP and also directly projects encoded patch embeddings into the +LLM, a technique inspired by LLaVA. This approach assists the model to capture +intricate details potentially missed during the query decoding process. +Empirical evidence demonstrates that our model, BLIVA, significantly enhances +performance in processing text-rich VQA benchmarks (up to 17.76% in OCR-VQA +benchmark) and in undertaking general (not particularly text-rich) VQA +benchmarks (up to 7.9% in Visual Spatial Reasoning benchmark), comparing to our +baseline InstructBLIP. BLIVA demonstrates significant capability in decoding +real-world images, irrespective of text presence. To demonstrate the broad +industry applications enabled by BLIVA, we evaluate the model using a new +dataset comprising YouTube thumbnails paired with question-answer sets across +11 diverse categories. For researchers interested in further exploration, our +code and models are freely accessible at https://github.com/mlpc-ucsd/BLIVA. + +
+
+
+
+
+ + ♻ ☆ Enhancing Novel Object Detection via Cooperative Foundational Models + + +
+ In this work, we address the challenging and emergent problem of novel object +detection (NOD), focusing on the accurate detection of both known and novel +object categories during inference. Traditional object detection algorithms are +inherently closed-set, limiting their capability to handle NOD. We present a +novel approach to transform existing closed-set detectors into open-set +detectors. This transformation is achieved by leveraging the complementary +strengths of pre-trained foundational models, specifically CLIP and SAM, +through our cooperative mechanism. Furthermore, by integrating this mechanism +with state-of-the-art open-set detectors such as GDINO, we establish new +benchmarks in object detection performance. Our method achieves 17.42 mAP in +novel object detection and 42.08 mAP for known objects on the challenging LVIS +dataset. Adapting our approach to the COCO OVD split, we surpass the current +state-of-the-art by a margin of 7.2 $ \text{AP}_{50} $ for novel classes. Our +code is available at +https://github.com/rohit901/cooperative-foundational-models . + +
+
+ comment: Code: https://github.com/rohit901/cooperative-foundational-models +
+
+
+
+
+ + ♻ ☆ Variational Connectionist Temporal Classification for Order-Preserving + Sequence Modeling + + +
+ Connectionist temporal classification (CTC) is commonly adopted for sequence +modeling tasks like speech recognition, where it is necessary to preserve order +between the input and target sequences. However, CTC is only applied to +deterministic sequence models, where the latent space is discontinuous and +sparse, which in turn makes them less capable of handling data variability when +compared to variational models. In this paper, we integrate CTC with a +variational model and derive loss functions that can be used to train more +generalizable sequence models that preserve order. Specifically, we derive two +versions of the novel variational CTC based on two reasonable assumptions, the +first being that the variational latent variables at each time step are +conditionally independent; and the second being that these latent variables are +Markovian. We show that both loss functions allow direct optimization of the +variational lower bound for the model log-likelihood, and present +computationally tractable forms for implementing them. + +
+
+
+
+
+ + ♻ ☆ Discovering stochastic dynamical equations from biological time series + data + + +
+ Stochastic differential equations (SDEs) are an important framework to model +dynamics with randomness, as is common in most biological systems. The inverse +problem of integrating these models with empirical data remains a major +challenge. Here, we present a software package, PyDaDDy (Python Library for +Data Driven Dynamics) that takes time series data as an input and outputs an +interpretable SDE. We achieve this by combining traditional approaches from +stochastic calculus literature with state-of-the-art equation discovery +techniques. We validate our approach on synthetic datasets, and demonstrate the +generality and applicability of the method on two real-world datasets of vastly +different spatiotemporal scales: (i) collective movement of fish school where +stochasticity plays a crucial role, and (ii) confined migration of a single +cell, primarily following a relaxed oscillation. We make the method available +as an easy-to-use, open-source Python package, PyDaddy (Python Library for Data +Driven Dynamics). + +
+
+ comment: 15 pages (+ 9 page appendix), 6 figures (+ 8 appendix figures). + Updates: v3: Significantly reorganized the paper and added a section analysis + of a cell migration dataset. v4: Update arXiv title to match the updated + title of the manuscript +
+
+
+
+
+ + ♻ ☆ Differentially Private Wireless Federated Learning Using Orthogonal + Sequences + + +
+ We propose a privacy-preserving uplink over-the-air computation (AirComp) +method, termed FLORAS, for single-input single-output (SISO) wireless federated +learning (FL) systems. From the perspective of communication designs, FLORAS +eliminates the requirement of channel state information at the transmitters +(CSIT) by leveraging the properties of orthogonal sequences. From the privacy +perspective, we prove that FLORAS offers both item-level and client-level +differential privacy (DP) guarantees. Moreover, by properly adjusting the +system parameters, FLORAS can flexibly achieve different DP levels at no +additional cost. A new FL convergence bound is derived which, combined with the +privacy guarantees, allows for a smooth tradeoff between the achieved +convergence rate and differential privacy levels. Experimental results +demonstrate the advantages of FLORAS compared with the baseline AirComp method, +and validate that the analytical results can guide the design of +privacy-preserving FL with different tradeoff requirements on the model +convergence and privacy levels. + +
+
+ comment: 33 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ ShaDDR: Interactive Example-Based Geometry and Texture Generation via 3D + Shape Detailization and Differentiable Rendering SIGGRAPH + + +
+ We present ShaDDR, an example-based deep generative neural network which +produces a high-resolution textured 3D shape through geometry detailization and +conditional texture generation applied to an input coarse voxel shape. Trained +on a small set of detailed and textured exemplar shapes, our method learns to +detailize the geometry via multi-resolution voxel upsampling and generate +textures on voxel surfaces via differentiable rendering against exemplar +texture images from a few views. The generation is interactive, taking less +than 1 second to produce a 3D model with voxel resolutions up to 512^3. The +generated shape preserves the overall structure of the input coarse voxel +model, while the style of the generated geometric details and textures can be +manipulated through learned latent codes. In the experiments, we show that our +method can generate higher-resolution shapes with plausible and improved +geometric details and clean textures compared to prior works. Furthermore, we +showcase the ability of our method to learn geometric details and textures from +shapes reconstructed from real-world photos. In addition, we have developed an +interactive modeling application to demonstrate the generalizability of our +method to various user inputs and the controllability it offers, allowing users +to interactively sculpt a coarse voxel shape to define the overall structure of +the detailized 3D shape. Code and data are available at +https://github.com/qiminchen/ShaDDR. + +
+
+ comment: Accepted to SIGGRAPH Asia 2023 conference track. Code: + https://github.com/qiminchen/ShaDDR +
+
+
+
+
+ + ♻ ☆ Verified Compositional Neuro-Symbolic Control for Stochastic Systems + with Temporal Logic Tasks + + +
+ Several methods have been proposed recently to learn neural network (NN) +controllers for autonomous agents, with unknown and stochastic dynamics, tasked +with complex missions captured by Linear Temporal Logic (LTL). Due to the +sample-inefficiency of the majority of these works, compositional learning +methods have been proposed decomposing the LTL specification into smaller +sub-tasks. Then, separate controllers are learned and composed to satisfy the +original task. A key challenge within these approaches is that they often lack +safety guarantees or the provided guarantees are impractical. This paper aims +to address this challenge. Particularly, we consider autonomous systems with +unknown and stochastic dynamics and LTL-encoded tasks. We assume that the +system is equipped with a finite set of base skills modeled by trained NN +feedback controllers. Our goal is to check if there exists a temporal +composition of the trained NN controllers - and if so, to compute it - that +will yield a composite system behavior that satisfies the assigned LTL task +with probability one. We propose a new approach that relies on a novel +integration of automata theory and data-driven reachability analysis tools for +NN-controlled stochastic systems. The resulting neuro-symbolic controller +allows the agent to generate safe behaviors for unseen complex temporal logic +tasks in a zero-shot fashion by leveraging its base skills. We show correctness +of the proposed method and we provide conditions under which it is complete. To +the best of our knowledge, this is the first work that designs verified +temporal compositions of NN controllers for unknown and stochastic systems. +Finally, we provide extensive numerical simulations and hardware experiments on +robot navigation tasks to demonstrate the proposed method. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2209.06130 +
+
+
+
+
+ + ♻ ☆ Bridging the Gap Between Offline and Online Reinforcement Learning + Evaluation Methodologies + + +
+    Reinforcement learning (RL) has shown great promise, with algorithms learning in environments with large state and action spaces purely from scalar reward signals. A crucial challenge for current deep RL algorithms is that they require a tremendous amount of environment interactions for learning. This can be infeasible in situations where such interactions are expensive, such as in robotics. Offline RL algorithms try to address this issue by bootstrapping the learning process from existing logged data without needing to interact with the environment from the very beginning. While online RL algorithms are typically evaluated as a function of the number of environment interactions, there exists no single established protocol for evaluating offline RL methods. In this paper, we propose a sequential approach to evaluate offline RL algorithms as a function of the training set size and thus by their data efficiency. Sequential evaluation provides valuable insights into the data efficiency of the learning process and the robustness of algorithms to distribution changes in the dataset while also harmonizing the visualization of the offline and online learning phases. Our approach is generally applicable and easy to implement. We compare several existing offline RL algorithms using this approach and present insights from a variety of tasks and offline datasets.
+
+ comment: TMLR 2023 +
+
+
+
+
+ + ♻ ☆ Channel and Gradient-Importance Aware Device Scheduling for Over-the-Air + Federated Learning + + +
+ Federated learning (FL) is a popular privacy-preserving distributed training +scheme, where multiple devices collaborate to train machine learning models by +uploading local model updates. To improve communication efficiency, +over-the-air computation (AirComp) has been applied to FL, which leverages +analog modulation to harness the superposition property of radio waves such +that numerous devices can upload their model updates concurrently for +aggregation. However, the uplink channel noise incurs considerable model +aggregation distortion, which is critically determined by the device scheduling +and compromises the learned model performance. In this paper, we propose a +probabilistic device scheduling framework for over-the-air FL, named PO-FL, to +mitigate the negative impact of channel noise, where each device is scheduled +according to a certain probability and its model update is reweighted using +this probability in aggregation. We prove the unbiasedness of this aggregation +scheme and demonstrate the convergence of PO-FL on both convex and non-convex +loss functions. Our convergence bounds unveil that the device scheduling +affects the learning performance through the communication distortion and +global update variance. Based on the convergence analysis, we further develop a +channel and gradient-importance aware algorithm to optimize the device +scheduling probabilities in PO-FL. Extensive simulation results show that the +proposed PO-FL framework with channel and gradient-importance awareness +achieves faster convergence and produces better models than baseline methods. + +
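The core reweighting step that makes the aggregation unbiased can be sketched as follows (channel noise and the AirComp transmission model are omitted; the scheduling probabilities are placeholders): device i participates with probability p_i and, if scheduled, its update is divided by p_i before averaging.

```python
import numpy as np

def po_fl_aggregate(updates, probs, rng):
    """Schedule device i with probability probs[i]; reweight scheduled updates
    by 1/probs[i] so the aggregate is an unbiased estimate of the full average."""
    updates = np.asarray(updates, dtype=float)        # (n_devices, dim)
    probs = np.asarray(probs, dtype=float)
    scheduled = rng.random(len(probs)) < probs
    weights = scheduled / np.clip(probs, 1e-12, None)
    return (weights[:, None] * updates).sum(axis=0) / len(updates)

rng = np.random.default_rng(0)
updates = rng.normal(size=(20, 3))
probs = np.full(20, 0.5)
est = np.mean([po_fl_aggregate(updates, probs, rng) for _ in range(5000)], axis=0)
print(np.allclose(est, updates.mean(axis=0), atol=0.05))   # unbiasedness check
```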
+
+
+
+
+ + ♻ ☆ Creating Temporally Correlated High-Resolution Power Injection Profiles + Using Physics-Aware GAN + + +
+ Traditional smart meter measurements lack the granularity needed for +real-time decision-making. To address this practical problem, we create a +generative adversarial networks (GAN) model that enforces temporal consistency +on its high-resolution outputs via hard inequality constraints using a convex +optimization layer. A unique feature of our GAN model is that it is trained +solely on slow timescale aggregated power information obtained from historical +smart meter data. The results demonstrate that the model can successfully +create minutely interval temporally-correlated instantaneous power injection +profiles from 15-minute average power consumption information. This innovative +approach, emphasizing inter-neuron constraints, offers a promising avenue for +improved high-speed state estimation in distribution systems and enhances the +applicability of data-driven solutions for monitoring such systems. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ♻ ☆ On the Representational Capacity of Recurrent Neural Language Models EMNLP 2023 + + +
+ This work investigates the computational expressivity of language models +(LMs) based on recurrent neural networks (RNNs). Siegelmann and Sontag (1992) +famously showed that RNNs with rational weights and hidden states and unbounded +computation time are Turing complete. However, LMs define weightings over +strings in addition to just (unweighted) language membership and the analysis +of the computational power of RNN LMs (RLMs) should reflect this. We extend the +Turing completeness result to the probabilistic case, showing how a rationally +weighted RLM with unbounded computation time can simulate any deterministic +probabilistic Turing machine (PTM) with rationally weighted transitions. Since, +in practice, RLMs work in real-time, processing a symbol at every time step, we +treat the above result as an upper bound on the expressivity of RLMs. We also +provide a lower bound by showing that under the restriction to real-time +computation, such models can simulate deterministic real-time rational PTMs. + +
+
+ comment: To be published at EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Gates Are Not What You Need in RNNs SC 2023 + + +
+    Recurrent neural networks have flourished in many areas. Consequently, we can see new RNN cells being developed continuously, usually by creating or using gates in a new, original way. But what if we told you that gates in RNNs are redundant? In this paper, we propose a new recurrent cell called the Residual Recurrent Unit (RRU) which beats traditional cells and does not employ a single gate. It is based on the residual shortcut connection, linear transformations, ReLU, and normalization. To evaluate our cell's effectiveness, we compare its performance against the widely used GRU and LSTM cells and the recently proposed Mogrifier LSTM on several tasks, including polyphonic music modeling, language modeling, and sentiment analysis. Our experiments show that RRU outperforms the traditional gated units on most of these tasks. It also has better robustness to parameter selection, allowing immediate application in new tasks without much tuning. We have implemented the RRU in TensorFlow, and the code is made available at https://github.com/LUMII-Syslab/RRU .
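For intuition, a simplified gate-free recurrent cell in the spirit described (residual shortcut, linear transformations, ReLU, and normalization); this is an illustrative PyTorch sketch, not the exact RRU architecture, which is available in the authors' TensorFlow repository linked above.

```python
import torch
import torch.nn as nn

class GatelessResidualCell(nn.Module):
    """Illustrative gate-free recurrent cell: normalization, two linear maps
    with a ReLU in between, and a residual connection to the previous state."""

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.norm = nn.LayerNorm(input_size + hidden_size)
        self.lin1 = nn.Linear(input_size + hidden_size, hidden_size)
        self.lin2 = nn.Linear(hidden_size, hidden_size)

    def forward(self, x_t, h_prev):
        z = self.norm(torch.cat([x_t, h_prev], dim=-1))
        candidate = self.lin2(torch.relu(self.lin1(z)))
        return h_prev + candidate            # residual shortcut, no gates

cell = GatelessResidualCell(input_size=16, hidden_size=32)
h = torch.zeros(8, 32)
for x_t in torch.randn(10, 8, 16):           # (time, batch, features)
    h = cell(x_t, h)
```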
+
+ comment: Published in Artificial Intelligence and Soft Computing. ICAISC 2023. + Lecture Notes in Computer Science(), vol 14125. Springer, Cham., and is + available online at https://doi.org/10.1007/978-3-031-42505-9_27 +
+
+
+
+
+ + ♻ ☆ NeuroGraph: Benchmarks for Graph Machine Learning in Brain Connectomics NeurIPS23 + + +
+    Machine learning provides a valuable tool for analyzing high-dimensional functional neuroimaging data, and is proving effective in predicting various neurological conditions, psychiatric disorders, and cognitive patterns. In functional magnetic resonance imaging (fMRI) research, interactions between brain regions are commonly modeled using graph-based representations. The potency of graph machine learning methods has been established across myriad domains, marking a transformative step in data interpretation and predictive modeling. Yet, despite their promise, the transposition of these techniques to the neuroimaging domain has been challenging due to the expansive number of potential preprocessing pipelines and the large parameter search space for graph-based dataset construction. In this paper, we introduce NeuroGraph, a collection of graph-based neuroimaging datasets, and demonstrate its utility for predicting multiple categories of behavioral and cognitive traits. We delve deeply into the dataset generation search space by crafting 35 datasets that encompass static and dynamic brain connectivity, running in excess of 15 baseline methods for benchmarking. Additionally, we provide generic frameworks for learning on both static and dynamic graphs. Our extensive experiments lead to several key observations. Notably, using correlation vectors as node features, incorporating a larger number of regions of interest, and employing sparser graphs lead to improved performance. To foster further advancements in graph-based data-driven neuroimaging analysis, we offer a comprehensive open-source Python package that includes the benchmark datasets, baseline implementations, model training, and standard evaluation.
+
+ comment: NeurIPS23 +
+
+
+
+
+ + ♻ ☆ BOIS: Bayesian Optimization of Interconnected Systems + + +
+ Bayesian optimization (BO) has proven to be an effective paradigm for the +global optimization of expensive-to-sample systems. One of the main advantages +of BO is its use of Gaussian processes (GPs) to characterize model uncertainty +which can be leveraged to guide the learning and search process. However, BO +typically treats systems as black-boxes and this limits the ability to exploit +structural knowledge (e.g., physics and sparse interconnections). Composite +functions of the form $f(x, y(x))$, wherein GP modeling is shifted from the +performance function $f$ to an intermediate function $y$, offer an avenue for +exploiting structural knowledge. However, the use of composite functions in a +BO framework is complicated by the need to generate a probability density for +$f$ from the Gaussian density of $y$ calculated by the GP (e.g., when $f$ is +nonlinear it is not possible to obtain a closed-form expression). Previous work +has handled this issue using sampling techniques; these are easy to implement +and flexible but are computationally intensive. In this work, we introduce a +new paradigm which allows for the efficient use of composite functions in BO; +this uses adaptive linearizations of $f$ to obtain closed-form expressions for +the statistical moments of the composite function. We show that this simple +approach (which we call BOIS) enables the exploitation of structural knowledge, +such as that arising in interconnected systems as well as systems that embed +multiple GP models and combinations of physics and GP models. Using a chemical +process optimization case study, we benchmark the effectiveness of BOIS against +standard BO and sampling approaches. Our results indicate that BOIS achieves +performance gains and accurately captures the statistics of composite +functions. + +
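The key closed-form step can be sketched as first-order uncertainty propagation: linearize f around the GP posterior mean of the intermediate y so that the composite's mean and variance are available without sampling. The snippet below stubs the GP posterior with a given mean and covariance and uses a finite-difference gradient; it illustrates the idea, not the paper's implementation.

```python
import numpy as np

def composite_moments(f, mu_y, cov_y, eps=1e-5):
    """Approximate mean/variance of f(y) for y ~ N(mu_y, cov_y) by linearizing
    f around mu_y: f(y) ~= f(mu_y) + g^T (y - mu_y)."""
    mu_y = np.asarray(mu_y, dtype=float)
    g = np.array([(f(mu_y + eps * e) - f(mu_y - eps * e)) / (2 * eps)
                  for e in np.eye(len(mu_y))])          # finite-difference gradient
    mean = f(mu_y)
    var = float(g @ cov_y @ g)
    return mean, var

# mu_y, cov_y would come from GP posteriors over intermediate functions; stubbed here.
mu_y = np.array([1.0, 2.0])
cov_y = np.array([[0.04, 0.01], [0.01, 0.09]])
f = lambda y: y[0] * np.exp(-y[1])                      # nonlinear performance function
print(composite_moments(f, mu_y, cov_y))
```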
+
+ comment: 6 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ A General Theoretical Paradigm to Understand Learning from Human + Preferences + + +
+    The prevalent deployment of learning from human preferences through reinforcement learning (RLHF) relies on two important approximations: the first assumes that pairwise preferences can be substituted with pointwise rewards. The second assumes that a reward model trained on these pointwise rewards can generalize from collected data to out-of-distribution data sampled by the policy. Recently, Direct Preference Optimisation (DPO) has been proposed as an approach that bypasses the second approximation and learns a policy directly from collected data without the reward modelling stage. However, this method still heavily relies on the first approximation.
+    In this paper we try to gain a deeper theoretical understanding of these practical algorithms. In particular we derive a new general objective called $\Psi$PO for learning from human preferences that is expressed in terms of pairwise preferences and therefore bypasses both approximations. This new general objective allows us to perform an in-depth analysis of the behavior of RLHF and DPO (as special cases of $\Psi$PO) and to identify their potential pitfalls. We then consider another special case of $\Psi$PO by setting $\Psi$ simply to Identity, for which we can derive an efficient optimisation procedure, prove performance guarantees and demonstrate its empirical superiority to DPO on some illustrative examples.
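As a concrete reference point, the Identity-$\Psi$ special case is usually written as a squared regression of the policy/reference log-likelihood-ratio margin onto $1/(2\tau)$; the sketch below follows that commonly cited form and should be checked against the paper for exact notation.

```python
import torch

def identity_psi_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, tau=0.1):
    """Sketch of the Identity-Psi (IPO-style) objective: regress the margin
    h = log[pi(y_w)/pi(y_l)] - log[pi_ref(y_w)/pi_ref(y_l)] onto 1/(2*tau)."""
    h = (logp_w - logp_l) - (ref_logp_w - ref_logp_l)
    return ((h - 1.0 / (2.0 * tau)) ** 2).mean()

# Per-sequence log-probabilities of preferred (w) and dispreferred (l) responses.
logp_w, logp_l = torch.randn(16), torch.randn(16)
ref_logp_w, ref_logp_l = torch.randn(16), torch.randn(16)
loss = identity_psi_loss(logp_w, logp_l, ref_logp_w, ref_logp_l, tau=0.1)
```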
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ CompenHR: Efficient Full Compensation for High-resolution Projector + + +
+ Full projector compensation is a practical task of projector-camera systems. +It aims to find a projector input image, named compensation image, such that +when projected it cancels the geometric and photometric distortions due to the +physical environment and hardware. State-of-the-art methods use deep learning +to address this problem and show promising performance for low-resolution +setups. However, directly applying deep learning to high-resolution setups is +impractical due to the long training time and high memory cost. To address this +issue, this paper proposes a practical full compensation solution. Firstly, we +design an attention-based grid refinement network to improve geometric +correction quality. Secondly, we integrate a novel sampling scheme into an +end-to-end compensation network to alleviate computation and introduce +attention blocks to preserve key features. Finally, we construct a benchmark +dataset for high-resolution projector full compensation. In experiments, our +method demonstrates clear advantages in both efficiency and quality. + +
+
+
+
+
+ + ☆ Rethinking Radiology Report Generation via Causal Reasoning and + Counterfactual Augmentation + + +
+    Radiology Report Generation (RRG) draws attention as an interaction between the vision and language fields. Previous works inherited the ideology of vision-to-language generation tasks, aiming to generate highly consistent paragraphs as reports. However, one unique characteristic of RRG, the independence between diseases, was neglected, leading to the injection of a spurious confounder, i.e., disease co-occurrence. Unfortunately, this confounder further confuses the report generation process because of the biased RRG data distribution. In this paper, to rethink this issue thoroughly, we reason about its causes and effects from a novel perspective of statistics and causality, where the Joint Vision Coupling and the Conditional Sentence Coherence Coupling are two aspects prone to implicitly decrease the accuracy of reports. Then, a counterfactual augmentation strategy that contains the Counterfactual Sample Synthesis and the Counterfactual Report Reconstruction sub-methods is proposed to break these two aspects of spurious effects. Experimental results and further analyses on two widely used datasets justify our reasoning and the proposed methods.
+
+ comment: 10 pages,5 figures +
+
+
+
+
+ + ☆ FusionFrames: Efficient Architectural Aspects for Text-to-Video + Generation Pipeline + + +
+ Multimedia generation approaches occupy a prominent place in artificial +intelligence research. Text-to-image models achieved high-quality results over +the last few years. However, video synthesis methods recently started to +develop. This paper presents a new two-stage latent diffusion text-to-video +generation architecture based on the text-to-image diffusion model. The first +stage concerns keyframes synthesis to figure the storyline of a video, while +the second one is devoted to interpolation frames generation to make movements +of the scene and objects smooth. We compare several temporal conditioning +approaches for keyframes generation. The results show the advantage of using +separate temporal blocks over temporal layers in terms of metrics reflecting +video generation quality aspects and human preference. The design of our +interpolation model significantly reduces computational costs compared to other +masked frame interpolation approaches. Furthermore, we evaluate different +configurations of MoVQ-based video decoding scheme to improve consistency and +achieve higher PSNR, SSIM, MSE, and LPIPS scores. Finally, we compare our +pipeline with existing solutions and achieve top-2 scores overall and top-1 +among open-source solutions: CLIPSIM = 0.2976 and FVD = 433.054. Project page: +https://ai-forever.github.io/kandinsky-video/ + +
+
+ comment: Project page: https://ai-forever.github.io/kandinsky-video/ +
+
+
+
+
+ + ☆ Beat-Aligned Spectrogram-to-Sequence Generation of Rhythm-Game Charts + + +
+    At the heart of "rhythm games" - games where players must perform actions in sync with a piece of music - are "charts", the directives to be given to players. We formulate chart generation as a sequence generation task and train a Transformer using a large dataset. We also introduce tempo-informed preprocessing and training procedures, some of which are suggested to be integral to successful training. Our model is found to outperform the baselines on a large dataset, and is also found to benefit from pretraining and finetuning.
+
+ comment: ISMIR 2023 LBD. Demo videos and code at stet-stet.github.io/goct +
+
+
+
+
+ + ♻ ☆ GA2MIF: Graph and Attention Based Two-Stage Multi-Source Information + Fusion for Conversational Emotion Detection + + +
+    Multimodal Emotion Recognition in Conversation (ERC) plays an influential role in the field of human-computer interaction and conversational robotics, since it can motivate machines to provide empathetic services. Multimodal data modeling has become an active research area in recent years, inspired by the human capability to integrate multiple senses. Several graph-based approaches claim to capture interactive information between modalities, but the heterogeneity of multimodal data prevents these methods from reaching optimal solutions. In this work, we introduce a multimodal fusion approach named Graph and Attention based Two-stage Multi-source Information Fusion (GA2MIF) for emotion detection in conversation. Our proposed method circumvents the problem of taking a heterogeneous graph as input to the model while eliminating complex redundant connections in the construction of the graph. GA2MIF focuses on contextual modeling and cross-modal modeling through leveraging Multi-head Directed Graph ATtention networks (MDGATs) and Multi-head Pairwise Cross-modal ATtention networks (MPCATs), respectively. Extensive experiments on two public datasets (i.e., IEMOCAP and MELD) demonstrate that the proposed GA2MIF has the capacity to validly capture intra-modal long-range contextual information and inter-modal complementary information, as well as outperforms the prevalent State-Of-The-Art (SOTA) models by a remarkable margin.
+
+ comment: Accepted by IEEE Transactions on Affective Computing +
+
+
+
+
+ + ♻ ☆ GraphCFC: A Directed Graph Based Cross-Modal Feature Complementation + Approach for Multimodal Conversational Emotion Recognition + + +
+ Emotion Recognition in Conversation (ERC) plays a significant part in +Human-Computer Interaction (HCI) systems since it can provide empathetic +services. Multimodal ERC can mitigate the drawbacks of uni-modal approaches. +Recently, Graph Neural Networks (GNNs) have been widely used in a variety of +fields due to their superior performance in relation modeling. In multimodal +ERC, GNNs are capable of extracting both long-distance contextual information +and inter-modal interactive information. Unfortunately, since existing methods +such as MMGCN directly fuse multiple modalities, redundant information may be +generated and diverse information may be lost. In this work, we present a +directed Graph based Cross-modal Feature Complementation (GraphCFC) module that +can efficiently model contextual and interactive information. GraphCFC +alleviates the problem of heterogeneity gap in multimodal fusion by utilizing +multiple subspace extractors and Pair-wise Cross-modal Complementary (PairCC) +strategy. We extract various types of edges from the constructed graph for +encoding, thus enabling GNNs to extract crucial contextual and interactive +information more accurately when performing message passing. Furthermore, we +design a GNN structure called GAT-MLP, which can provide a new unified network +framework for multimodal learning. The experimental results on two benchmark +datasets show that our GraphCFC outperforms the state-of-the-art (SOTA) +approaches. + +
+
+ comment: Accepted by IEEE Transactions on Multimedia (TMM) +
+
+
+
+
+ + ♻ ☆ LucidDreamer: Towards High-Fidelity Text-to-3D Generation via Interval + Score Matching + + +
+ The recent advancements in text-to-3D generation mark a significant milestone +in generative models, unlocking new possibilities for creating imaginative 3D +assets across various real-world scenarios. While recent advancements in +text-to-3D generation have shown promise, they often fall short in rendering +detailed and high-quality 3D models. This problem is especially prevalent as +many methods base themselves on Score Distillation Sampling (SDS). This paper +identifies a notable deficiency in SDS, that it brings inconsistent and +low-quality updating direction for the 3D model, causing the over-smoothing +effect. To address this, we propose a novel approach called Interval Score +Matching (ISM). ISM employs deterministic diffusing trajectories and utilizes +interval-based score matching to counteract over-smoothing. Furthermore, we +incorporate 3D Gaussian Splatting into our text-to-3D generation pipeline. +Extensive experiments show that our model largely outperforms the +state-of-the-art in quality and training efficiency. + +
+
+ comment: The first two authors contributed equally to this work. Our code will + be available at: https://github.com/EnVision-Research/LucidDreamer +
+
+
+
+
+ + ♻ ☆ GraphMFT: A Graph Network based Multimodal Fusion Technique for Emotion + Recognition in Conversation + + +
+ Multimodal machine learning is an emerging area of research, which has +received a great deal of scholarly attention in recent years. Up to now, there +are few studies on multimodal Emotion Recognition in Conversation (ERC). Since +Graph Neural Networks (GNNs) possess the powerful capacity of relational +modeling, they have an inherent advantage in the field of multimodal learning. +GNNs leverage the graph constructed from multimodal data to perform intra- and +inter-modal information interaction, which effectively facilitates the +integration and complementation of multimodal data. In this work, we propose a +novel Graph network based Multimodal Fusion Technique (GraphMFT) for emotion +recognition in conversation. Multimodal data can be modeled as a graph, where +each data object is regarded as a node, and both intra- and inter-modal +dependencies existing between data objects can be regarded as edges. GraphMFT +utilizes multiple improved graph attention networks to capture intra-modal +contextual information and inter-modal complementary information. In addition, +the proposed GraphMFT attempts to address the challenges of existing +graph-based multimodal conversational emotion recognition models such as MMGCN. +Empirical results on two public multimodal datasets reveal that our model +outperforms the State-Of-The-Art (SOTA) approaches with the accuracy of 67.90% +and 61.30%. + +
+
+ comment: Accepted by Neurocomputing +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 63 + +
+
+
+ + ☆ LowResource at BLP-2023 Task 2: Leveraging BanglaBert for Low Resource + Sentiment Analysis of Bangla Language EMNLP2023 + + +
+    This paper describes the system of the LowResource Team for Task 2 of BLP-2023, which involves conducting sentiment analysis on a dataset composed of public posts and comments from diverse social media platforms. Our primary aim is to utilize BanglaBert, a BERT model pre-trained on a large Bangla corpus, using various strategies including fine-tuning, dropping random tokens, and using several external datasets. Our final model is an ensemble of the three best BanglaBert variations. Our system achieved 3rd place overall on the Test Set among 30 participating teams, with a score of 0.718. Additionally, we discuss the promising systems that did not perform well, namely task-adaptive pretraining and paraphrasing using BanglaT5. Training codes and external datasets which are used for our system are publicly available at https://github.com/Aunabil4602/bnlp-workshop-task2-2023
+
+ comment: Accepted at BLP Workshop @EMNLP2023 +
+
+
+
+
+ + ☆ Soft Random Sampling: A Theoretical and Empirical Analysis + + +
+ Soft random sampling (SRS) is a simple yet effective approach for efficient +training of large-scale deep neural networks when dealing with massive data. +SRS selects a subset uniformly at random with replacement from the full data +set in each epoch. In this paper, we conduct a theoretical and empirical +analysis of SRS. First, we analyze its sampling dynamics including data +coverage and occupancy. Next, we investigate its convergence with non-convex +objective functions and give the convergence rate. Finally, we provide its +generalization performance. We empirically evaluate SRS for image recognition +on CIFAR10 and automatic speech recognition on Librispeech and an in-house +payload dataset to demonstrate its effectiveness. Compared to existing +coreset-based data selection methods, SRS offers a better accuracy-efficiency +trade-off. Especially on real-world industrial scale data sets, it is shown to +be a powerful training strategy with significant speedup and competitive +performance with almost no additional computing cost. + +
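Since SRS is defined simply as per-epoch uniform sampling with replacement, a small sketch makes the mechanism concrete; the dataset size and ratio below are illustrative, not values from the paper.

```python
import numpy as np

def soft_random_sample(dataset_size: int, ratio: float, rng=None):
    """Soft Random Sampling: each epoch, draw a fraction `ratio` of indices
    uniformly at random *with replacement* from the full dataset."""
    rng = rng or np.random.default_rng()
    n_selected = int(ratio * dataset_size)
    return rng.integers(0, dataset_size, size=n_selected)

# Usage: rebuild the training subset at the start of every epoch.
for epoch in range(3):
    idx = soft_random_sample(dataset_size=50_000, ratio=0.3)
    # train_one_epoch(model, full_dataset[idx])   # training loop not shown
```

Because sampling is with replacement, some examples repeat within an epoch while others are skipped, which is what drives the coverage and occupancy analysis described in the abstract.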
+
+
+
+
+ + ☆ Keeping Users Engaged During Repeated Administration of the Same + Questionnaire: Using Large Language Models to Reliably Diversify Questions + + +
+ Standardized, validated questionnaires are vital tools in HCI research and +healthcare, offering dependable self-report data. However, their repeated use +in longitudinal or pre-post studies can induce respondent fatigue, impacting +data quality via response biases and decreased response rates. We propose +utilizing large language models (LLMs) to generate diverse questionnaire +versions while retaining good psychometric properties. In a longitudinal study, +participants engaged with our agent system and responded daily for two weeks to +either a standardized depression questionnaire or one of two LLM-generated +questionnaire variants, alongside a validated depression questionnaire. +Psychometric testing revealed consistent covariation between the external +criterion and the focal measure administered across the three conditions, +demonstrating the reliability and validity of the LLM-generated variants. +Participants found the repeated administration of the standardized +questionnaire significantly more repetitive compared to the variants. Our +findings highlight the potential of LLM-generated variants to invigorate +questionnaires, fostering engagement and interest without compromising +validity. + +
+
+ comment: 22 pages, preprint +
+
+
+
+
+ + ☆ Can Large Language Models Understand Content and Propagation for + Misinformation Detection: An Empirical Study + + +
+ Large Language Models (LLMs) have garnered significant attention for their +powerful abilities in natural language understanding and reasoning. In this +paper, we present a comprehensive empirical study to explore the performance of +LLMs on misinformation detection tasks. This study stands as the pioneering +investigation into the understanding capabilities of multiple LLMs regarding +both content and propagation across social media platforms. Our empirical +studies on five misinformation detection datasets show that LLMs with diverse +prompts achieve comparable performance in text-based misinformation detection +but exhibit notably constrained capabilities in comprehending propagation +structure compared to existing models in propagation-based misinformation +detection. In addition, we design four instruction-tuned strategies to +enhance LLMs for both content and propagation-based misinformation detection. +These strategies encourage LLMs to actively learn effective features from multiple +instances or hard instances, and to eliminate irrelevant propagation structures, +thereby achieving better detection performance. Extensive experiments further +demonstrate that, under the proposed strategies, LLMs make better use of both +content and propagation structure and achieve promising detection +performance. These findings highlight the potential ability of LLMs to detect +misinformation. + +
+
+
+
+
+ + ☆ Fair Text Classification with Wasserstein Independence + + +
+ Group fairness is a central research topic in text classification, where +reaching fair treatment between sensitive groups (e.g. women vs. men) remains +an open challenge. This paper presents a novel method for mitigating biases in +neural text classification, agnostic to the model architecture. Given the +difficulty of distinguishing fair from unfair information in a text encoder, we +take inspiration from adversarial training to induce Wasserstein independence +between representations learned to predict our target label and the ones +learned to predict some sensitive attribute. Our approach provides two +significant advantages. First, it does not require annotations of sensitive +attributes in either testing or training data. This is more suitable for +real-life scenarios compared to existing methods that require annotations of +sensitive attributes at train time. Second, our approach exhibits a comparable +or better fairness-accuracy trade-off compared to existing methods. + +
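A rough sketch of the adversarial ingredient described above, assuming a critic network estimates the Wasserstein-1 gap between representations of the two sensitive groups; the dimensions and the critic architecture are illustrative, not the paper's.

```python
import torch
import torch.nn as nn

# Critic over 128-dim text representations (dimensions are placeholders).
critic = nn.Sequential(nn.Linear(128, 64), nn.ReLU(), nn.Linear(64, 1))

def wasserstein_gap(z: torch.Tensor, a: torch.Tensor) -> torch.Tensor:
    """Critic-based estimate of the Wasserstein-1 gap between the representations
    of the two sensitive groups (a == 0 vs a == 1)."""
    return critic(z[a == 0]).mean() - critic(z[a == 1]).mean()

# Adversarial game (sketch): the critic is trained to maximize the gap, while the
# encoder/classifier minimize task_loss + lambda * |gap|, pushing representations
# toward independence of the sensitive attribute. A Lipschitz constraint on the
# critic (weight clipping or a gradient penalty) is omitted here for brevity.
```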
+
+
+
+
+ + ☆ The DURel Annotation Tool: Human and Computational Measurement of + Semantic Proximity, Sense Clusters and Semantic Change + + +
+ We present the DURel tool, which implements the annotation of semantic +proximity between word uses in an online, open-source interface. The tool +supports standardized human annotation as well as computational annotation, +building on recent advances with Word-in-Context models. Annotator judgments +are clustered with automatic graph clustering techniques and visualized for +analysis. This makes it possible to measure word senses with simple and intuitive +micro-task judgments between use pairs, requiring minimal preparation effort. +The tool offers additional functionalities for comparing agreement between +annotators to guarantee the inter-subjectivity of the obtained judgments and for +calculating summary statistics that give insights into sense frequency +distributions, semantic variation, or changes of senses over time. + +
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ MathGloss: Building mathematical glossaries from text + + +
+ MathGloss is a project to create a knowledge graph (KG) for undergraduate +mathematics from text, automatically, using modern natural language processing +(NLP) tools and resources already available on the web. MathGloss is a linked +database of undergraduate concepts in mathematics. So far, it combines five +resources: (i) Wikidata, a collaboratively edited, multilingual knowledge graph +hosted by the Wikimedia Foundation, (ii) terms covered in mathematics courses +at the University of Chicago, (iii) the syllabus of the French undergraduate +mathematics curriculum which includes hyperlinks to the automated theorem +prover Lean 4, (iv) MuLiMa, a multilingual dictionary of mathematics curated by +mathematicians, and (v) the nLab, a wiki for category theory also curated by +mathematicians. MathGloss's goal is to bring together resources for learning +mathematics and to allow every mathematician to tailor their learning to their +own preferences. Moreover, by organizing different resources for learning +undergraduate mathematics alongside those for learning formal mathematics, we +hope to make it easier for mathematicians and formal tools (theorem provers, +computer algebra systems, etc) experts to "understand" each other and break +down some of the barriers to formal math. + +
+
+
+
+
+ + ☆ IMGTB: A Framework for Machine-Generated Text Detection Benchmarking + + +
+ In the era of large language models generating high-quality texts, it is +necessary to develop methods for detecting machine-generated text, both to avoid +harmful use and simply for annotation purposes. It is, however, also +important to properly evaluate and compare such methods. Recently, a +few benchmarks have been proposed for this purpose; however, integrating the +newest detection methods is rather challenging, since new methods appear each +month and provide slightly different evaluation pipelines. In this paper, we +present the IMGTB framework, which simplifies the benchmarking of +machine-generated text detection methods by easy integration of custom (new) +methods and evaluation datasets. Its configurability and flexibility make +research and development of new detection methods easier, especially their +comparison with existing state-of-the-art detectors. The default set of +analyses, metrics and visualizations offered by the tool follows the +established practices of machine-generated text detection benchmarking found in +state-of-the-art literature. + +
+
+
+
+
+ + ☆ In-Context Learning Functions with Varying Number of Minima + + +
+ Large Language Models (LLMs) have proven effective at In-Context Learning +(ICL), an ability that allows them to create predictors from labeled examples. +Few studies have explored the interplay between ICL and specific properties of +the functions it attempts to approximate. In our study, we use a formal framework +to explore ICL and propose a new task of approximating functions with a varying +number of minima. We implement a method that allows for producing functions +with given inputs as minima. We find that increasing the number of minima +degrades ICL performance. At the same time, our evaluation shows that ICL +outperforms a 2-layer Neural Network (2NN) model. Furthermore, ICL learns faster +than 2NN in all settings. We validate the findings through a set of few-shot +experiments across various hyperparameter configurations. + +
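One simple way to construct a one-dimensional function whose minima are exactly a chosen set of inputs is sketched below; this is an illustrative construction, not necessarily the one used in the paper.

```python
import numpy as np

def function_with_minima(minima):
    """Return f(x) = prod_i (x - m_i)^2, which is non-negative and attains its
    global minimum value 0 exactly at every point in `minima`."""
    minima = np.asarray(minima, dtype=float)
    def f(x):
        x = np.asarray(x, dtype=float)
        return np.prod((x[..., None] - minima) ** 2, axis=-1)
    return f

f = function_with_minima([-2.0, 0.5, 3.0])
print(f(np.array([-2.0, 0.5, 3.0, 1.0])))  # zeros at the three minima, nonzero elsewhere
```

Sampling (x, f(x)) pairs from such functions with different numbers of minima would then yield the varying-difficulty ICL regression tasks the abstract describes.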
+
+
+
+
+ + ☆ Oasis: Data Curation and Assessment System for Pretraining of Large + Language Models + + +
+ Data is one of the most critical elements in building a large language model. +However, existing systems either fail to customize a corpus curation pipeline +or neglect to leverage comprehensive corpus assessment for iterative +optimization of the curation. To this end, we present a pretraining corpus +curation and assessment platform called Oasis -- a one-stop system for data +quality improvement and quantification with user-friendly interactive +interfaces. Specifically, the interactive modular rule filter module can devise +customized rules according to explicit feedback. The debiased neural filter +module builds the quality classification dataset in a negative-centric manner +to remove the undesired bias. The adaptive document deduplication module could +execute large-scale deduplication with limited memory resources. These three +parts constitute the customized data curation module. And in the holistic data +assessment module, a corpus can be assessed in local and global views, with +three evaluation means including human, GPT-4, and heuristic metrics. We +exhibit a complete process to use Oasis for the curation and assessment of +pretraining data. In addition, an 800GB bilingual corpus curated by Oasis is +publicly released. + +
+
+
+
+
+ + ☆ Evaluation Metrics of Language Generation Models for Synthetic Traffic + Generation Tasks + + +
+ Many Natural Language Generation (NLG) tasks aim to generate a single output +text given an input prompt. Other settings require the generation of multiple +texts, e.g., for Synthetic Traffic Generation (STG). This generation task is +crucial for training and evaluating QA systems as well as conversational +agents, where the goal is to generate multiple questions or utterances +resembling the linguistic variability of real users. In this paper, we show +that common NLG metrics, like BLEU, are not suitable for evaluating STG. We +propose and evaluate several metrics designed to compare the generated traffic +to the distribution of real user texts. We validate our metrics with an +automatic procedure to verify whether they capture different types of quality +issues of generated data; we also run human annotations to verify the +correlation with human judgements. Experiments on three tasks, i.e., Shopping +Utterance Generation, Product Question Generation and Query Auto Completion, +demonstrate that our metrics are effective for evaluating STG tasks, and +improve the agreement with human judgement up to 20% with respect to common NLG +metrics. We believe these findings can pave the way towards better solutions +for estimating the representativeness of synthetic text data. + +
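To make the idea of distribution-level comparison concrete, here is a minimal sketch that compares the token distribution of generated traffic against real user texts via Jensen-Shannon distance; this is a generic illustration in the same spirit, not one of the paper's proposed metrics.

```python
from collections import Counter
from scipy.spatial.distance import jensenshannon

def unigram_distribution(texts, vocab):
    """Relative frequency of each vocabulary token across a set of texts."""
    counts = Counter(tok for t in texts for tok in t.split())
    total = sum(counts[w] for w in vocab) or 1
    return [counts[w] / total for w in vocab]

def traffic_divergence(generated, reference):
    """Jensen-Shannon distance between generated and real token distributions
    (lower = generated traffic is closer to the real distribution)."""
    vocab = sorted({tok for t in generated + reference for tok in t.split()})
    p = unigram_distribution(generated, vocab)
    q = unigram_distribution(reference, vocab)
    return jensenshannon(p, q)

print(traffic_divergence(["where is my order", "track my order"],
                         ["track order status", "where is my package"]))
```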
+
+
+
+
+ + ☆ Multilingual Word Embeddings for Low-Resource Languages using Anchors + and a Chain of Related Languages + + +
+ Very low-resource languages, having only a few million tokens worth of data, +are not well-supported by multilingual NLP approaches due to poor quality +cross-lingual word representations. Recent work showed that good cross-lingual +performance can be achieved if a source language is related to the low-resource +target language. However, not all language pairs are related. In this paper, we +propose to build multilingual word embeddings (MWEs) via a novel language +chain-based approach, that incorporates intermediate related languages to +bridge the gap between the distant source and target. We build MWEs one +language at a time by starting from the resource rich source and sequentially +adding each language in the chain till we reach the target. We extend a +semi-joint bilingual approach to multiple languages in order to eliminate the +main weakness of previous works, i.e., independently trained monolingual +embeddings, by anchoring the target language around the multilingual space. We +evaluate our method on bilingual lexicon induction for 4 language families, +involving 4 very low-resource (<5M tokens) and 4 moderately low-resource (<50M) +target languages, showing improved performance in both categories. +Additionally, our analysis reveals the importance of good quality embeddings +for intermediate languages as well as the importance of leveraging anchor +points from all languages in the multilingual space. + +
+
+ comment: Accepted at the MRL 2023 workshop +
+
+
+
+
+ + ☆ Speaker-Adapted End-to-End Visual Speech Recognition for Continuous + Spanish + + +
+ Different studies have shown the importance of visual cues throughout the +speech perception process. In fact, the development of audiovisual approaches +has led to advances in the field of speech technologies. However, although +noticeable results have recently been achieved, visual speech recognition +remains an open research problem. It is a task in which, by dispensing with the +auditory sense, challenges such as visual ambiguities and the complexity of +modeling silence must be faced. Nonetheless, some of these challenges can be +alleviated when the problem is approached from a speaker-dependent perspective. +Thus, this paper studies, using the Spanish LIP-RTVE database, how the +estimation of specialized end-to-end systems for a specific person could affect +the quality of speech recognition. First, different adaptation strategies based +on the fine-tuning technique were proposed. Then, a pre-trained CTC/Attention +architecture was used as a baseline throughout our experiments. Our findings +showed that a two-step fine-tuning process, where the VSR system is first +adapted to the task domain, provided significant improvements when the speaker +adaptation was addressed. Furthermore, results comparable to the current state +of the art were reached even when only a limited amount of data was available. + +
+
+ comment: Accepted in Proceedings of IberSpeech 2022 ( + https://www.isca-speech.org/archive/iberspeech_2022/gimenogomez22_iberspeech.html + ) +
+
+
+
+
+ + ☆ PhayaThaiBERT: Enhancing a Pretrained Thai Language Model with + Unassimilated Loanwords + + +
+ While WangchanBERTa has become the de facto standard in transformer-based +Thai language modeling, it still has shortcomings in regard to the +understanding of foreign words, most notably English words, which are often +borrowed without orthographic assimilation into Thai in many contexts. We +identify the lack of foreign vocabulary in WangchanBERTa's tokenizer as the +main source of these shortcomings. We then expand WangchanBERTa's vocabulary +via vocabulary transfer from XLM-R's pretrained tokenizer and pretrain a new +model using the expanded tokenizer, starting from WangchanBERTa's checkpoint, +on a new dataset that is larger than the one used to train WangchanBERTa. Our +results show that our new pretrained model, PhayaThaiBERT, outperforms +WangchanBERTa in many downstream tasks and datasets. + +
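A minimal Hugging Face sketch of the kind of vocabulary expansion described above; the model identifiers are assumptions, and the actual transfer procedure in the paper (including how the new embedding rows are initialized from XLM-R) is more involved than this.

```python
from transformers import AutoTokenizer, AutoModelForMaskedLM

# Assumed checkpoints for illustration.
base = "airesearch/wangchanberta-base-att-spm-uncased"
tokenizer = AutoTokenizer.from_pretrained(base)
xlmr_tok = AutoTokenizer.from_pretrained("xlm-roberta-base")
model = AutoModelForMaskedLM.from_pretrained(base)

# Add tokens present in XLM-R's vocabulary but missing from the Thai tokenizer,
# then grow the embedding matrix so continued pretraining can learn them.
# (New rows are randomly initialized here; initializing them from XLM-R's
# embeddings, as vocabulary transfer would, is omitted.)
new_tokens = [t for t in xlmr_tok.get_vocab() if t not in tokenizer.get_vocab()]
num_added = tokenizer.add_tokens(new_tokens)
model.resize_token_embeddings(len(tokenizer))
print(f"added {num_added} tokens; new vocab size = {len(tokenizer)}")
```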
+
+
+
+
+ + ☆ CSMeD: Bridging the Dataset Gap in Automated Citation Screening for + Systematic Literature Reviews NeurIPS 2023 + + +
+ Systematic literature reviews (SLRs) play an essential role in summarising, +synthesising and validating scientific evidence. In recent years, there has +been a growing interest in using machine learning techniques to automate the +identification of relevant studies for SLRs. However, the lack of standardised +evaluation datasets makes comparing the performance of such automated +literature screening systems difficult. In this paper, we analyse the citation +screening evaluation datasets, revealing that many of the available datasets +are either too small, suffer from data leakage or have limited applicability to +systems treating automated literature screening as a classification task, as +opposed to, for example, a retrieval or question-answering task. To address +these challenges, we introduce CSMeD, a meta-dataset consolidating nine +publicly released collections, providing unified access to 325 SLRs from the +fields of medicine and computer science. CSMeD serves as a comprehensive +resource for training and evaluating the performance of automated citation +screening models. Additionally, we introduce CSMeD-FT, a new dataset designed +explicitly for evaluating the full text publication screening task. To +demonstrate the utility of CSMeD, we conduct experiments and establish +baselines on new datasets. + +
+
+ comment: Accepted at NeurIPS 2023 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ Analysis of Visual Features for Continuous Lipreading in Spanish + + +
+ During a conversation, our brain is responsible for combining information +obtained from multiple senses in order to improve our ability to understand the +message we are perceiving. Different studies have shown the importance of +presenting visual information in these situations. Nevertheless, lipreading is +a complex task whose objective is to interpret speech when audio is not +available. By dispensing with a sense as crucial as hearing, it will be +necessary to be aware of the challenge that this lack presents. In this paper, +we propose an analysis of different speech visual features with the intention +of identifying which of them is the best approach to capture the nature of lip +movements for natural Spanish and, in this way, dealing with the automatic +visual speech recognition task. In order to estimate our system, we present an +audiovisual corpus compiled from a subset of the RTVE database, which has been +used in the Albayz\'in evaluations. We employ a traditional system based on +Hidden Markov Models with Gaussian Mixture Models. Results show that, although +the task is difficult, in restricted conditions we obtain recognition results +which determine that using eigenlips in combination with deep features is the +best visual approach. + +
+
+ comment: Accepted in Proceedings of IberSpeech 2020 ( + https://www.isca-speech.org/archive/iberspeech_2021/gimenogomez21_iberspeech.html + ) +
+
+
+
+
+ + ☆ LIP-RTVE: An Audiovisual Database for Continuous Spanish in the Wild LREC 2022 + + +
+ Speech is considered a multi-modal process in which hearing and vision are +two fundamental pillars. In fact, several studies have demonstrated that the +robustness of Automatic Speech Recognition systems can be improved when audio +and visual cues are combined to represent the nature of speech. In addition, +Visual Speech Recognition, an open research problem whose purpose is to +interpret speech by reading the lips of the speaker, has been a focus of +interest in recent decades. Nevertheless, in order to estimate these systems +in the current Deep Learning era, large-scale databases are required. On the +other hand, while most of these databases are dedicated to English, other +languages lack sufficient resources. Thus, this paper presents a +semi-automatically annotated audiovisual database to deal with unconstrained +natural Spanish, providing 13 hours of data extracted from Spanish television. +Furthermore, baseline results for both speaker-dependent and +speaker-independent scenarios are reported using Hidden Markov Models, a +traditional paradigm that has been widely used in the field of Speech +Technologies. + +
+
+ comment: Accepted in Proceedings of LREC 2022 ( + https://aclanthology.org/2022.lrec-1.294 ) +
+
+
+
+
+ + ☆ How Far Have We Gone in Vulnerability Detection Using Large Language + Models + + +
+ As software becomes increasingly complex and prone to vulnerabilities, +automated vulnerability detection is critically important, yet challenging. +Given the significant successes of Large Language Models (LLMs) in various +tasks, there is growing anticipation of their efficacy in vulnerability +detection. However, a quantitative understanding of their potential in +vulnerability detection is still missing. To bridge this gap, we introduce a +comprehensive vulnerability benchmark VulBench. This benchmark aggregates +high-quality data from a wide range of CTF (Capture-the-Flag) challenges and +real-world applications, with annotations for each vulnerable function +detailing the vulnerability type and its root cause. Through our experiments +encompassing 16 LLMs and 6 state-of-the-art (SOTA) deep learning-based models +and static analyzers, we find that several LLMs outperform traditional deep +learning approaches in vulnerability detection, revealing an untapped potential +in LLMs. This work contributes to the understanding and utilization of LLMs for +enhanced software security. + +
+
+
+
+
+ + ☆ Visual Analytics for Generative Transformer Models + + +
+ While transformer-based models have achieved state-of-the-art results in a +variety of classification and generation tasks, their black-box nature makes +them challenging for interpretability. In this work, we present a novel visual +analytical framework to support the analysis of transformer-based generative +networks. In contrast to previous work, which has mainly focused on +encoder-based models, our framework is one of the first dedicated to supporting +the analysis of transformer-based encoder-decoder models and decoder-only +models for generative and classification tasks. Hence, we offer an intuitive +overview that allows the user to explore different facets of the model through +interactive visualization. To demonstrate the feasibility and usefulness of our +framework, we present three detailed case studies based on real-world NLP +research problems. + +
+
+ comment: 6 pages (reference excluded), 7 figures +
+
+
+
+
+ + ☆ nach0: Multimodal Natural and Chemical Languages Foundation Model + + +
+ Large Language Models (LLMs) have substantially driven scientific progress in +various domains, and many papers have demonstrated their ability to tackle +complex problems with creative solutions. Our paper introduces a new foundation +model, nach0, capable of solving various chemical and biological tasks: +biomedical question answering, named entity recognition, molecular generation, +molecular synthesis, attributes prediction, and others. nach0 is a multi-domain +and multi-task encoder-decoder LLM pre-trained on unlabeled text from +scientific literature, patents, and molecule strings to incorporate a range of +chemical and linguistic knowledge. We employed instruction tuning, where +specific task-related instructions are utilized to fine-tune nach0 for the +final set of tasks. To train nach0 effectively, we leverage the NeMo framework, +enabling efficient parallel optimization of both base and large model versions. +Extensive experiments demonstrate that our model outperforms state-of-the-art +baselines on single-domain and cross-domain tasks. Furthermore, it can generate +high-quality outputs in molecular and textual formats, showcasing its +effectiveness in multi-domain setups. + +
+
+ comment: Submitted to Nature Communications +
+
+
+
+
+ + ☆ IndoRobusta: Towards Robustness Against Diverse Code-Mixed Indonesian + Local Languages + + +
+ Significant progress has been made on Indonesian NLP. Nevertheless, +exploration of the code-mixing phenomenon in Indonesian is limited, despite +many languages being frequently mixed with Indonesian in daily conversation. In +this work, we explore code-mixing in Indonesian with four embedded languages, +i.e., English, Sundanese, Javanese, and Malay; and introduce IndoRobusta, a +framework to evaluate and improve the code-mixing robustness. Our analysis +shows that the pre-training corpus bias affects the model's ability to better +handle Indonesian-English code-mixing when compared to other local languages, +despite having higher language diversity. + +
+
+
+
+
+ + ☆ InterPrompt: Interpretable Prompting for Interrelated Interpersonal Risk + Factors in Reddit Posts + + +
+ Mental health professionals and clinicians have observed an upsurge of +mental disorders due to Interpersonal Risk Factors (IRFs). To simulate the +human-in-the-loop triaging scenario for early detection of mental health +disorders, we recognize textual indications to ascertain these IRFs: Thwarted +Belongingness (TBe) and Perceived Burdensomeness (PBu) within personal +narratives. In light of this, we use N-shot learning with the GPT-3 model on the +IRF dataset, and underscore the importance of fine-tuning the GPT-3 model to +incorporate the context-specific sensitivity and the interconnectedness of +textual cues that represent both IRFs. + In this paper, we introduce an Interpretable Prompting (InterPrompt) method +to boost the attention mechanism by fine-tuning the GPT-3 model. This allows a +more sophisticated level of language modification by adjusting the pre-trained +weights. Our model learns to detect usual patterns and underlying connections +across both IRFs, which leads to better system-level explainability and +trustworthiness. The results of our research demonstrate that all four variants +of the GPT-3 model, when fine-tuned with InterPrompt, perform considerably better +than the baseline methods, both in terms of classification and +explanation generation. + +
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ A Survey of Graph Meets Large Language Model: Progress and Future + Directions + + +
+ Graphs play a significant role in representing and analyzing complex +relationships in real-world applications such as citation networks, social +networks, and biological data. Recently, Large Language Models (LLMs), which +have achieved tremendous success in various domains, have also been leveraged +in graph-related tasks to surpass traditional Graph Neural Network (GNN) +based methods and yield state-of-the-art performance. In this survey, we +present a comprehensive review and analysis of existing methods that integrate +LLMs with graphs. First, we propose a new taxonomy, which organizes +existing methods into three categories based on the role (i.e., enhancer, +predictor, and alignment component) played by LLMs in graph-related tasks. Then +we systematically survey the representative methods along the three categories +of the taxonomy. Finally, we discuss the remaining limitations of existing +studies and highlight promising avenues for future research. The relevant +papers are summarized and will be continuously updated at: +https://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks. + +
+
+ comment: Work in progress; 13 pages, 5 figures +
+
+
+
+
+ + ☆ Problems of Non-equivalent Words in Technical Translation + + +
+ Translating words that have no equivalent in the target language is not easy, +and finding proper equivalents for such words is essential to render them +correctly and understandably. This article presents the thoughts and ideas of +several scholars on the common problems of non-equivalent words when translating +from English into Russian, and includes English and Russian examples. English is +spoken worldwide, with 1.35 billion English speakers and over 258 million Russian +speakers according to 2021 statistics. Inevitably, these billions of speakers +around the world interact and may cooperate in different settings, and in order +to understand one another they need a clear, mutually understood language. Such +understanding depends directly on translation knowledge, where linguists and +translators must work and conduct research to eradicate misunderstanding. +Misunderstandings arise most often with non-equivalent words, because every +nation has its own local and culture-internal terms, such as words for food, +garments, and cultural and traditional practices. Indeed, most of these words +have no equivalent in the target language, and equivalents must be worked out +for them so that both languages can be fully understood. Some of these +non-equivalent words have already been professionally rendered into the target +language, but many others remain to be rendered. Hence, this research paper +presents different ways and rules for rendering non-equivalent words from a +source language into a target language. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ The Obscure Limitation of Modular Multilingual Language Models + + +
+ We expose the limitation of modular multilingual language models (MLMs) in +multilingual inference scenarios with unknown languages. Existing evaluations +of modular MLMs exclude the involvement of language identification (LID) +modules, which obscures the performance of modular MLMs in realistic +multilingual scenarios. In this work, we showcase the effect of adding LID on the +multilingual evaluation of modular MLMs and provide discussions for closing the +performance gap caused by pipelining LID with modular MLMs. + +
+
+
+
+
+ + ☆ Beyond Turing: A Comparative Analysis of Approaches for Detecting + Machine-Generated Text + + +
+ Significant progress has been made on text generation by pre-trained language +models (PLMs), yet distinguishing between human and machine-generated text +poses an escalating challenge. This paper offers an in-depth evaluation of +three distinct methods used to address this task: traditional shallow learning, +Language Model (LM) fine-tuning, and Multilingual Model fine-tuning. These +approaches are rigorously tested on a wide range of machine-generated texts, +providing a benchmark of their competence in distinguishing between +human-authored and machine-authored linguistic constructs. The results reveal +considerable differences in performance across methods, thus emphasizing the +continued need for advancement in this crucial area of NLP. This study offers +valuable insights and paves the way for future research aimed at creating +robust and highly discriminative models. + +
+
+
+
+
+ + ☆ Utilizing Language Models for Tour Itinerary Recommendation IJCAI 2023 + + +
+ Tour itinerary recommendation involves planning a sequence of relevant +Points-of-Interest (POIs), which combines challenges from the fields of both +Operations Research (OR) and Recommendation Systems (RS). As an OR problem, +there is the need to maximize a certain utility (e.g., popularity of POIs in +the tour) while adhering to some constraints (e.g., maximum time for the tour). +As an RS problem, it is closely related to the problem of filtering or ranking a +subset of POIs that are relevant to a user and recommending them as part of an +itinerary. In this paper, we explore the use of language models for the task of +tour itinerary recommendation and planning. This task has the unique +requirement of recommending personalized POIs relevant to users and planning +these POIs as an itinerary that satisfies various constraints. We discuss some +approaches in this area, such as using word embedding techniques like Word2Vec +and GloVe for learning POI embeddings and transformer-based techniques like +BERT for generating itineraries. + +
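A small gensim sketch of the Word2Vec-over-POIs idea mentioned above; the POI names and check-in sequences are made up for illustration and are not data from the paper.

```python
from gensim.models import Word2Vec

# Hypothetical check-in histories: each "sentence" is one user's sequence of visited POIs.
poi_sequences = [
    ["museum_of_art", "riverside_park", "old_town_cafe"],
    ["riverside_park", "botanic_garden", "old_town_cafe"],
    ["museum_of_art", "botanic_garden", "city_lookout"],
]

# Skip-gram Word2Vec over POI sequences: POIs that co-occur in similar itinerary
# contexts end up with nearby embeddings, which can then seed recommendation/ranking.
model = Word2Vec(sentences=poi_sequences, vector_size=32, window=2,
                 min_count=1, sg=1, epochs=50, seed=0)
print(model.wv.most_similar("riverside_park", topn=2))
```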
+
+ comment: PMAI23 @IJCAI 2023 2nd International Workshop on Process Management + in the AI era +
+
+
+
+
+ + ☆ Advancing Transformer Architecture in Long-Context Large Language + Models: A Comprehensive Survey + + +
+ With the explosion of interest ignited by ChatGPT, Transformer-based Large Language Models +(LLMs) have paved a revolutionary path toward Artificial General Intelligence +(AGI) and have been applied in diverse areas such as knowledge bases, human +interfaces, and dynamic agents. However, a prevailing limitation exists: many +current LLMs, constrained by resources, are primarily pre-trained on shorter +texts, rendering them less effective for the longer-context prompts commonly +encountered in real-world settings. In this paper, we present a comprehensive +survey focusing on the advancement of model architecture in Transformer-based +LLMs to optimize long-context capabilities across all stages from pre-training +to inference. We first delineate and analyze the problems of handling +long-context input and output with current Transformer-based models. Then, +we offer a holistic taxonomy to navigate the landscape of Transformer +architecture upgrades that address these problems. Afterward, we investigate +widely used evaluation necessities tailored for long-context +LLMs, including datasets, metrics, and baseline models, as well as +optimization toolkits such as libraries, systems, and compilers that augment LLMs' +efficiency and efficacy across different stages. Finally, we further discuss +the predominant challenges and potential avenues for future research in this +domain. Additionally, we have established a repository where we curate relevant +literature with real-time updates at +https://github.com/Strivin0311/long-llms-learning. + +
+
+ comment: 35 pages, 3 figures, 4 tables +
+
+
+
+
+ + ☆ Do Smaller Language Models Answer Contextualised Questions Through + Memorisation Or Generalisation? + + +
+ A distinction is often drawn between a model's ability to predict a label for +an evaluation sample that is directly memorised from highly similar training +samples versus an ability to predict the label via some method of +generalisation. In the context of using Language Models for question-answering, +discussion continues to occur as to the extent to which questions are answered +through memorisation. We consider this issue for questions that would ideally +be answered through reasoning over an associated context. We propose a method +of identifying evaluation samples for which it is very unlikely our model would +have memorised the answers. Our method is based on semantic similarity of input +tokens and label tokens between training and evaluation samples. We show that +our method offers advantages upon some prior approaches in that it is able to +surface evaluation-train pairs that have overlap in either contiguous or +discontiguous sequences of tokens. We use this method to identify unmemorisable +subsets of our evaluation datasets. We train two Language Models in a multitask +fashion whereby the second model differs from the first only in that it has two +additional datasets added to the training regime that are designed to impart +simple numerical reasoning strategies of a sort known to improve performance on +some of our evaluation datasets but not on others. We then show that there is +performance improvement between the two models on the unmemorisable subsets of +the evaluation datasets that were expected to benefit from the additional +training datasets. Specifically, performance on unmemorisable subsets of two of +our evaluation datasets, DROP and ROPES significantly improves by 9.0%, and +25.7% respectively while other evaluation datasets have no significant change +in performance. + +
+
+
+
+
+ + ☆ Modeling Political Orientation of Social Media Posts: An Extended + Analysis + + +
+ Developing machine learning models to characterize political polarization on +online social media presents significant challenges. These challenges mainly +stem from various factors such as the lack of annotated data, presence of noise +in social media datasets, and the sheer volume of data. The common research +practice typically examines the biased structure of online user communities for +a given topic or qualitatively measuring the impacts of polarized topics on +social media. However, there is limited work focusing on analyzing polarization +at the ground-level, specifically in the social media posts themselves. Such +existing analysis heavily relies on annotated data, which often requires +laborious human labeling, offers labels only to specific problems, and lacks +the ability to determine the near-future bias state of a social media +conversations. Understanding the degree of political orientation conveyed in +social media posts is crucial for quantifying the bias of online user +communities and investigating the spread of polarized content. In this work, we +first introduce two heuristic methods that leverage on news media bias and post +content to label social media posts. Next, we compare the efficacy and quality +of heuristically labeled dataset with a randomly sampled human-annotated +dataset. Additionally, we demonstrate that current machine learning models can +exhibit improved performance in predicting political orientation of social +media posts, employing both traditional supervised learning and few-shot +learning setups. We conduct experiments using the proposed heuristic methods +and machine learning approaches to predict the political orientation of posts +collected from two social media forums with diverse political ideologies: Gab +and Twitter. + +
+
+
+
+
+ + ☆ AcademicGPT: Empowering Academic Research + + +
+ Large Language Models (LLMs) have demonstrated exceptional capabilities +across various natural language processing tasks. Yet, many of these advanced +LLMs are tailored for broad, general-purpose applications. In this technical +report, we introduce AcademicGPT, designed specifically to empower academic +research. AcademicGPT is a continual training model derived from LLaMA2-70B. +Our training corpus mainly consists of academic papers, theses, content from +academic domains, high-quality Chinese data, and other sources. While it may not be +extensive in data scale, AcademicGPT marks our initial venture into a +domain-specific GPT tailored for the research area. We evaluate AcademicGPT on +several established public benchmarks such as MMLU and CEval, as well as on +some specialized academic benchmarks like PubMedQA, SCIEval, and our +newly-created ComputerScienceQA, to demonstrate its abilities, from general +knowledge to Chinese-language competence to academic skills. Building upon +AcademicGPT's foundation model, we also developed several applications catering +to the academic area, including General Academic Question Answering, +AI-assisted Paper Reading, Paper Review, and AI-assisted Title and Abstract +Generation. + +
+
+ comment: Technical Report. arXiv admin note: text overlap with + arXiv:2310.12081, arXiv:2310.10053 by other authors +
+
+
+
+
+ + ☆ Noise in Relation Classification Dataset TACRED: Characterization and + Reduction + + +
+ The overarching objective of this paper is two-fold. First, to explore +model-based approaches to characterize the primary cause of the noise in the +RE dataset TACRED. Second, to identify the potentially noisy instances. Towards +the first objective, we analyze predictions and performance of state-of-the-art +(SOTA) models to identify the root cause of noise in the dataset. Our analysis +of TACRED shows that the majority of the noise in the dataset originates from +the instances labeled as no-relation, which are negative examples. For the +second objective, we explore two nearest-neighbor-based strategies to +automatically identify potentially noisy examples for elimination and +reannotation. Our first strategy, referred to as the Intrinsic Strategy (IS), is +based on the assumption that positive examples are clean. Thus, we use +false-negative predictions to identify noisy negative examples. Our +second approach, referred to as the Extrinsic Strategy (ES), is based on using a clean +subset of the dataset to identify potentially noisy negative examples. Finally, +we retrained the SOTA models on the eliminated and reannotated datasets. Our +empirical results based on two SOTA models trained on TACRED-E following the IS +show an average 4% F1-score improvement, whereas reannotation (TACRED-R) does +not improve the original results. However, following the ES, SOTA models show +average F1-score improvements of 3.8% and 4.4% when trained on the +eliminated (TACRED-EN) and reannotated (TACRED-RN) datasets, respectively. We +further extended the ES to cleaning positive examples as well, which resulted +in average performance improvements of 5.8% and 5.6% for the eliminated +(TACRED-ENP) and reannotated (TACRED-RNP) datasets, respectively. + +
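To illustrate the nearest-neighbor flavor of such strategies, here is a generic scikit-learn sketch that flags negative examples whose neighborhoods are dominated by positive examples; the thresholds and the exact criterion are assumptions for illustration, not the paper's procedure.

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def flag_noisy_negatives(embeddings, labels, k=5, min_positive_neighbors=3):
    """Heuristic sketch: a no-relation ("negative") example whose nearest neighbors
    are mostly positively labeled is flagged as potentially mislabeled."""
    positive = labels != "no_relation"
    nn = NearestNeighbors(n_neighbors=k + 1).fit(embeddings)
    _, idx = nn.kneighbors(embeddings)        # idx[:, 0] is each point itself
    flagged = []
    for i, neighbors in enumerate(idx[:, 1:]):
        if not positive[i] and positive[neighbors].sum() >= min_positive_neighbors:
            flagged.append(i)
    return flagged

# Usage (hypothetical inputs): sentence embeddings from a relation-classification
# encoder and the corresponding TACRED labels.
# noisy = flag_noisy_negatives(sentence_embeddings, np.array(tacred_labels))
```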
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ ATLANTIC: Structure-Aware Retrieval-Augmented Language Model for + Interdisciplinary Science + + +
+ Large language models record impressive performance on many natural language +processing tasks. However, their knowledge capacity is limited to the +pretraining corpus. Retrieval augmentation offers an effective solution by +retrieving context from external knowledge sources to complement the language +model. However, existing retrieval augmentation techniques ignore the +structural relationships between these documents. Furthermore, retrieval models +are not explored much in scientific tasks, especially in regard to the +faithfulness of retrieved documents. In this paper, we propose a novel +structure-aware retrieval augmented language model that accommodates document +structure during retrieval augmentation. We create a heterogeneous document +graph capturing multiple types of relationships (e.g., citation, co-authorship, +etc.) that connect documents from more than 15 scientific disciplines (e.g., +Physics, Medicine, Chemistry, etc.). We train a graph neural network on the +curated document graph to act as a structural encoder for the corresponding +passages retrieved during the model pretraining. Particularly, along with text +embeddings of the retrieved passages, we obtain structural embeddings of the +documents (passages) and fuse them together before feeding them to the language +model. We evaluate our model extensively on various scientific benchmarks that +include science question-answering and scientific document classification +tasks. Experimental results demonstrate that structure-aware retrieval improves +retrieving more coherent, faithful and contextually relevant passages, while +showing a comparable performance in the overall accuracy. + +
+
+
+
+
+ + ☆ Enabling On-Device Large Language Model Personalization with + Self-Supervised Data Selection and Synthesis + + +
+ After a large language model (LLM) is deployed on edge devices, it is +desirable for these devices to learn from user-generated conversation data to +generate user-specific and personalized responses in real-time. However, +user-generated data usually contains sensitive and private information, and +uploading such data to the cloud for annotation is not preferred if not +prohibited. While it is possible to obtain annotation locally by directly +asking users to provide preferred responses, such annotations have to be sparse +to not affect user experience. In addition, the storage of edge devices is +usually too limited to enable large-scale fine-tuning with full user-generated +data. It remains an open question how to enable on-device LLM personalization, +considering sparse annotation and limited on-device storage. In this paper, we +propose a novel framework to select and store the most representative data +online in a self-supervised way. Such data has a small memory footprint and +allows infrequent requests of user annotations for further fine-tuning. To +enhance fine-tuning quality, multiple semantically similar pairs of question +texts and expected responses are generated using the LLM. Our experiments show +that the proposed framework achieves the best user-specific content-generating +capability (accuracy) and fine-tuning speed (performance) compared with vanilla +baselines. To the best of our knowledge, this is the very first on-device LLM +personalization framework. + +
+
+ comment: 6 pages, 3 figures, 3 tables +
+
+
+
+
+ + ☆ Attribution and Alignment: Effects of Local Context Repetition on + Utterance Production and Comprehension in Dialogue CoNLL 2023 + + +
+ Language models are often used as the backbone of modern dialogue systems. +These models are pre-trained on large amounts of written fluent language. +Repetition is typically penalised when evaluating language model generations. +However, it is a key component of dialogue. Humans use local and partner +specific repetitions; these are preferred by human users and lead to more +successful communication in dialogue. In this study, we evaluate (a) whether +language models produce human-like levels of repetition in dialogue, and (b) +what are the processing mechanisms related to lexical re-use they use during +comprehension. We believe that such joint analysis of model production and +comprehension behaviour can inform the development of cognitively inspired +dialogue generation systems. + +
+
+ comment: CoNLL 2023 +
+
+
+
+
+ + ☆ Beyond Text: Unveiling Multimodal Proficiency of Large Language Models + with MultiAPI Benchmark + + +
+ The proliferation of Large Language Models like ChatGPT has significantly +advanced language understanding and generation, impacting a broad spectrum of +applications. However, these models predominantly excel in text-based tasks, +overlooking the complexity of real-world multimodal information. This study +introduces MultiAPI, a pioneering comprehensive large-scale API benchmark +dataset aimed at expanding LLMs' proficiency in multimodal contexts. Developed +collaboratively through ChatGPT, MultiAPI consists of 235 diverse API calls and +2,038 contextual prompts, offering a unique platform for evaluating +tool-augmented LLMs on multimodal tasks. Through comprehensive +experiments, our findings reveal that while LLMs demonstrate proficiency in API +call decision-making, they face challenges in domain identification, function +selection, and argument generation. Moreover, we find, surprisingly, that +auxiliary context can actually impair performance. An in-depth error +analysis paves the way for a new paradigm to address these challenges, +suggesting a potential direction for future LLM research. + +
+
+ comment: Work in Progress +
+
+
+
+
+ + ☆ Systematic word meta-sense extension + + +
+ The meaning of polysemous words often varies in a highly productive yet +predictable way. Generalizing the regularity between conventional senses to +derive novel word meaning is crucial for automated processing of non-literal +language uses such as figurative expressions. We introduce a novel task called +systematic word meta-sense extension (SWORME) to test and improve language +models' ability to extend word meaning to denote new semantic domains (also +called meta-senses) that bear regular semantic relations with existing senses. +We found that language models prefer incremental lexical semantic change toward +conceptually similar meta-senses such as logical metonymy, and are much worse +at predicting highly non-literal meaning extensions such as metaphors. We +propose a novel analogy-based method of word meaning extension, and show that +it effectively improves language model systematicity in making both gradual and +radical types of meta-sense extension. We further demonstrate that learning +systematic meta-sense extensions benefits language models on multiple +benchmarks of figurative language understanding. + +
+
+
+
+
+ + ☆ Unsupervised Graph Attention Autoencoder for Attributed Networks using + K-means Loss + + +
+ Multimodal Sentiment Analysis (MSA) has recently become a central research +direction for many real-world applications. This proliferation is due to the +fact that opinions are central to almost all human activities and are key +influencers of our behaviors. In addition, the recent deployment of Deep +Learning-based (DL) models has proven their high efficiency for a wide range of +Western languages. In contrast, Arabic DL-based multimodal sentiment analysis +(MSA) is still in its infancy, mainly due to the lack of standard +datasets. In this paper, our investigation is twofold. +First, we design a pipeline that helps build our Arabic multimodal dataset, +leveraging both state-of-the-art transformers and feature extraction tools +within word alignment techniques. Thereafter, we validate our dataset using a +state-of-the-art transformer-based model dealing with multimodality. Despite +the small size of the resulting dataset, experiments show that Arabic +multimodality is very promising. + +
+
+ comment: 7 pages, 5 Figures +
+
+
+
+
+ + GAIA: a benchmark for General AI Assistants + + +
+ We introduce GAIA, a benchmark for General AI Assistants that, if solved, +would represent a milestone in AI research. GAIA proposes real-world questions +that require a set of fundamental abilities such as reasoning, multi-modality +handling, web browsing, and generally tool-use proficiency. GAIA questions are +conceptually simple for humans yet challenging for most advanced AIs: we show +that human respondents obtain 92% vs. 15% for GPT-4 equipped with plugins. +This notable performance disparity contrasts with the recent trend of LLMs +outperforming humans on tasks requiring professional skills in e.g. law or +chemistry. GAIA's philosophy departs from the current trend in AI benchmarks, +which suggests targeting tasks that are ever more difficult for humans. We posit +that the advent of Artificial General Intelligence (AGI) hinges on a system's +capability to exhibit robustness similar to that of the average human on such +questions. Using GAIA's methodology, we devise 466 questions and their answers. +We release our questions while retaining answers to 300 of them to power a +leaderboard available at https://huggingface.co/gaia-benchmark. + +
+
+
+
+
+ + ♻ ☆ Banach-Tarski Embeddings and Transformers + + +
+ We introduce a new construction of embeddings of arbitrary recursive data +structures into high dimensional vectors. These embeddings provide an +interpretable model for the latent state vectors of transformers. We +demonstrate that these embeddings can be decoded to the original data structure +when the embedding dimension is sufficiently large. This decoding algorithm has +a natural implementation as a transformer. We also show that these embedding +vectors can be manipulated directly to perform computations on the underlying +data without decoding. As an example we present an algorithm that constructs +the embedded parse tree of an embedded token sequence using only vector +operations in embedding space. + +
+
+ comment: 22 pages, 7 figures. v2: Fixed order of matrix multiplication in + section 2.4 +
+
+
+
+
+ + ♻ ☆ Editing Personality for LLMs + + +
+ This paper introduces an innovative task focused on editing the personality +traits of Large Language Models (LLMs). This task seeks to adjust the models' +responses to opinion-related questions on specified topics since an +individual's personality often manifests in the form of their expressed +opinions, thereby showcasing different personality traits. Specifically, we +construct a new benchmark dataset PersonalityEdit to address this task. Drawing +on the theory in Social Psychology, we isolate three representative traits, +namely Neuroticism, Extraversion, and Agreeableness, as the foundation for our +benchmark. We then gather data using GPT-4, generating responses that not only +align with a specified topic but also embody the targeted personality trait. We +conduct comprehensive experiments involving various baselines and discuss the +representation of personality behavior in LLMs. Our intriguing findings uncover +potential challenges of the proposed task, illustrating several remaining +issues. We anticipate that our work can provide the NLP community with +insights. Code and datasets will be released at +https://github.com/zjunlp/EasyEdit. + +
+
+ comment: Work in progress, add more experiments +
+
+
+
+
+ + ♻ ☆ Unveiling the Pitfalls of Knowledge Editing for Large Language Models + + +
+ As the cost associated with fine-tuning Large Language Models (LLMs) +continues to rise, recent research efforts have pivoted towards developing +methodologies to edit implicit knowledge embedded within LLMs. Yet, a dark cloud +still lingers overhead -- will knowledge editing trigger a +butterfly effect? It remains unclear whether knowledge editing might +introduce side effects that pose potential risks. This paper pioneers +the investigation into the potential pitfalls associated with knowledge editing +for LLMs. To achieve this, we introduce new benchmark datasets and propose +innovative evaluation metrics. Our results underline two pivotal concerns: (1) +Knowledge Conflict: Editing groups of facts that logically clash can magnify +the inherent inconsistencies in LLMs -- a facet neglected by previous methods. (2) +Knowledge Distortion: Altering parameters with the aim of editing factual +knowledge can irrevocably warp the innate knowledge structure of LLMs. +Experimental results vividly demonstrate that knowledge editing might +inadvertently cast a shadow of unintended consequences on LLMs, which warrants +attention and effort in future work. Code is available at +https://github.com/zjunlp/PitfallsKnowledgeEditing. + +
+
+ comment: Work in progress, add more experiments +
+
+
+
+
+ + ♻ ☆ Relphormer: Relational Graph Transformer for Knowledge Graph + Representations + + +
+ Transformers have achieved remarkable performance in widespread fields, +including natural language processing, computer vision and graph mining. +However, vanilla Transformer architectures have not yielded promising +improvements in the Knowledge Graph (KG) representations, where the +translational distance paradigm dominates this area. Note that vanilla +Transformer architectures struggle to capture the intrinsically heterogeneous +structural and semantic information of knowledge graphs. To this end, we +propose a new variant of Transformer for knowledge graph representations dubbed +Relphormer. Specifically, we introduce Triple2Seq which can dynamically sample +contextualized sub-graph sequences as the input to alleviate the heterogeneity +issue. We propose a novel structure-enhanced self-attention mechanism to encode +the relational information and keep the semantic information within entities +and relations. Moreover, we utilize masked knowledge modeling for general +knowledge graph representation learning, which can be applied to various +KG-based tasks including knowledge graph completion, question answering, and +recommendation. Experimental results on six datasets show that Relphormer can +obtain better performance compared with baselines. Code is available in +https://github.com/zjunlp/Relphormer. + +
+
+ comment: Neurocomputing 2023 +
+
+
+
+
+ + ♻ ☆ LyricWhiz: Robust Multilingual Zero-shot Lyrics Transcription by + Whispering to ChatGPT + + +
+ We introduce LyricWhiz, a robust, multilingual, and zero-shot automatic +lyrics transcription method achieving state-of-the-art performance on various +lyrics transcription datasets, even in challenging genres such as rock and +metal. Our novel, training-free approach utilizes Whisper, a weakly supervised +robust speech recognition model, and GPT-4, today's most performant chat-based +large language model. In the proposed method, Whisper functions as the "ear" by +transcribing the audio, while GPT-4 serves as the "brain," acting as an +annotator with a strong performance for contextualized output selection and +correction. Our experiments show that LyricWhiz significantly reduces Word +Error Rate compared to existing methods in English and can effectively +transcribe lyrics across multiple languages. Furthermore, we use LyricWhiz to +create the first publicly available, large-scale, multilingual lyrics +transcription dataset with a CC-BY-NC-SA copyright license, based on +MTG-Jamendo, and offer a human-annotated subset for noise level estimation and +evaluation. We anticipate that our proposed method and dataset will advance the +development of multilingual lyrics transcription, a challenging and emerging +task. + +
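A minimal sketch of the ear/brain pipeline described above, using the openai-whisper package for the transcription stage; the correction prompt is invented for illustration, and the actual chat-model call (GPT-4 in the paper) is left abstract as a hypothetical `call_chat_model` function.

```python
import whisper

def transcribe_lyrics(audio_path: str) -> str:
    """'Ear' stage: weakly supervised ASR with Whisper."""
    model = whisper.load_model("large")
    return model.transcribe(audio_path)["text"]

def build_correction_prompt(raw_transcript: str) -> str:
    """'Brain' stage: ask a chat LLM to select and correct the lyrics.
    The paper's actual prompts differ; this is only a placeholder."""
    return (
        "Below is an automatic transcription of song lyrics. "
        "Fix obvious recognition errors and format the result as lyric lines:\n\n"
        + raw_transcript
    )

# raw = transcribe_lyrics("song.mp3")
# corrected = call_chat_model(build_correction_prompt(raw))   # hypothetical LLM call
```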
+
+ comment: 9 pages, 2 figures, 5 tables, accepted by ISMIR 2023 +
+
+
+
+
+ + ♻ ☆ Influencer Videos: Unboxing the Mystique + + +
+ Influencer marketing has become a very popular tool to reach customers. +Despite the rapid growth in influencer videos, there has been little research +on the effectiveness of their constituent features in explaining video +engagement. We study YouTube influencers and analyze their unstructured video +data across text, audio and images using an "interpretable deep learning" +framework that accomplishes both goals of prediction and interpretation. Our +prediction-based approach analyzes unstructured data and finds that "what is +said" in words (text) is more influential than "how it is said" in imagery +(images) or acoustics (audio). Our novel interpretation-based approach is +implemented after completion of model prediction by analyzing the same source +of unstructured data to measure importance attributed to the video features. We +eliminate several spurious relationships in two steps, identifying a subset of +relationships which are confirmed using theory. We uncover novel findings that +establish distinct associations for measures of shallow and deep engagement +based on the dual-system framework of human thinking. Our approach is validated +using simulated data, and we discuss the learnings from our findings for +influencers and brands. + +
+
+ comment: 45 pages, Online Appendix +
+
+
+
+
+ + ♻ ☆ Open Sesame! Universal Black Box Jailbreaking of Large Language Models + + +
+ Large language models (LLMs), designed to provide helpful and safe responses, +often rely on alignment techniques to align with user intent and social +guidelines. Unfortunately, this alignment can be exploited by malicious actors +seeking to manipulate an LLM's outputs for unintended purposes. In this paper +we introduce a novel approach that employs a genetic algorithm (GA) to +manipulate LLMs when model architecture and parameters are inaccessible. The GA +attack works by optimizing a universal adversarial prompt that -- when combined +with a user's query -- disrupts the attacked model's alignment, resulting in +unintended and potentially harmful outputs. Our novel approach systematically +reveals a model's limitations and vulnerabilities by uncovering instances where +its responses deviate from expected behavior. Through extensive experiments we +demonstrate the efficacy of our technique, thus contributing to the ongoing +discussion on responsible AI development by providing a diagnostic tool for +evaluating and enhancing alignment of LLMs with human intent. To our knowledge +this is the first automated universal black box jailbreak attack. + +
+
+
+
+
+ + ♻ ☆ Harnessing the Power of Large Language Models for Empathetic Response + Generation: Empirical Investigations and Improvements EMNLP 2023 + + +
+ Empathetic dialogue is an indispensable part of building harmonious social +relationships and contributes to the development of helpful AI. Previous +approaches are mainly based on fine-tuning small-scale language models. With +the advent of ChatGPT, the application of large language models (LLMs) in this +field has attracted great attention. This work empirically investigates the +performance of LLMs in generating empathetic responses and proposes three +improvement methods: semantically similar in-context learning, two-stage +interactive generation, and combination with a knowledge base. Extensive +experiments show that LLMs can significantly benefit from our proposed methods +and are able to achieve state-of-the-art performance in both automatic and +human evaluations. Additionally, we explore the possibility of GPT-4 simulating +human evaluators. +
+
+ comment: the Findings of EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ BigTranslate: Augmenting Large Language Models with Multilingual + Translation Capability over 100 Languages + + +
+ Large language models (LLMs) demonstrate promising translation performance +among various natural languages. However, many LLMs, especially open-sourced +ones such as BLOOM and LLaMA, are English-dominant and support only dozens of +natural languages, leaving the potential of LLMs for language translation less +explored. In this work, we present BigTranslate, which adapts LLaMA, a model +covering only 20 languages, and enhances it with multilingual translation +capability for more than 100 languages. BigTranslate is built upon LLaMA-13B +and is optimized in three steps. First, we continue training LLaMA with massive +Chinese monolingual data. Second, we continue training the model with a +large-scale parallel dataset that covers 102 natural languages. Third, we +instruct-tune the foundation model with multilingual translation instructions, +leading to our BigTranslate model. Preliminary experiments on multilingual +translation show that BigTranslate performs comparably with ChatGPT and Google +Translate in many languages and even outperforms ChatGPT in 8 language pairs. +We release the BigTranslate model and hope it can advance research progress. +
+
+ comment: 16 pages, 4 figures. Our model is available at + https://github.com/ZNLP/BigTranslate +
+
+
+
+
+ + ♻ ☆ Unified Segment-to-Segment Framework for Simultaneous Sequence + Generation + + +
+ Simultaneous sequence generation is a pivotal task for real-time scenarios, +such as streaming speech recognition, simultaneous machine translation and +simultaneous speech translation, where the target sequence is generated while +receiving the source sequence. The crux of achieving high-quality generation +with low latency lies in identifying the optimal moments for generating, +accomplished by learning a mapping between the source and target sequences. +However, existing methods often rely on task-specific heuristics for different +sequence types, limiting the model's capacity to adaptively learn the +source-target mapping and hindering the exploration of multi-task learning for +various simultaneous tasks. In this paper, we propose a unified +segment-to-segment framework (Seg2Seg) for simultaneous sequence generation, +which learns the mapping in an adaptive and unified manner. During the process +of simultaneous generation, the model alternates between waiting for a source +segment and generating a target segment, making the segment serve as the +natural bridge between the source and target. To accomplish this, Seg2Seg +introduces a latent segment as the pivot between source and target and explores +all potential source-target mappings via the proposed expectation training, +thereby learning the optimal moments for generating. Experiments on multiple +simultaneous generation tasks demonstrate that Seg2Seg achieves +state-of-the-art performance and exhibits better generality across various +tasks. +
+
+ comment: Grammatical errors prevent the article from being indexed. This is + not a problem that can be solved by replacing it with a new version
+
+
+
+
+ + ♻ ☆ Personas as a Way to Model Truthfulness in Language Models + + +
+ Large Language Models (LLMs) are trained on vast amounts of text from the +internet, which contains both factual and misleading information about the +world. Can language models discern truth from falsehood in this contradictory +data? Expanding on the view that LLMs can model different communicative agents, +we present the persona hypothesis: LLMs can cluster agents into personas using +common features of their generations. For instance, a truthful persona is a +group of agents that are likely to produce truthful text and that share similar +features like formal writing styles and scientific references. By modeling this +persona, LLMs can generalize truthfulness beyond the specific contexts in which +each agent generated the training text. For example, the model can infer that +the agent ``Wikipedia'' will behave truthfully on topics that were only +generated by ``Science'' because they both belong to the truthful persona. We +show evidence for the persona hypothesis via two observations: (1) we can probe +whether a model's answer will be truthful before it is generated; (2) +finetuning a model on a set of facts improves its truthfulness on unseen +topics. Next, using arithmetic as a synthetic environment, we show that +language models can separate true and false statements, and generalize +truthfulness across agents; but only if agents in the training data share a +truthful generative process that enables the creation of a truthful persona. +Overall, our findings suggest that models can exploit hierarchical structures +in the data to learn abstract concepts like truthfulness. +
+
+
+
+
+ + ♻ ☆ Psychoacoustic Challenges Of Speech Enhancement On VoIP Platforms + + +
+ Within the ambit of VoIP (Voice over Internet Protocol) telecommunications, +the complexities introduced by acoustic transformations merit rigorous +analysis. This research, rooted in the exploration of proprietary sender-side +denoising effects, meticulously evaluates platforms such as Google Meets and +Zoom. The study draws upon the Deep Noise Suppression (DNS) 2020 dataset, +ensuring a structured examination tailored to various denoising settings and +receiver interfaces. A methodological novelty is introduced via the Oaxaca +decomposition, traditionally an econometric tool, repurposed herein to analyze +acoustic-phonetic perturbations within VoIP systems. To further ground the +implications of these transformations, psychoacoustic metrics, specifically +PESQ and STOI, were harnessed to furnish a comprehensive understanding of +speech alterations. Cumulatively, the insights garnered underscore the +intricate landscape of VoIP-influenced acoustic dynamics. In addition to the +primary findings, a multitude of metrics are reported, extending the research +purview. Moreover, out-of-domain benchmarking for both time and time-frequency +domain speech enhancement models is included, thereby enhancing the depth and +applicability of this inquiry. Repository: +github.com/deepology/VoIP-DNS-Challenge + +
+
+
+
+
+ + ♻ ☆ A Systematic Review of Aspect-based Sentiment Analysis (ABSA): Domains, + Methods, and Trends + + +
+ Aspect-based Sentiment Analysis (ABSA) is a type of fine-grained sentiment +analysis (SA) that identifies aspects and the associated opinions from a given +text. In the digital era, ABSA gained increasing popularity and applications in +mining opinionated text data to obtain insights and support decisions. ABSA +research employs linguistic, statistical, and machine-learning approaches and +utilises resources such as labelled datasets, aspect and sentiment lexicons and +ontology. By its nature, ABSA is domain-dependent and can be sensitive to the +impact of misalignment between the resource and application domains. However, +to our knowledge, this topic has not been explored by the existing ABSA +literature reviews. In this paper, we present a Systematic Literature Review +(SLR) of ABSA studies with a focus on the research application domain, dataset +domain, and the research methods to examine their relationships and identify +trends over time. Our results suggest a number of potential systemic issues in +the ABSA research literature, including the predominance of the +``product/service review'' dataset domain among the majority of studies that +did not have a specific research application domain, coupled with the +prevalence of dataset-reliant methods such as supervised machine learning. This +review makes a number of unique contributions to the ABSA research field: 1) To +our knowledge, it is the first SLR that links the research domain, dataset +domain, and research method through a systematic perspective; 2) it is one of +the largest scoped SLR on ABSA, with 519 eligible studies filtered from 4191 +search results without time constraint; and 3) our review methodology adopted +an innovative automatic filtering process based on PDF-mining, which enhanced +screening quality and reliability. Suggestions and our review limitations are +also discussed. + +
+
+
+
+
+ + ♻ ☆ Exponentially Faster Language Modelling + + +
+ Language models only really need to use an exponential fraction of their +neurons for individual inferences. As proof, we present UltraFastBERT, a BERT +variant that uses 0.3% of its neurons during inference while performing on par +with similar BERT models. UltraFastBERT selectively engages just 12 out of 4095 +neurons for each layer inference. This is achieved by replacing feedforward +networks with fast feedforward networks (FFFs). While no truly efficient +implementation currently exists to unlock the full acceleration potential of +conditional neural execution, we provide high-level CPU code achieving 78x +speedup over the optimized baseline feedforward implementation, and a PyTorch +implementation delivering 40x speedup over the equivalent batched feedforward +inference. We publish our training code, benchmarking setup, and model weights. + +
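The conditional execution idea can be illustrated with a toy fast feedforward (FFF) layer: a depth-12 binary tree of node neurons routes each input down a single path, so only 12 of the 4095 node neurons are consulted per input. This is a simplified sketch of the concept, not the UltraFastBERT implementation or its optimized CPU kernels.

```python
# Toy fast-feedforward (FFF) layer: a depth-d binary tree of node neurons
# routes each input to one leaf neuron, so only d node neurons plus one leaf
# neuron fire per input. Illustrative only.
import numpy as np

def gelu(z):
    return 0.5 * z * (1.0 + np.tanh(np.sqrt(2 / np.pi) * (z + 0.044715 * z**3)))

class FFFLayer:
    def __init__(self, dim, depth, rng=np.random.default_rng(0)):
        self.depth = depth
        n_nodes, n_leaves = 2**depth - 1, 2**depth
        self.node_w = rng.normal(size=(n_nodes, dim)) / np.sqrt(dim)   # routing neurons
        self.leaf_in = rng.normal(size=(n_leaves, dim)) / np.sqrt(dim) # one tiny neuron per leaf
        self.leaf_out = rng.normal(size=(n_leaves, dim)) / np.sqrt(dim)

    def forward(self, x):
        node = 0
        for _ in range(self.depth):                     # d conditional decisions
            node = 2 * node + (2 if self.node_w[node] @ x > 0 else 1)
        leaf = node - (2**self.depth - 1)               # index of the chosen leaf
        return gelu(self.leaf_in[leaf] @ x) * self.leaf_out[leaf]

layer = FFFLayer(dim=64, depth=12)   # 4095 node neurons, only 12 consulted per input
print(layer.forward(np.ones(64)).shape)
```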
+
+
+
+
+ + ♻ ☆ Efficient Streaming Language Models with Attention Sinks + + +
+ Deploying Large Language Models (LLMs) in streaming applications such as +multi-round dialogue, where long interactions are expected, is urgently needed +but poses two major challenges. Firstly, during the decoding stage, caching +previous tokens' Key and Value states (KV) consumes extensive memory. Secondly, +popular LLMs cannot generalize to longer texts than the training sequence +length. Window attention, where only the most recent KVs are cached, is a +natural approach -- but we show that it fails when the text length surpasses +the cache size. We observe an interesting phenomenon, namely attention sink, +that keeping the KV of initial tokens will largely recover the performance of +window attention. In this paper, we first demonstrate that the emergence of +attention sink is due to the strong attention scores towards initial tokens as +a ``sink'' even if they are not semantically important. Based on the above +analysis, we introduce StreamingLLM, an efficient framework that enables LLMs +trained with a finite length attention window to generalize to infinite +sequence lengths without any fine-tuning. We show that StreamingLLM can enable +Llama-2, MPT, Falcon, and Pythia to perform stable and efficient language +modeling with up to 4 million tokens and more. In addition, we discover that +adding a placeholder token as a dedicated attention sink during pre-training +can further improve streaming deployment. In streaming settings, StreamingLLM +outperforms the sliding window recomputation baseline by up to 22.2x speedup. +Code and datasets are provided at https://github.com/mit-han-lab/streaming-llm. + +
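The cache policy implied by attention sinks is easy to sketch: always retain the KV entries of the first few tokens and a sliding window of the most recent ones. The snippet below treats KV entries as opaque objects and ignores position handling and the attention computation itself; it illustrates the eviction rule, not the released implementation.

```python
# Sketch of the cache policy behind attention sinks: keep the KV entries of the
# first `n_sink` tokens forever, plus a sliding window of the most recent ones.
class SinkKVCache:
    def __init__(self, n_sink: int = 4, window: int = 1020):
        self.n_sink, self.window = n_sink, window
        self.entries = []                      # one (key, value) pair per token

    def append(self, kv):
        self.entries.append(kv)
        if len(self.entries) > self.n_sink + self.window:
            # evict the oldest non-sink entry, never the initial "sink" tokens
            self.entries = self.entries[: self.n_sink] + self.entries[-self.window :]

    def __len__(self):
        return len(self.entries)

cache = SinkKVCache(n_sink=4, window=8)
for t in range(20):
    cache.append(("k%d" % t, "v%d" % t))
print([k for k, _ in cache.entries])           # first 4 tokens + last 8 tokens
```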
+
+
+
+
+ + ♻ ☆ The Short Text Matching Model Enhanced with Knowledge via Contrastive + Learning + + +
+ In recent years, short text matching tasks have been widely applied in the +fields of advertising search and recommendation. The difficulty lies in the +lack of semantic information and word ambiguity caused by the short length of +the text. Previous works have introduced complement sentences or knowledge +bases to provide additional feature information. However, these methods do not +fully model the interaction between the original sentence and the complement +sentence, and do not consider the noise that may arise from introducing +external knowledge bases. Therefore, this paper proposes a short text matching +model that combines contrastive learning and external knowledge. The model uses +a generative model to generate corresponding complement sentences and uses the +contrastive learning method to guide the model to obtain more semantically +meaningful encodings of the original sentence. In addition, to avoid noise, we +use keywords as the main semantics of the original sentence to retrieve +corresponding knowledge words in the knowledge base, and construct a knowledge +graph. The graph encoding model is used to integrate the knowledge base +information into the model. Our designed model achieves state-of-the-art +performance on two publicly available Chinese text matching datasets, +demonstrating its effectiveness. +
+
+ comment: 11 pages,2 figures +
+
+
+
+
+ + ♻ ☆ Extraction and Summarization of Explicit Video Content using Multi-Modal + Deep Learning + + +
+ With the increase in video-sharing platforms across the internet, it is +difficult for humans to moderate the data for explicit content. Hence, an +automated pipeline to scan through video data for explicit content has become +the need of the hour. We propose a novel pipeline that uses multi-modal deep +learning to first extract the explicit segments of input videos and then +summarize their content using text to determine its age appropriateness and age +rating. We also evaluate our pipeline's effectiveness in the end using standard +metrics. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ DoReMi: Optimizing Data Mixtures Speeds Up Language Model Pretraining NeurIPS 2023 + + +
+ The mixture proportions of pretraining data domains (e.g., Wikipedia, books, +web text) greatly affect language model (LM) performance. In this paper, we +propose Domain Reweighting with Minimax Optimization (DoReMi), which first +trains a small proxy model using group distributionally robust optimization +(Group DRO) over domains to produce domain weights (mixture proportions) +without knowledge of downstream tasks. We then resample a dataset with these +domain weights and train a larger, full-sized model. In our experiments, we use +DoReMi on a 280M-parameter proxy model to set the domain weights for training +an 8B-parameter model (30x larger) more efficiently. On The Pile, DoReMi +improves perplexity across all domains, even when it downweights a domain. +DoReMi improves average few-shot downstream accuracy by 6.5% points over a +baseline model trained using The Pile's default domain weights and reaches the +baseline accuracy with 2.6x fewer training steps. On the GLaM dataset, DoReMi, +which has no knowledge of downstream tasks, even matches the performance of +using domain weights tuned on downstream tasks. + +
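A rough sketch of the Group-DRO-flavored reweighting at the heart of this approach: domains where the proxy model's loss most exceeds a reference model's loss get multiplicatively upweighted, then the weights are renormalized. This is a simplification; the paper's exact update (e.g., smoothing with a uniform prior) differs in detail.

```python
# Simplified DoReMi-style domain reweighting via a multiplicative update on
# per-domain excess loss (proxy minus reference). Losses below are hypothetical.
import numpy as np

def update_domain_weights(weights, proxy_loss, reference_loss, step_size=1.0):
    excess = np.maximum(proxy_loss - reference_loss, 0.0)   # per-domain excess loss
    new_w = weights * np.exp(step_size * excess)             # exponentiated-gradient step
    return new_w / new_w.sum()                                # renormalize to a distribution

domains = ["wikipedia", "books", "web"]
w = np.full(len(domains), 1.0 / len(domains))
proxy = np.array([2.1, 2.9, 3.4])        # hypothetical per-domain proxy losses
reference = np.array([2.0, 2.8, 3.0])    # hypothetical reference losses
for _ in range(100):
    w = update_domain_weights(w, proxy, reference, step_size=0.1)
print(dict(zip(domains, np.round(w, 3))))  # the highest-excess domain gets the largest weight
```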
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Pragmatics in Language Grounding: Phenomena, Tasks, and Modeling + Approaches EMNLP 2023 + + +
+ People rely heavily on context to enrich meaning beyond what is literally +said, enabling concise but effective communication. To interact successfully +and naturally with people, user-facing artificial intelligence systems will +require similar skills in pragmatics: relying on various types of context -- +from shared linguistic goals and conventions, to the visual and embodied world +-- to use language effectively. We survey existing grounded settings and +pragmatic modeling approaches and analyze how the task goals, environmental +contexts, and communicative affordances in each work enrich linguistic meaning. +We present recommendations for future grounded task design to naturally elicit +pragmatic phenomena, and suggest directions that focus on a broader range of +communicative contexts and affordances. + +
+
+ comment: Findings of EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Representation Projection Invariance Mitigates Representation Collapse + + +
+ Fine-tuning contextualized representations learned by pre-trained language +models remains a prevalent practice in NLP. However, fine-tuning can lead to +representation degradation (also known as representation collapse), which may +result in instability, sub-optimal performance, and weak generalization. + In this paper, we propose Representation Projection Invariance (REPINA), a +novel regularization method to maintain the information content of +representation and reduce representation collapse during fine-tuning by +discouraging undesirable changes in the representations. We study the empirical +behavior of the proposed regularization in comparison to 5 comparable baselines +across 13 language understanding tasks (GLUE benchmark and six additional +datasets). When evaluating in-domain performance, REPINA consistently +outperforms other baselines on most tasks (10 out of 13). We also demonstrate +its effectiveness in few-shot settings and robustness to label perturbation. As +a by-product, we extend previous studies of representation collapse and propose +several metrics to quantify it. Our empirical findings show that our approach +is significantly more effective at mitigating representation collapse. + +
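A hedged sketch of what such a regularizer can look like in code: alongside the task loss, penalize how far the fine-tuned encoder's representations drift, after a learnable projection, from the frozen pre-trained representations. The exact projection family and objective in REPINA may differ; this is only an illustrative form.

```python
# Illustrative representation-invariance regularizer: task loss plus a penalty
# on projected drift from the frozen pre-trained representations.
import torch
import torch.nn as nn

class InvarianceRegularizer(nn.Module):
    def __init__(self, hidden_dim: int, lam: float = 0.1):
        super().__init__()
        self.proj = nn.Linear(hidden_dim, hidden_dim)   # learnable projection head
        self.lam = lam

    def forward(self, h_finetuned: torch.Tensor, h_pretrained: torch.Tensor) -> torch.Tensor:
        # h_*: [batch, hidden_dim] sentence representations
        drift = self.proj(h_finetuned) - h_pretrained.detach()
        return self.lam * drift.pow(2).mean()

reg = InvarianceRegularizer(hidden_dim=768)
h_ft = torch.randn(8, 768, requires_grad=True)   # stand-in for fine-tuned encoder outputs
h_pt = torch.randn(8, 768)                       # stand-in for frozen pre-trained outputs
task_loss = torch.tensor(0.5)                    # placeholder task loss
total_loss = task_loss + reg(h_ft, h_pt)         # optimize this during fine-tuning
total_loss.backward()
```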
+
+ comment: 41 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Pre-training Language Models for Comparative Reasoning EMNLP 2023 + + +
+ Comparative reasoning is a process of comparing objects, concepts, or +entities to draw conclusions, which constitutes a fundamental cognitive +ability. In this paper, we propose a novel framework to pre-train language +models for enhancing their abilities of comparative reasoning over texts. While +there have been approaches for NLP tasks that require comparative reasoning, +they suffer from costly manual data labeling and limited generalizability to +different tasks. Our approach introduces a novel method of collecting scalable +data for text-based entity comparison, which leverages both structured and +unstructured data. Moreover, we present a framework of pre-training language +models via three novel objectives on comparative reasoning. Evaluation on +downstream tasks including comparative question answering, question generation, +and summarization shows that our pre-training framework significantly improves +the comparative reasoning abilities of language models, especially under +low-resource conditions. This work also releases the first integrated benchmark +for comparative reasoning. + +
+
+ comment: EMNLP 2023 - Camera Ready +
+
+
+
+
+ + ♻ ☆ Merging Experts into One: Improving Computational Efficiency of Mixture + of Experts EMNLP 2023 + + +
+ Scaling the size of language models usually leads to remarkable advancements +in NLP tasks, but it often comes at the price of growing computational cost. +Although a sparse Mixture of Experts (MoE) can reduce the cost by activating a +small subset of parameters (e.g., one expert) for each input, its computation +escalates significantly as the number of activated experts increases, limiting +its practical utility. Can we retain the advantages of adding more experts +without substantially increasing the computational costs? In this paper, we +first demonstrate the superiority of selecting multiple experts and then +propose a computation-efficient approach called Merging Experts into One (MEO), +which reduces the computation cost to that of a single expert. Extensive +experiments show that MEO significantly improves computational efficiency, +e.g., FLOPS drops from 72.0G of vanilla MoE to 28.6G (MEO). Moreover, we +propose a token-level attention block that further enhances the efficiency and +performance of token-level MEO, e.g., 83.3% (MEO) vs. 82.6% (vanilla MoE) +average score on the GLUE benchmark. Code will be released upon acceptance at: +\url{https://github.com/Shwai-He/MEO}. +
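The computational observation behind merging experts can be checked numerically: for linear experts with given gating weights, mixing the parameters first and applying a single merged expert gives exactly the same output as running every expert and mixing their outputs, at the cost of one matmul. For experts with internal nonlinearities the equivalence is no longer exact, which is part of what the full method has to handle; the check below is only the linear toy case.

```python
# Numerical check: gate-weighted sum of linear expert outputs equals one
# "merged" expert whose parameters are the gate-weighted sum of expert params.
import numpy as np

rng = np.random.default_rng(0)
d, n_experts = 16, 4
x = rng.normal(size=d)
W = rng.normal(size=(n_experts, d, d))            # one weight matrix per expert
b = rng.normal(size=(n_experts, d))
gate = rng.random(n_experts); gate /= gate.sum()  # gating weights over experts

# Vanilla MoE: run every selected expert, then mix the outputs.
out_moe = sum(g * (W[i] @ x + b[i]) for i, g in enumerate(gate))

# Merged expert: mix the parameters first, then run a single expert.
W_merged = np.tensordot(gate, W, axes=1)          # shape (d, d)
b_merged = gate @ b
out_merged = W_merged @ x + b_merged

print(np.allclose(out_moe, out_merged))           # True
```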
+
+ comment: EMNLP 2023 Main Conference (Oral) +
+
+
+
+
+ + ♻ ☆ Persian Typographical Error Type Detection Using Deep Neural Networks on + Algorithmically-Generated Misspellings + + +
+ Spelling correction is a notable challenge in the field of natural language +processing. The objective of spelling correction tasks is to recognize and +rectify spelling errors automatically. The development of applications that can +effectively diagnose and correct Persian spelling and grammatical errors has +become more important in order to improve the quality of Persian text. +Typographical error type detection in Persian is a relatively understudied +area. Therefore, this paper presents a compelling approach for detecting +typographical errors in Persian texts. Our work includes the presentation of a +publicly available dataset called FarsTypo, which comprises 3.4 million words +arranged in chronological order and tagged with their corresponding +part-of-speech. These words cover a wide range of topics and linguistic styles. +We develop an algorithm designed to apply Persian-specific errors to a scalable +portion of these words, resulting in a parallel dataset of correct and +incorrect words. By leveraging FarsTypo, we establish a strong foundation and +conduct a thorough comparison of various methodologies employing different +architectures. Additionally, we introduce a groundbreaking Deep Sequential +Neural Network that utilizes both word and character embeddings, along with +bidirectional LSTM layers, for token classification aimed at detecting +typographical errors across 51 distinct classes. Our approach is contrasted +with highly advanced industrial systems that, unlike this study, have been +developed using a diverse range of resources. The outcomes of our final method +proved to be highly competitive, achieving an accuracy of 97.62%, precision of +98.83%, recall of 98.61%, and surpassing others in terms of speed. +
+
+
+
+
+ + ♻ ☆ A Language Agent for Autonomous Driving + + +
+ Human-level driving is an ultimate goal of autonomous driving. Conventional +approaches formulate autonomous driving as a perception-prediction-planning +framework, yet their systems do not capitalize on the inherent reasoning +ability and experiential knowledge of humans. In this paper, we propose a +fundamental paradigm shift from current pipelines, exploiting Large Language +Models (LLMs) as a cognitive agent to integrate human-like intelligence into +autonomous driving systems. Our approach, termed Agent-Driver, transforms the +traditional autonomous driving pipeline by introducing a versatile tool library +accessible via function calls, a cognitive memory of common sense and +experiential knowledge for decision-making, and a reasoning engine capable of +chain-of-thought reasoning, task planning, motion planning, and +self-reflection. Powered by LLMs, our Agent-Driver is endowed with intuitive +common sense and robust reasoning capabilities, thus enabling a more nuanced, +human-like approach to autonomous driving. We evaluate our approach on the +large-scale nuScenes benchmark, and extensive experiments substantiate that our +Agent-Driver significantly outperforms the state-of-the-art driving methods by +a large margin. Our approach also demonstrates superior interpretability and +few-shot learning ability to these methods. Code will be released. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 145 + +
+
+
+ + ☆ Physics-guided Shape-from-Template: Monocular Video Perception through + Neural Surrogate Models + + +
+ 3D reconstruction of dynamic scenes is a long-standing problem in computer +graphics and becomes increasingly difficult the less information is available. +Shape-from-Template (SfT) methods aim to reconstruct a template-based geometry +from RGB images or video sequences, often leveraging just a single monocular +camera without depth information, such as regular smartphone recordings. +Unfortunately, existing reconstruction methods are either unphysical and noisy +or slow in optimization. To solve this problem, we propose a novel SfT +reconstruction algorithm for cloth using a pre-trained neural surrogate model +that is fast to evaluate, stable, and produces smooth reconstructions due to a +regularizing physics simulation. Differentiable rendering of the simulated mesh +enables pixel-wise comparisons between the reconstruction and a target video +sequence that can be used for a gradient-based optimization procedure to +extract not only shape information but also physical parameters such as +stretching, shearing, or bending stiffness of the cloth. This allows retaining +a precise, stable, and smooth reconstructed geometry while reducing the runtime +by a factor of 400-500 compared to $\phi$-SfT, a state-of-the-art physics-based +SfT approach. +
+
+
+
+
+ + ☆ ShareGPT4V: Improving Large Multi-Modal Models with Better Captions + + +
+ In the realm of large multi-modal models (LMMs), efficient modality alignment +is crucial yet often constrained by the scarcity of high-quality image-text +data. To address this bottleneck, we introduce the ShareGPT4V dataset, a +pioneering large-scale resource featuring 1.2 million highly descriptive +captions, which surpasses existing datasets in diversity and information +content, covering world knowledge, object properties, spatial relationships, +and aesthetic evaluations. Specifically, ShareGPT4V originates from a curated +100K high-quality captions collected from advanced GPT4-Vision and has been +expanded to 1.2M with a superb caption model trained on this subset. ShareGPT4V +first demonstrates its effectiveness for the Supervised Fine-Tuning (SFT) +phase, by substituting an equivalent quantity of detailed captions in existing +SFT datasets with a subset of our high-quality captions, significantly +enhancing the LMMs like LLaVA-7B, LLaVA-1.5-13B, and Qwen-VL-Chat-7B on the MME +and MMBench benchmarks, with respective gains of 222.8/22.0/22.3 and +2.7/1.3/1.5. We further incorporate ShareGPT4V data into both the pre-training +and SFT phases, obtaining ShareGPT4V-7B, a superior LMM based on a simple +architecture that has remarkable performance across a majority of the +multi-modal benchmarks. This project is available at +https://ShareGPT4V.github.io to serve as a pivotal resource for advancing the +LMMs community. + +
+
+
+
+
+ + ☆ Intrinsic Image Decomposition via Ordinal Shading + + +
+ Intrinsic decomposition is a fundamental mid-level vision problem that plays +a crucial role in various inverse rendering and computational photography +pipelines. Generating highly accurate intrinsic decompositions is an inherently +under-constrained task that requires precisely estimating continuous-valued +shading and albedo. In this work, we achieve high-resolution intrinsic +decomposition by breaking the problem into two parts. First, we present a dense +ordinal shading formulation using a shift- and scale-invariant loss in order to +estimate ordinal shading cues without restricting the predictions to obey the +intrinsic model. We then combine low- and high-resolution ordinal estimations +using a second network to generate a shading estimate with both global +coherency and local details. We encourage the model to learn an accurate +decomposition by computing losses on the estimated shading as well as the +albedo implied by the intrinsic model. We develop a straightforward method for +generating dense pseudo ground truth using our model's predictions and +multi-illumination data, enabling generalization to in-the-wild imagery. We +present an exhaustive qualitative and quantitative analysis of our predicted +intrinsic components against state-of-the-art methods. Finally, we demonstrate +the real-world applicability of our estimations by performing otherwise +difficult editing tasks such as recoloring and relighting. + +
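A shift- and scale-invariant loss of the kind mentioned above can be written compactly: align the prediction to the target with the least-squares optimal scale and shift, then measure the residual error, so only relative (ordinal-like) structure is penalized. This is the generic formulation, not necessarily the paper's exact loss.

```python
# Generic shift- and scale-invariant (SSI) loss: fit the best scale s and shift
# t to align the prediction to the target, then score the residual.
import numpy as np

def ssi_loss(pred: np.ndarray, target: np.ndarray) -> float:
    p, y = pred.ravel(), target.ravel()
    A = np.stack([p, np.ones_like(p)], axis=1)        # columns: prediction, constant
    (s, t), *_ = np.linalg.lstsq(A, y, rcond=None)    # best scale and shift
    return float(np.mean((s * p + t - y) ** 2))

rng = np.random.default_rng(0)
shading = rng.random((64, 64))
print(ssi_loss(3.0 * shading + 0.2, shading))   # ~0: differs only by scale and shift
print(ssi_loss(rng.random((64, 64)), shading))  # much larger: structure disagrees
```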
+
+ comment: 24 pages, 23 figures, Accepted to ACM Transactions on Graphics + (2023). Project page: https://yaksoy.github.io/intrinsic/ +
+
+
+
+
+ + ☆ SuGaR: Surface-Aligned Gaussian Splatting for Efficient 3D Mesh + Reconstruction and High-Quality Mesh Rendering + + +
+ We propose a method to allow precise and extremely fast mesh extraction from +3D Gaussian Splatting. Gaussian Splatting has recently become very popular as +it yields realistic rendering while being significantly faster to train than +NeRFs. It is, however, challenging to extract a mesh from the millions of tiny +3D gaussians as these gaussians tend to be unorganized after optimization and +no method has been proposed so far. Our first key contribution is a +regularization term that encourages the gaussians to align well with the +surface of the scene. We then introduce a method that exploits this alignment +to extract a mesh from the Gaussians using Poisson reconstruction, which is +fast, scalable, and preserves details, in contrast to the Marching Cubes +algorithm usually applied to extract meshes from Neural SDFs. Finally, we +introduce an optional refinement strategy that binds gaussians to the surface +of the mesh, and jointly optimizes these Gaussians and the mesh through +Gaussian splatting rendering. This enables easy editing, sculpting, rigging, +animating, compositing and relighting of the Gaussians using traditional +software by manipulating the mesh instead of the gaussians themselves. +Retrieving such an editable mesh for realistic rendering is done within minutes +with our method, compared to hours with the state-of-the-art methods on neural +SDFs, while providing a better rendering quality. +
+
+ comment: Project Webpage: https://imagine.enpc.fr/~guedona/sugar/ +
+
+
+
+
+ + ☆ Iris Presentation Attack: Assessing the Impact of Combining Vanadium + Dioxide Films with Artificial Eyes + + +
+ Iris recognition systems, operating in the near infrared spectrum (NIR), have +demonstrated vulnerability to presentation attacks, where an adversary uses +artifacts such as cosmetic contact lenses, artificial eyes or printed iris +images in order to circumvent the system. At the same time, a number of +effective presentation attack detection (PAD) methods have been developed. +These methods have demonstrated success in detecting artificial eyes (e.g., +fake Van Dyke eyes) as presentation attacks. In this work, we seek to alter the +optical characteristics of artificial eyes by affixing Vanadium Dioxide (VO2) +films on their surface in various spatial configurations. VO2 films can be used +to selectively transmit NIR light and can, therefore, be used to regulate the +amount of NIR light from the object that is captured by the iris sensor. We +study the impact of such images produced by the sensor on two state-of-the-art +iris PA detection methods. We observe that the addition of VO2 films on the +surface of artificial eyes can cause the PA detection methods to misclassify +them as bonafide eyes in some cases. This represents a vulnerability that must +be systematically analyzed and effectively addressed. + +
+
+
+
+
+ + ☆ Swift Parameter-free Attention Network for Efficient Super-Resolution + + +
+ Single Image Super-Resolution (SISR) is a crucial task in low-level computer +vision, aiming to reconstruct high-resolution images from low-resolution +counterparts. Conventional attention mechanisms have significantly improved +SISR performance but often result in complex network structures and large +number of parameters, leading to slow inference speed and large model size. To +address this issue, we propose the Swift Parameter-free Attention Network +(SPAN), a highly efficient SISR model that balances parameter count, inference +speed, and image quality. SPAN employs a novel parameter-free attention +mechanism, which leverages symmetric activation functions and residual +connections to enhance high-contribution information and suppress redundant +information. Our theoretical analysis demonstrates the effectiveness of this +design in achieving the attention mechanism's purpose. We evaluate SPAN on +multiple benchmarks, showing that it outperforms existing efficient +super-resolution models in terms of both image quality and inference speed, +achieving a significant quality-speed trade-off. This makes SPAN highly +suitable for real-world applications, particularly in resource-constrained +scenarios. Notably, our model attains the best PSNR of 27.09 dB, and the test +runtime of our team is reduced by 7.08ms in the NTIRE 2023 efficient +super-resolution challenge. Our code and models are made publicly available at +\url{https://github.com/hongyuanyu/SPAN}. + +
+
+
+
+
+ + ☆ Investigating Weight-Perturbed Deep Neural Networks With Application in + Iris Presentation Attack Detection + + +
+ Deep neural networks (DNNs) exhibit superior performance in various machine +learning tasks, e.g., image classification, speech recognition, biometric +recognition, object detection, etc. However, it is essential to analyze their +sensitivity to parameter perturbations before deploying them in real-world +applications. In this work, we assess the sensitivity of DNNs against +perturbations to their weight and bias parameters. The sensitivity analysis +involves three DNN architectures (VGG, ResNet, and DenseNet), three types of +parameter perturbations (Gaussian noise, weight zeroing, and weight scaling), +and two settings (entire network and layer-wise). We perform experiments in the +context of iris presentation attack detection and evaluate on two publicly +available datasets: LivDet-Iris-2017 and LivDet-Iris-2020. Based on the +sensitivity analysis, we propose improved models simply by perturbing +parameters of the network without undergoing training. We further combine these +perturbed models at the score-level and at the parameter-level to improve the +performance over the original model. The ensemble at the parameter-level shows +an average improvement of 43.58% on the LivDet-Iris-2017 dataset and 9.25% on +the LivDet-Iris-2020 dataset. The source code is available at +\href{https://github.com/redwankarimsony/WeightPerturbation-MSU}{https://github.com/redwankarimsony/WeightPerturbation-MSU}. + +
+
+
+
+
+ + ☆ High-resolution Image-based Malware Classification using Multiple + Instance Learning + + +
+ This paper proposes a novel method of classifying malware into families using +high-resolution greyscale images and multiple instance learning to overcome +adversarial binary enlargement. Current methods of visualisation-based malware +classification largely rely on lossy transformations of inputs such as resizing +to handle the large, variable-sized images. Through empirical analysis and +experimentation, it is shown that these approaches cause crucial information +loss that can be exploited. The proposed solution divides the images into +patches and uses embedding-based multiple instance learning with a +convolutional neural network and an attention aggregation function for +classification. The implementation is evaluated on the Microsoft Malware +Classification dataset and achieves accuracies of up to $96.6\%$ on +adversarially enlarged samples compared to the baseline of $22.8\%$. The Python +code is available online at https://github.com/timppeters/MIL-Malware-Images . + +
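Embedding-based multiple instance learning with attention aggregation, as referenced above, typically looks like the following: patch embeddings are pooled with learned attention weights into a single bag embedding before a family classifier. This follows a common attention-MIL formulation and is not necessarily the paper's exact architecture; the 9 output classes mirror the Microsoft Malware Classification families.

```python
# Common attention-MIL aggregation over patch embeddings from a CNN.
import torch
import torch.nn as nn

class AttentionMIL(nn.Module):
    def __init__(self, emb_dim: int = 256, attn_dim: int = 128, n_classes: int = 9):
        super().__init__()
        self.attn = nn.Sequential(nn.Linear(emb_dim, attn_dim), nn.Tanh(),
                                  nn.Linear(attn_dim, 1))
        self.classifier = nn.Linear(emb_dim, n_classes)

    def forward(self, patch_emb: torch.Tensor) -> torch.Tensor:
        # patch_emb: [n_patches, emb_dim] embeddings of image patches from one binary
        a = torch.softmax(self.attn(patch_emb), dim=0)   # [n_patches, 1] attention weights
        bag = (a * patch_emb).sum(dim=0)                  # weighted bag embedding
        return self.classifier(bag)                       # malware-family logits

model = AttentionMIL()
logits = model(torch.randn(37, 256))   # a "bag" of 37 patch embeddings
print(logits.shape)                    # torch.Size([9])
```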
+
+ comment: 14 pages, 13 figures, 2 tables +
+
+
+
+
+ + ☆ SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction + + +
+ 3D occupancy prediction is an important task for the robustness of +vision-centric autonomous driving, which aims to predict whether each point is +occupied in the surrounding 3D space. Existing methods usually require 3D +occupancy labels to produce meaningful results. However, it is very laborious +to annotate the occupancy status of each voxel. In this paper, we propose +SelfOcc to explore a self-supervised way to learn 3D occupancy using only video +sequences. We first transform the images into the 3D space (e.g., bird's eye +view) to obtain 3D representation of the scene. We directly impose constraints +on the 3D representations by treating them as signed distance fields. We can +then render 2D images of previous and future frames as self-supervision signals +to learn the 3D representations. We propose an MVS-embedded strategy to +directly optimize the SDF-induced weights with multiple depth proposals. Our +SelfOcc outperforms the previous best method SceneRF by 58.7% using a single +frame as input on SemanticKITTI and is the first self-supervised work that +produces reasonable 3D occupancy for surround cameras on Occ3D. SelfOcc +produces high-quality depth and achieves state-of-the-art results on novel +depth synthesis, monocular depth estimation, and surround-view depth estimation +on the SemanticKITTI, KITTI-2015, and nuScenes, respectively. Code: +https://github.com/huang-yh/SelfOcc. + +
+
+ comment: Code is available at: https://github.com/huang-yh/SelfOcc +
+
+
+
+
+ + ☆ Towards Natural Language-Guided Drones: GeoText-1652 Benchmark with + Spatially Relation Matching + + +
+ Drone navigation through natural language commands remains a significant +challenge due to the lack of publicly available multi-modal datasets and the +intricate demands of fine-grained visual-text alignment. In response to this +pressing need, we present a new human-computer interaction annotation benchmark +called GeoText-1652, meticulously curated through a robust Large Language Model +(LLM)-based data generation framework and the expertise of pre-trained vision +models. This new dataset seamlessly extends the existing image dataset, i.e., +University-1652, with spatial-aware text annotations, encompassing intricate +image-text-bounding box associations. Besides, we introduce a new optimization +objective to leverage fine-grained spatial associations, called blending +spatial matching, for region-level spatial relation matching. Extensive +experiments reveal that our approach maintains an exceptional recall rate under +varying description complexities. This underscores the promising potential of +our approach in elevating drone control and navigation through the seamless +integration of natural language commands in real-world scenarios. +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ Attacking Motion Planners Using Adversarial Perception Errors + + +
+ Autonomous driving (AD) systems are often built and tested in a modular +fashion, where the performance of different modules is measured using +task-specific metrics. These metrics should be chosen so as to capture the +downstream impact of each module and the performance of the system as a whole. +For example, high perception quality should enable prediction and planning to +be performed safely. Even though this is true in general, we show here that it +is possible to construct planner inputs that score very highly on various +perception quality metrics but still lead to planning failures. In an analogy +to adversarial attacks on image classifiers, we call such inputs +\textbf{adversarial perception errors} and show they can be systematically +constructed using a simple boundary-attack algorithm. We demonstrate the +effectiveness of this algorithm by finding attacks for two different black-box +planners in several urban and highway driving scenarios using the CARLA +simulator. Finally, we analyse the properties of these attacks and show that +they are isolated in the input space of the planner, and discuss their +implications for AD system deployment and testing. + +
+
+
+
+
+ + ☆ Cascade Learning Localises Discriminant Features in Visual Scene + Classification + + +
+ The lack of interpretability of deep convolutional neural networks (DCNNs) is +a well-known problem, particularly in the medical domain, as clinicians want +trustworthy automated decisions. One way to improve trust is to demonstrate the +localisation of feature representations with respect to expert-labeled regions +of interest. In this work, we investigate the localisation of features learned +via two varied learning paradigms and demonstrate the superiority of one +learning approach with respect to localisation. Our analysis on medical and +natural datasets shows that the traditional end-to-end (E2E) learning strategy +has a limited ability to localise discriminative features across multiple +network layers. We show that a layer-wise learning strategy, namely cascade +learning (CL), results in more localised features. Considering localisation +accuracy, we not only show that CL outperforms E2E but also that it is a +promising method of predicting regions. On the YOLO object detection framework, +our best result shows that CL outperforms the E2E scheme by $2\%$ in mAP. +
+
+
+
+
+ + ☆ Transferring to Real-World Layouts: A Depth-aware Framework for Scene + Adaptation + + +
+ Scene segmentation via unsupervised domain adaptation (UDA) enables the +transfer of knowledge acquired from source synthetic data to real-world target +data, which largely reduces the need for manual pixel-level annotations in the +target domain. To facilitate domain-invariant feature learning, existing +methods typically mix data from both the source domain and target domain by +simply copying and pasting the pixels. Such vanilla methods are usually +sub-optimal since they do not take into account how well the mixed layouts +correspond to real-world scenarios. Real-world scenarios have an inherent +layout. We observe that semantic categories, such as sidewalks, buildings, and +sky, display relatively consistent depth distributions, and could be clearly +distinguished in a depth map. Based on this observation, we propose a +depth-aware framework to explicitly leverage depth estimation to mix the +categories and facilitate the two complementary tasks, i.e., segmentation and +depth learning, in an end-to-end manner. In particular, the framework contains +a Depth-guided Contextual Filter (DCF) for data augmentation and a cross-task +encoder for contextual learning. DCF simulates the real-world layouts, while +the cross-task encoder further adaptively fuses the complementary features +between the two tasks. Besides, it is worth noting that several public datasets +do not provide depth annotation. Therefore, we leverage an off-the-shelf depth +estimation network to generate pseudo depth. Extensive experiments show that +our proposed method, even with pseudo depth, achieves competitive performance +on two widely-used benchmarks, i.e., 77.7 mIoU on GTA to Cityscapes and 69.3 +mIoU on Synthia to Cityscapes. +
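A toy version of depth-aware cross-domain mixing, under stated assumptions: paste source-domain pixels of selected classes onto the target image only where the (pseudo) depth is consistent with the depth band that class typically occupies. The real DCF is more involved; the class indices and depth ranges below are made up for illustration.

```python
# Toy depth-aware class mixing: paste source pixels of chosen classes onto the
# target image only where the source depth lies in a plausible per-class band.
import numpy as np

def depth_aware_mix(src_img, src_lbl, src_depth, tgt_img, classes, depth_ranges):
    mixed = tgt_img.copy()
    for c in classes:
        lo, hi = depth_ranges[c]                              # plausible depth band
        mask = (src_lbl == c) & (src_depth >= lo) & (src_depth <= hi)
        mixed[mask] = src_img[mask]                           # paste depth-consistent pixels
    return mixed

rng = np.random.default_rng(0)
H, W = 8, 8
src_img, tgt_img = rng.random((H, W, 3)), rng.random((H, W, 3))
src_lbl = rng.integers(0, 3, size=(H, W))                     # e.g., 0=road, 1=building, 2=sky
src_depth = rng.random((H, W)) * 80.0
mixed = depth_aware_mix(src_img, src_lbl, src_depth, tgt_img,
                        classes=[1, 2], depth_ranges={1: (5.0, 60.0), 2: (60.0, 80.0)})
print(mixed.shape)
```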
+
+
+
+
+ + ☆ BundleMoCap: Efficient, Robust and Smooth Motion Capture from Sparse + Multiview Videos + + +
+ Capturing smooth motions from videos using markerless techniques typically +involves complex processes such as temporal constraints, multiple stages with +data-driven regression and optimization, and bundle solving over temporal +windows. These processes can be inefficient and require tuning multiple +objectives across stages. In contrast, BundleMoCap introduces a novel and +efficient approach to this problem. It solves the motion capture task in a +single stage, eliminating the need for temporal smoothness objectives while +still delivering smooth motions. BundleMoCap outperforms the state-of-the-art +without increasing complexity. The key concept behind BundleMoCap is manifold +interpolation between latent keyframes. By relying on a local manifold +smoothness assumption, we can efficiently solve a bundle of frames using a +single code. Additionally, the method can be implemented as a sliding window +optimization and requires only the first frame to be properly initialized, +reducing the overall computational burden. BundleMoCap's strength lies in its +ability to achieve high-quality motion capture results with simplicity and +efficiency. More details can be found at https://moverseai.github.io/bundle/. + +
+
+ comment: Published in European Conference on Visual Media Production (CVMP + '23) +
+
+
+
+
+ + ☆ Similar Document Template Matching Algorithm + + +
+ This study outlines a comprehensive methodology for verifying medical +documents, integrating advanced techniques in template extraction, comparison, +and fraud detection. It begins with template extraction using sophisticated +region-of-interest (ROI) methods, incorporating contour analysis and edge +identification. Pre-processing steps ensure template clarity through +morphological operations and adaptive thresholding. The template comparison +algorithm utilizes advanced feature matching with keypoints and descriptors, +enhancing robustness through histogram-based analysis to account for +variations. Fraud detection involves structural similarity index (SSIM) +computation and OCR for textual information extraction. SSIM quantifies +structural similarity, aiding in potential match identification. OCR focuses on +critical areas like patient details, provider information, and billing amounts. +Extracted information is compared with a reference dataset, and confidence +thresholding ensures reliable fraud detection. Adaptive parameters enhance +system flexibility for dynamic adjustments to varying document layouts. This +methodology provides a robust approach to medical document verification, +addressing complexities in template extraction, comparison, fraud detection, +and adaptability to diverse document structures. +
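One plausible minimal realization of the comparison stage is sketched below, combining ORB keypoint matching with an SSIM score and a confidence threshold. File names and thresholds are placeholders, and the OCR step (e.g., pytesseract over the billing region) is omitted; this is not the study's actual pipeline.

```python
# Minimal template-vs-scan comparison: ORB keypoint matches for layout
# agreement plus SSIM for structural similarity, with a simple threshold.
import cv2
from skimage.metrics import structural_similarity

def compare_to_template(doc_path: str, template_path: str, ssim_threshold: float = 0.75):
    doc = cv2.imread(doc_path, cv2.IMREAD_GRAYSCALE)
    tpl = cv2.imread(template_path, cv2.IMREAD_GRAYSCALE)
    doc = cv2.resize(doc, (tpl.shape[1], tpl.shape[0]))       # align sizes for SSIM

    orb = cv2.ORB_create(nfeatures=1000)                      # keypoints + descriptors
    _, d1 = orb.detectAndCompute(doc, None)
    _, d2 = orb.detectAndCompute(tpl, None)
    matcher = cv2.BFMatcher(cv2.NORM_HAMMING, crossCheck=True)
    matches = matcher.match(d1, d2) if d1 is not None and d2 is not None else []

    score = structural_similarity(doc, tpl)                   # global structural score
    return {"ssim": score, "n_matches": len(matches), "suspect": score < ssim_threshold}

# "claim_scan.png" and "provider_template.png" are placeholder file names.
print(compare_to_template("claim_scan.png", "provider_template.png"))
```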
+
+ comment: 8 pages,8 figures +
+
+
+
+
+ + ☆ Visually Guided Object Grasping + + +
+ In this paper we present a visual servoing approach to the problem of object +grasping and more generally, to the problem of aligning an end-effector with an +object. First we extend the method proposed by Espiau et al. [1] to the case of +a camera which is not mounted onto the robot being controlled and we stress the +importance of the real-time estimation of the image Jacobian. Second, we show +how to represent a grasp or more generally, an alignment between two solids in +3-D projective space using an uncalibrated stereo rig. Such a 3-D projective +representation is view-invariant in the sense that it can be easily mapped into +an image set-point without any knowledge about the camera parameters. Third, we +perform an analysis of the performances of the visual servoing algorithm and of +the grasping precision that can be expected from this type of approach. + +
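For reference, the textbook image-based visual servoing ingredients the paper builds on are the interaction matrix (image Jacobian) of a normalized point feature and the control law v = -lambda * L^+ (s - s*). The sketch below assumes the depth Z of each feature is known; estimating the Jacobian in real time, as stressed in the paper, is the harder part.

```python
# Classical image-based visual servoing: interaction matrix of a normalized
# point feature and the resulting camera twist command.
import numpy as np

def interaction_matrix(x: float, y: float, Z: float) -> np.ndarray:
    """Jacobian relating camera twist (vx, vy, vz, wx, wy, wz) to feature motion."""
    return np.array([
        [-1 / Z, 0.0, x / Z, x * y, -(1 + x ** 2), y],
        [0.0, -1 / Z, y / Z, 1 + y ** 2, -x * y, -x],
    ])

def servo_velocity(features, desired, depths, gain=0.5):
    L = np.vstack([interaction_matrix(x, y, Z) for (x, y), Z in zip(features, depths)])
    error = (np.asarray(features) - np.asarray(desired)).ravel()
    return -gain * np.linalg.pinv(L) @ error       # commanded camera twist

v = servo_velocity(features=[(0.1, 0.05), (-0.2, 0.1)],
                   desired=[(0.0, 0.0), (-0.25, 0.12)],
                   depths=[1.2, 1.4])
print(v)      # 6-vector: translational + rotational velocity command
```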
+
+
+
+
+ + ☆ Hand-Eye Calibration + + +
+ Whenever a sensor is mounted on a robot hand, it is important to know the +relationship between the sensor and the hand. The problem of determining this +relationship is referred to as hand-eye calibration, which is important in at +least two types of tasks: (i) mapping sensor-centered measurements into the +robot workspace and (ii) allowing the robot to precisely move the sensor. In +the past, some solutions were proposed in the particular case of a camera. With +almost no exception, all existing solutions attempt to solve the homogeneous +matrix equation AX=XB. First, we show that there are two possible formulations +of the hand-eye calibration problem. One formulation is the classical one that +we just mentioned. A second formulation takes the form of the following +homogeneous matrix equation: MY=M'YB. The advantage of the latter is that the +extrinsic and intrinsic camera parameters need not be made explicit. Indeed, +this formulation directly uses the 3 by 4 perspective matrices (M and M') +associated with two positions of the camera. Moreover, this formulation, +together with the classical one, covers a wider range of camera-based sensors +to be calibrated with respect to the robot hand. Second, we develop a common +mathematical framework to solve for the hand-eye calibration problem using +either of the two formulations. We present two methods: (i) a +rotation-then-translation method and (ii) a non-linear solver for rotation and +translation. Third, we perform a stability analysis both for our two methods +and for the classical linear method developed previously. In the light of this +comparison, the non-linear optimization method, which solves for rotation and +translation simultaneously, seems to be the most robust one with respect to +noise and to measurement errors. +
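The classical AX = XB formulation splits into a rotation constraint and, once the rotation is known, a linear translation constraint: R_A R_X = R_X R_B and (R_A - I) t_X = R_X t_B - t_A. The sketch below stacks the translation equations from several motion pairs and solves them by least squares, with a synthetic self-check; rotation estimation is assumed to be done separately.

```python
# Hand-eye translation from AX = XB once the rotation R_X is known:
# (R_A - I) t_X = R_X t_B - t_A, stacked over motion pairs and solved by lstsq.
import numpy as np

def solve_hand_eye_translation(motions, R_X):
    """motions: list of ((R_A, t_A), (R_B, t_B)) pairs; returns t_X."""
    rows, rhs = [], []
    for (R_A, t_A), (R_B, t_B) in motions:
        rows.append(R_A - np.eye(3))
        rhs.append(R_X @ t_B - t_A)
    t_X, *_ = np.linalg.lstsq(np.vstack(rows), np.concatenate(rhs), rcond=None)
    return t_X

# Synthetic self-check with a known hand-eye transform.
rng = np.random.default_rng(0)
def random_rot():
    q = rng.normal(size=4); q /= np.linalg.norm(q)
    w, x, y, z = q
    return np.array([[1 - 2*(y*y + z*z), 2*(x*y - w*z), 2*(x*z + w*y)],
                     [2*(x*y + w*z), 1 - 2*(x*x + z*z), 2*(y*z - w*x)],
                     [2*(x*z - w*y), 2*(y*z + w*x), 1 - 2*(x*x + y*y)]])

R_X_true, t_X_true = random_rot(), rng.normal(size=3)
motions = []
for _ in range(5):
    R_B, t_B = random_rot(), rng.normal(size=3)
    R_A = R_X_true @ R_B @ R_X_true.T                 # from R_A R_X = R_X R_B
    t_A = R_X_true @ t_B + t_X_true - R_A @ t_X_true  # from the translation part of AX = XB
    motions.append(((R_A, t_A), (R_B, t_B)))

print(np.allclose(solve_hand_eye_translation(motions, R_X_true), t_X_true))  # True
```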
+
+
+
+
+ + ☆ Mobile-Seed: Joint Semantic Segmentation and Boundary Detection for + Mobile Robots + + +
+ Precise and rapid delineation of sharp boundaries and robust semantics is +essential for numerous downstream robotic tasks, such as robot grasping and +manipulation, real-time semantic mapping, and online sensor calibration +performed on edge computing units. Although boundary detection and semantic +segmentation are complementary tasks, most studies focus on lightweight models +for semantic segmentation but overlook the critical role of boundary detection. +In this work, we introduce Mobile-Seed, a lightweight, dual-task framework +tailored for simultaneous semantic segmentation and boundary detection. Our +framework features a two-stream encoder, an active fusion decoder (AFD) and a +dual-task regularization approach. The encoder is divided into two pathways: +one captures category-aware semantic information, while the other discerns +boundaries from multi-scale features. The AFD module dynamically adapts the +fusion of semantic and boundary information by learning channel-wise +relationships, allowing for precise weight assignment of each channel. +Furthermore, we introduce a regularization loss to mitigate the conflicts in +dual-task learning and deep diversity supervision. Compared to existing +methods, the proposed Mobile-Seed offers a lightweight framework to +simultaneously improve semantic segmentation performance and accurately locate +object boundaries. Experiments on the Cityscapes dataset have shown that +Mobile-Seed achieves notable improvement over the state-of-the-art (SOTA) +baseline by 2.2 percentage points (pp) in mIoU and 4.2 pp in mF-score, while +maintaining an online inference speed of 23.9 frames-per-second (FPS) with +1024x2048 resolution input on an RTX 2080 Ti GPU. Additional experiments on +CamVid and PASCAL Context datasets confirm our method's generalizability. Code +and additional results are publicly available at +\url{https://martin-liao.github.io/Mobile-Seed/}. + +
+
+ comment: 8 pages, IEEE conference/letter underreview. Code and additional + results are available at: \url{https://martin-liao.github.io/Mobile-Seed/} +
+
+
+
+
+ + ☆ Polyhedral Object Recognition by Indexing + + +
+ In computer vision, the indexing problem is the problem of recognizing a few +objects in a large database of objects while avoiding the help of the classical +image-feature-to-object-feature matching paradigm. In this paper, we address +the problem of recognizing 3-D polyhedral objects from 2-D images by indexing. +Both the objects to be recognized and the images are represented by weighted +graphs. The indexing problem is therefore the problem of determining whether a +graph extracted from the image is present or absent in a database of model +graphs. We introduce a novel method for performing this graph indexing process +which is based both on polynomial characterization of binary and weighted +graphs and on hashing. We describe in detail this polynomial characterization +and then we show how it can be used in the context of polyhedral object +recognition. Next, we describe a practical recognition-by-indexing system that +includes the organization of the database, the representation of polyhedral +objects in terms of 2-D characteristic views, the representation of these views +in terms of weighted graphs, and the associated image processing. Finally, some +experimental results allow the evaluation of the system performance. +
+
+
+
+
+ + ☆ KNVQA: A Benchmark for evaluation knowledge-based VQA + + +
+ Within the multimodal field, large vision-language models (LVLMs) have made +significant progress due to their strong perception and reasoning capabilities +in the visual and language systems. However, LVLMs are still plagued by the two +critical issues of object hallucination and factual accuracy, which limit the +practicality of LVLMs in different scenarios. Furthermore, previous evaluation +methods focus more on the comprehension and reasoning of language content but +lack a comprehensive evaluation of multimodal interactions, thereby resulting +in potential limitations. To this end, we propose a novel KNVQA-Eval, which is +devoted to knowledge-based VQA task evaluation to reflect the factuality of +multimodal LVLMs. To ensure the robustness and scalability of the evaluation, +we develop a new KNVQA dataset by incorporating human judgment and perception, +aiming to evaluate the accuracy of standard answers relative to AI-generated +answers in knowledge-based VQA. This work not only comprehensively evaluates +the contextual information of LVLMs using reliable human annotations, but also +further analyzes the fine-grained capabilities of current methods to reveal +potential avenues for subsequent optimization of LVLMs-based estimators. Our +proposed KNVQA-Eval and corresponding dataset KNVQA will facilitate the +development of automatic evaluation tools with the advantages of low cost, +privacy protection, and reproducibility. Our code will be released upon +publication. +
+
+
+
+
+ + ☆ GPT4Motion: Scripting Physical Motions in Text-to-Video Generation via + Blender-Oriented GPT Planning + + +
+ Recent advances in text-to-video generation have harnessed the power of +diffusion models to create visually compelling content conditioned on text +prompts. However, they usually encounter high computational costs and often +struggle to produce videos with coherent physical motions. To tackle these +issues, we propose GPT4Motion, a training-free framework that leverages the +planning capability of large language models such as GPT, the physical +simulation strength of Blender, and the excellent image generation ability of +text-to-image diffusion models to enhance the quality of video synthesis. +Specifically, GPT4Motion employs GPT-4 to generate a Blender script based on a +user textual prompt, which commands Blender's built-in physics engine to craft +fundamental scene components that encapsulate coherent physical motions across +frames. These components are then fed into Stable Diffusion to generate a video +aligned with the textual prompt. Experimental results on three basic physical +motion scenarios, including rigid object drop and collision, cloth draping and +swinging, and liquid flow, demonstrate that GPT4Motion can efficiently generate +high-quality videos while maintaining motion coherency and entity consistency. +GPT4Motion offers new insights into text-to-video research, enhancing its +quality and broadening its horizon for future explorations. +
+
+
+
+
+ + ☆ Bridging Generalization Gaps in High Content Imaging Through Online + Self-Supervised Domain Adaptation WACV + 2024 + + +
+ High Content Imaging (HCI) plays a vital role in modern drug discovery and +development pipelines, facilitating various stages from hit identification to +candidate drug characterization. Applying machine learning models to these +datasets can prove challenging as they typically consist of multiple batches, +affected by experimental variation, especially if different imaging equipment +has been used. Moreover, as new data arrive, it is preferable that they are +analyzed in an online fashion. To overcome this, we propose CODA, an online +self-supervised domain adaptation approach. CODA divides the classifier's role +into a generic feature extractor and a task-specific model. We adapt the +feature extractor's weights to the new domain using cross-batch +self-supervision while keeping the task-specific model unchanged. Our results +demonstrate that this strategy significantly reduces the generalization gap, +achieving up to a 300% improvement when applied to data from different labs +utilizing different microscopes. CODA can be applied to new, unlabeled +out-of-domain data sources of different sizes, from a single plate to multiple +experimental batches. + +&#x0D;
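As a rough illustration of the split described above, the sketch below (PyTorch assumed; the module names and the view-consistency objective are placeholders rather than CODA's actual cross-batch loss) adapts only a generic feature extractor on unlabeled target batches while the task-specific head stays frozen.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Hypothetical split of a classifier into a generic feature extractor
# and a task-specific head; only the extractor is adapted online.
feature_extractor = nn.Sequential(
    nn.Conv2d(3, 32, 3, padding=1), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(),
)
task_head = nn.Linear(32, 10)
for p in task_head.parameters():          # keep the task-specific model unchanged
    p.requires_grad = False

optimizer = torch.optim.Adam(feature_extractor.parameters(), lr=1e-4)

def self_supervised_step(unlabeled_batch, augment):
    """One online adaptation step on an unlabeled target-domain batch."""
    z1 = feature_extractor(augment(unlabeled_batch))
    z2 = feature_extractor(augment(unlabeled_batch))
    # Consistency between two augmented views stands in for the
    # cross-batch self-supervision used in the paper.
    loss = F.mse_loss(F.normalize(z1, dim=1), F.normalize(z2, dim=1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

def predict(x):
    """Inference still uses the frozen head on top of the adapted extractor."""
    with torch.no_grad():
        return task_head(feature_extractor(x)).argmax(dim=1)

batch = torch.rand(8, 3, 64, 64)                  # unlabeled target-domain images
noise = lambda x: x + 0.05 * torch.randn_like(x)  # stand-in augmentation
print(self_supervised_step(batch, noise))
```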
+
+ comment: IEEE/CVF Winter Conference on Applications of Computer Vision (WACV + 2024) +
+
+
+
+
+ + ☆ Crowd management, crime detection, and work monitoring using AI/ML + + +&#x0D;
+ This research endeavors to harness the potential of existing Closed-Circuit +Television (CCTV) networks for a comprehensive approach to crowd management, +crime prevention, and workplace monitoring through the integration of +Artificial Intelligence (AI) and Machine Learning (ML) technologies. The +primary objective is to develop and implement advanced algorithms capable of +real-time analysis of video feeds, enabling the identification and assessment +of crowd dynamics, early detection of potential criminal activities, and +continuous monitoring of workplace environments. By leveraging AI/ML, the +project aims to optimize surveillance capabilities, thereby enhancing public +safety measures and improving organizational productivity. This initiative +underscores the transformative impact that intelligent video analytics can have +on existing infrastructure, mitigating the need for extensive system overhauls +while significantly advancing security and operational efficiency. + +
+
+
+
+
+ + ☆ Leveraging Unlabeled Data for 3D Medical Image Segmentation through + Self-Supervised Contrastive Learning + + +
+ Current 3D semi-supervised segmentation methods face significant challenges +such as limited consideration of contextual information and the inability to +generate reliable pseudo-labels for effective unsupervised data use. To address +these challenges, we introduce two distinct subnetworks designed to explore and +exploit the discrepancies between them, ultimately correcting the erroneous +prediction results. More specifically, we identify regions of inconsistent +predictions and initiate a targeted verification training process. This +procedure strategically fine-tunes and harmonizes the predictions of the +subnetworks, leading to enhanced utilization of contextual information. +Furthermore, to adaptively fine-tune the network's representational capacity +and reduce prediction uncertainty, we employ a self-supervised contrastive +learning paradigm. For this, we use the network's confidence to distinguish +between reliable and unreliable predictions. The model is then trained to +effectively minimize unreliable predictions. Our experimental results for organ +segmentation, obtained from clinical MRI and CT scans, demonstrate the +effectiveness of our approach when compared to state-of-the-art methods. The +codebase is accessible on +\href{https://github.com/xmindflow/SSL-contrastive}{GitHub}. + +
+
+
+
+
+ + ☆ ChessVision -- A Dataset for Logically Coherent Multi-label + Classification + + +
+ Starting with early successes in computer vision tasks, deep learning based +techniques have since overtaken state of the art approaches in a multitude of +domains. However, it has been demonstrated time and again that these techniques +fail to capture semantic context and logical constraints, instead often relying +on spurious correlations to arrive at the answer. Since the application of deep +learning techniques to critical scenarios is dependent on adherence to domain +specific constraints, several attempts have been made to address this issue. +One limitation holding back a thorough exploration of this area is a lack of +suitable datasets which feature a rich set of rules. In order to address this, +we present the ChessVision Dataset, consisting of 200,000+ images of annotated +chess games in progress, requiring recreation of the game state from its +corresponding image. This is accompanied by a curated set of rules which +constrains the set of predictions to &#34;reasonable&#34; game states, and is designed +to probe key semantic abilities like localization and enumeration. Alongside +standard metrics, additional metrics that measure performance with regard to +logical consistency are presented. We analyze several popular and state of the +art vision models on this task, and show that, although their performance on +standard metrics is laudable, they produce a plethora of incoherent results, +indicating that this dataset presents a significant challenge for future works. + +&#x0D;
+
+
+
+
+ + ☆ Adaptive Dense Pseudo Label Selection for Semi-supervised Oriented + Object Detection + + +
+ Recently, dense pseudo-label, which directly selects pseudo labels from the +original output of the teacher model without any complicated post-processing +steps, has received considerable attention in semi-supervised object detection +(SSOD). However, for the multi-oriented and dense objects that are common in +aerial scenes, existing dense pseudo-label selection methods are inefficient +and impede the performance in semi-supervised oriented object detection. +Therefore, we propose Adaptive Dense Pseudo Label Selection (ADPLS) for +semi-supervised oriented object detection. In ADPLS, we design a simple but +effective adaptive mechanism to guide the selection of dense pseudo labels. +Specifically, we propose the mean Feature-Richness Score (mFRS) to estimate the +density of potential objects and use this score to adjust the number of dense +pseudo labels. On the DOTA-v1.5 benchmark, the proposed method outperforms +previous methods especially when labeled data are scarce. For example, it +achieves 49.78 mAP given only 5% of annotated data, which surpasses previous +state-of-the-art method given 10% of annotated data by 1.15 mAP. Our codes will +be available soon. + +
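The adaptive selection idea can be sketched as follows; the feature-richness proxy (mean activation norm) and the scaling rule below are assumptions for illustration only, not the paper's exact mFRS definition.

```python
import torch

def adaptive_dense_pseudo_labels(teacher_scores, feature_map, base_keep=300):
    """Select top-k dense pseudo labels, with k modulated by a
    mean feature-richness score (mFRS)-style statistic.

    teacher_scores: (N,) confidence of each dense teacher prediction
    feature_map:    (C, H, W) backbone features of the image
    """
    # Assumed richness proxy: per-location channel-wise activation norm.
    richness = feature_map.norm(dim=0)                 # (H, W)
    mfrs = richness.mean() / (richness.max() + 1e-6)   # in (0, 1]

    # Denser scenes (higher mFRS) keep more pseudo labels.
    k = int(base_keep * float(mfrs) * 2)
    k = max(1, min(k, teacher_scores.numel()))
    return teacher_scores.topk(k).indices

scores = torch.rand(1000)          # toy dense confidences
feats = torch.rand(256, 64, 64)    # toy feature map
kept = adaptive_dense_pseudo_labels(scores, feats)
print(kept.shape)
```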
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ☆ Surgical Temporal Action-aware Network with Sequence Regularization for + Phase Recognition + + +
+ To assist surgeons in the operating theatre, surgical phase recognition is +critical for developing computer-assisted surgical systems, which requires +comprehensive understanding of surgical videos. Although existing studies made +great progress, there are still two significant limitations worthy of +improvement. First, due to the compromise of resource consumption, frame-wise +visual features are extracted by 2D networks and disregard spatial and temporal +knowledge of surgical actions, which hinders subsequent inter-frame modeling +for phase prediction. Second, these works simply utilize ordinary +classification loss with one-hot phase labels to optimize the phase +predictions, and cannot fully explore surgical videos under inadequate +supervision. To overcome these two limitations, we propose a Surgical Temporal +Action-aware Network with sequence Regularization, named STAR-Net, to recognize +surgical phases more accurately from input videos. Specifically, we propose an +efficient multi-scale surgical temporal action (MS-STA) module, which +integrates visual features with spatial and temporal knowledge of surgical +actions at the cost of 2D networks. Moreover, we devise the dual-classifier +sequence regularization (DSR) to facilitate the training of STAR-Net by the +sequence guidance of an auxiliary classifier with a smaller capacity. Our +STAR-Net with MS-STA and DSR can exploit visual features of surgical actions +with effective regularization, thereby leading to the superior performance of +surgical phase recognition. Extensive experiments on a large-scale gastrectomy +surgery dataset and the public Cholec80 benchmark prove that our STAR-Net +significantly outperforms state-of-the-arts of surgical phase recognition. + +
+
+ comment: Accepted by 2023 IEEE International Conference on Bioinformatics and + Biomedicine (BIBM 2023) +
+
+
+
+
+ + ☆ TouchSDF: A DeepSDF Approach for 3D Shape Reconstruction using + Vision-Based Tactile Sensing + + +
+ Humans rely on their visual and tactile senses to develop a comprehensive 3D +understanding of their physical environment. Recently, there has been a growing +interest in exploring and manipulating objects using data-driven approaches +that utilise high-resolution vision-based tactile sensors. However, 3D shape +reconstruction using tactile sensing has lagged behind visual shape +reconstruction because of limitations in existing techniques, including the +inability to generalise over unseen shapes, the absence of real-world testing, +and limited expressive capacity imposed by discrete representations. To address +these challenges, we propose TouchSDF, a Deep Learning approach for tactile 3D +shape reconstruction that leverages the rich information provided by a +vision-based tactile sensor and the expressivity of the implicit neural +representation DeepSDF. Our technique consists of two components: (1) a +Convolutional Neural Network that maps tactile images into local meshes +representing the surface at the touch location, and (2) an implicit neural +function that predicts a signed distance function to extract the desired 3D +shape. This combination allows TouchSDF to reconstruct smooth and continuous 3D +shapes from tactile inputs in simulation and real-world settings, opening up +research avenues for robust 3D-aware representations and improved multimodal +perception in robotics. Code and supplementary material are available at: +https://touchsdf.github.io/ + +
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ Deep learning-based detection of morphological features associated with + hypoxia in H&E breast cancer whole slide images + + +
+ Hypoxia occurs when tumour cells outgrow their blood supply, leading to +regions of low oxygen levels within the tumour. Calculating hypoxia levels can +be an important step in understanding the biology of tumours, their clinical +progression and response to treatment. This study demonstrates a novel +application of deep learning to evaluate hypoxia in the context of breast +cancer histomorphology. More precisely, we show that Weakly Supervised Deep +Learning (WSDL) models can accurately detect hypoxia associated features in +routine Hematoxylin and Eosin (H&E) whole slide images (WSI). We trained and +evaluated a deep Multiple Instance Learning model on tiles from WSI H&E tissue +from breast cancer primary sites (n=240) obtaining on average an AUC of 0.87 on +a left-out test set. We also showed significant differences between features of +hypoxic and normoxic tissue regions as distinguished by the WSDL models. Such +DL hypoxia H&E WSI detection models could potentially be extended to other +tumour types and easily integrated into the pathology workflow without +requiring additional costly assays. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Improving Source-Free Target Adaptation with Vision Transformers + Leveraging Domain Representation Images + + +
+ Unsupervised Domain Adaptation (UDA) methods facilitate knowledge transfer +from a labeled source domain to an unlabeled target domain, navigating the +obstacle of domain shift. While Convolutional Neural Networks (CNNs) are a +staple in UDA, the rise of Vision Transformers (ViTs) provides new avenues for +domain generalization. This paper presents an innovative method to bolster ViT +performance in source-free target adaptation, beginning with an evaluation of +how key, query, and value elements affect ViT outcomes. Experiments indicate +that altering the key component has negligible effects on Transformer +performance. Leveraging this discovery, we introduce Domain Representation +Images (DRIs), feeding embeddings through the key element. DRIs act as +domain-specific markers, effortlessly merging with the training regimen. To +assess our method, we perform target adaptation tests on the Cross Instance DRI +source-only (SO) control. We measure the efficacy of target adaptation with and +without DRIs, against existing benchmarks like SHOT-B* and adaptations via +CDTrans. Findings demonstrate that excluding DRIs offers limited gains over +SHOT-B*, while their inclusion in the key segment boosts average precision +promoting superior domain generalization. This research underscores the vital +role of DRIs in enhancing ViT efficiency in UDA scenarios, setting a precedent +for further domain adaptation explorations. + +
+
+
+
+
+ + ☆ HiPose: Hierarchical Binary Surface Encoding and Correspondence Pruning + for RGB-D 6DoF Object Pose Estimation + + +
+ In this work, we present a novel dense-correspondence method for 6DoF object +pose estimation from a single RGB-D image. While many existing data-driven +methods achieve impressive performance, they tend to be time-consuming due to +their reliance on rendering-based refinement approaches. To circumvent this +limitation, we present HiPose, which establishes 3D-3D correspondences in a +coarse-to-fine manner with a hierarchical binary surface encoding. Unlike +previous dense-correspondence methods, we estimate the correspondence surface +by employing point-to-surface matching and iteratively constricting the surface +until it becomes a correspondence point while gradually removing outliers. +Extensive experiments on public benchmarks LM-O, YCB-V, and T-Less demonstrate +that our method surpasses all refinement-free methods and is even on par with +expensive refinement-based approaches. Crucially, our approach is +computationally efficient and enables real-time critical applications with high +accuracy requirements. Code and models will be released. + +
+
+
+
+
+ + ☆ Echocardiogram Foundation Model -- Application 1: Estimating Ejection + Fraction + + +
+ Cardiovascular diseases stand as the primary global cause of mortality. Among +the various imaging techniques available for visualising the heart and +evaluating its function, echocardiograms emerge as the preferred choice due to +their safety and low cost. Quantifying cardiac function based on +echocardiograms is very laborious, time-consuming and subject to high +interoperator variability. In this work, we introduce EchoAI, an echocardiogram +foundation model, that is trained using self-supervised learning (SSL) on 1.5 +million echocardiograms. We evaluate our approach by fine-tuning EchoAI to +estimate the ejection fraction achieving a mean absolute percentage error of +9.40%. This level of accuracy aligns with the performance of expert +sonographers. + +
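For reference, the mean absolute percentage error reported above is computed as sketched below; this is a generic metric implementation, independent of the EchoAI model, and the example values are made up.

```python
import numpy as np

def mape(y_true, y_pred):
    """Mean absolute percentage error, returned as a percentage."""
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return 100.0 * np.mean(np.abs((y_true - y_pred) / y_true))

# Toy example with ejection-fraction values in percent.
print(mape([60.0, 45.0, 55.0], [57.0, 48.0, 52.0]))  # ~5.7
```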
+
+
+
+
+ + ☆ A Region of Interest Focused Triple UNet Architecture for Skin Lesion + Segmentation + + +
+ Skin lesion segmentation is of great significance for skin lesion analysis +and subsequent treatment. It is still a challenging task due to the irregular +and fuzzy lesion borders, and diversity of skin lesions. In this paper, we +propose Triple-UNet to automatically segment skin lesions. It is an organic +combination of three UNet architectures with suitable modules. In order to +concatenate the first and second sub-networks more effectively, we design a +region of interest enhancement module (ROIE). The ROIE enhances the target +object region of the image by using the predicted score map of the first UNet. +The features learned by the first UNet and the enhanced image help the second +UNet obtain a better score map. Finally, the results are fine-tuned by the +third UNet. We evaluate our algorithm on a publicly available dataset of skin +lesion segmentation. Experiments show that Triple-UNet outperforms the +state-of-the-art on skin lesion segmentation. + +
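One plausible reading of the ROIE module, enhancing the input with the first UNet's score map before the second UNet sees it, is the simple rescaling below. This is an assumed formulation for illustration, not necessarily the authors' exact operation.

```python
import torch

def roi_enhance(image, score_map, gain=1.0):
    """Emphasize the likely lesion region of `image` using a predicted score map.

    image:     (B, C, H, W) input batch
    score_map: (B, 1, H, W) foreground probabilities from the first UNet
    """
    score_map = score_map.clamp(0.0, 1.0)
    # Pixels likely to belong to the lesion are amplified; background is kept.
    enhanced = image * (1.0 + gain * score_map)
    # Rescale each channel back into a comparable range.
    return enhanced / enhanced.amax(dim=(2, 3), keepdim=True).clamp(min=1e-6)

x = torch.rand(2, 3, 128, 128)
p = torch.rand(2, 1, 128, 128)
print(roi_enhance(x, p).shape)  # torch.Size([2, 3, 128, 128])
```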
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ Multi-Resolution Planar Region Extraction for Uneven Terrains + + +
+ This paper studies the problem of extracting planar regions in uneven +terrains from unordered point cloud measurements. Such a problem is critical in +various robotic applications such as robotic perceptive locomotion. While +existing approaches have shown promising results in effectively extracting +planar regions from the environment, they often suffer from issues such as low +computational efficiency or loss of resolution. To address these issues, we +propose a multi-resolution planar region extraction strategy in this paper that +balances the accuracy in boundaries and computational efficiency. Our method +begins with a pointwise classification preprocessing module, which categorizes +all sampled points according to their local geometric properties to facilitate +multi-resolution segmentation. Subsequently, we arrange the categorized points +using an octree, followed by an in-depth analysis of nodes to finish +multi-resolution plane segmentation. The efficiency and robustness of the +proposed approach are verified via synthetic and real-world experiments, +demonstrating our method's ability to generalize effectively across various +uneven terrains while maintaining real-time performance, achieving frame rates +exceeding 35 FPS. + +
+
+
+
+
+ + ☆ Convolutional Neural Networks for Neuroimaging in Parkinson's Disease: + Is Preprocessing Needed? + + +
+ Spatial and intensity normalization are nowadays a prerequisite for +neuroimaging analysis. Influenced by voxel-wise and other univariate +comparisons, where these corrections are key, they are commonly applied to any +type of analysis and imaging modalities. Nuclear imaging modalities such as +PET-FDG or FP-CIT SPECT, a common modality used in Parkinson's Disease +diagnosis, are especially dependent on intensity normalization. However, these +steps are computationally expensive and furthermore, they may introduce +deformations in the images, altering the information contained in them. +Convolutional Neural Networks (CNNs), for their part, introduce position +invariance to pattern recognition, and have been proven to classify objects +regardless of their orientation, size, angle, etc. Therefore, a question +arises: how well can CNNs account for spatial and intensity differences when +analysing nuclear brain imaging? Are spatial and intensity normalization still +needed? To answer this question, we have trained four different CNN models +based on well-established architectures, using or not different spatial and +intensity normalization preprocessing. The results show that a sufficiently +complex model such as our three-dimensional version of the ALEXNET can +effectively account for spatial differences, achieving a diagnosis accuracy of +94.1% with an area under the ROC curve of 0.984. The visualization of the +differences via saliency maps shows that these models are correctly finding +patterns that match those found in the literature, without the need of applying +any complex spatial normalization procedure. However, the intensity +normalization -- and its type -- is revealed as very influential in the results +and accuracy of the trained model, and therefore must be well accounted. + +
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+ + ☆ Benchmarking bias: Expanding clinical AI model card to incorporate bias + reporting of social and non-social factors + + +
+ Clinical AI model reporting cards should be expanded to incorporate a broad +bias reporting of both social and non-social factors. Non-social factors +consider the role of other factors, such as disease dependent, anatomic, or +instrument factors on AI model bias, which are essential to ensure safe +deployment. + +
+
+
+
+
+ + ☆ "HoVer-UNet": Accelerating HoVerNet with UNet-based multi-class nuclei + segmentation via knowledge distillation + + +
+ We present &#34;HoVer-UNet&#34;, an approach to distill the knowledge of the +multi-branch HoVerNet framework for nuclei instance segmentation and +classification in histopathology. We propose a compact, streamlined single UNet +network with a Mix Vision Transformer backbone, and equip it with a custom loss +function to optimally encode the distilled knowledge of HoVerNet, reducing +computational requirements without compromising performance. We show that our +model achieved results comparable to HoVerNet on the public PanNuke and Consep +datasets with a three-fold reduction in inference time. We make the code of our +model publicly available at https://github.com/DIAGNijmegen/HoVer-UNet. + +&#x0D;
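The distillation setup, a single student UNet trained against HoVerNet's outputs together with the ground truth, can be sketched with a generic combined loss; the temperature and weighting below are illustrative defaults rather than the custom loss described in the paper.

```python
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, target, alpha=0.5, T=2.0):
    """Pixel-wise knowledge distillation for multi-class segmentation.

    student_logits, teacher_logits: (B, K, H, W)
    target: (B, H, W) integer class labels
    """
    # Soft targets from the (frozen) teacher, softened by temperature T.
    soft_loss = F.kl_div(
        F.log_softmax(student_logits / T, dim=1),
        F.softmax(teacher_logits / T, dim=1),
        reduction="batchmean",
    ) * (T * T)
    hard_loss = F.cross_entropy(student_logits, target)
    return alpha * soft_loss + (1.0 - alpha) * hard_loss

s = torch.randn(2, 6, 64, 64, requires_grad=True)   # student output
t = torch.randn(2, 6, 64, 64)                       # teacher output
y = torch.randint(0, 6, (2, 64, 64))                # ground-truth classes
distillation_loss(s, t, y).backward()
```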
+
+ comment: 4 pages, 2 figures, submitted to ISBI 2024 +
+
+
+
+
+ + ☆ GMISeg: General Medical Image Segmentation without Re-Training + + +
+ Although deep learning models have become the main method for medical image +segmentation, they often cannot be extended to unknown segmentation tasks +involving new anatomical structures, image shapes, or labels. For new +segmentation tasks, researchers often have to retrain or fine-tune the model, +which is time-consuming and poses a significant obstacle to clinical +researchers, who often lack the resources and professional knowledge to train +neural networks. Therefore, we proposed a general method that can solve unknown +medical image segmentation tasks without requiring additional training. Given +an example set of images and prompts for defining new segmentation tasks, +GMISeg applies a novel low-rank fine-tuning strategy based on the proposed +approach to the SAM (Segment Anything Model) image encoder, and works with the +prompt encoder and mask decoder to fine-tune the labeled dataset without the +need for additional training. To achieve generalization of new tasks, we used +medical image datasets with different imaging modes for different parts. We +trained and generalized GMISeg on a different set of anatomical and imaging +modes using cardiac images on other site datasets. We have demonstrated that +GMISeg outperforms the latest methods on unknown tasks and have conducted a +comprehensive analysis and summary of the important performance of the proposed +method. + +
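The low-rank fine-tuning of a frozen encoder can be illustrated with a generic LoRA-style adapter around a linear projection. This is a sketch of the general technique under assumed shapes, not GMISeg's specific strategy or the actual SAM code.

```python
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    """A frozen linear layer plus a trainable low-rank update W + B @ A."""

    def __init__(self, base: nn.Linear, rank: int = 4, alpha: float = 1.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():    # original weights stay frozen
            p.requires_grad = False
        self.A = nn.Parameter(torch.randn(rank, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, rank))
        self.scale = alpha / rank

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.t() @ self.B.t())

# Wrap a stand-in projection layer of a frozen image encoder.
proj = nn.Linear(768, 768)
adapted = LoRALinear(proj, rank=8)
tokens = torch.randn(1, 196, 768)
out = adapted(tokens)                # only A and B receive gradients
print(out.shape)
```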
+
+ comment: arXiv admin note: text overlap with arXiv:2304.06131 by other authors +
+
+
+
+
+ + ☆ Hyb-NeRF: A Multiresolution Hybrid Encoding for Neural Radiance Fields WACV2024 + + +
+ Recent advances in Neural Radiance Fields (NeRF) have enabled high-fidelity +scene reconstruction for novel view synthesis. However, NeRF requires hundreds +of network evaluations per pixel to approximate a volume rendering integral, +making it slow to train. Caching NeRFs into explicit data structures can +effectively enhance rendering speed but at the cost of higher memory usage. To +address these issues, we present Hyb-NeRF, a novel neural radiance field with a +multi-resolution hybrid encoding that achieves efficient neural modeling and +fast rendering, which also allows for high-quality novel view synthesis. The +key idea of Hyb-NeRF is to represent the scene using different encoding +strategies from coarse-to-fine resolution levels. Hyb-NeRF exploits +memory-efficient learnable positional features at coarse resolutions and the +fast optimization speed and local details of hash-based feature grids at fine +resolutions. In addition, to further boost performance, we embed cone +tracing-based features in our learnable positional encoding that eliminates +encoding ambiguity and reduces aliasing artifacts. Extensive experiments on +both synthetic and real-world datasets show that Hyb-NeRF achieves faster +rendering speed with better rendering quality and even a lower memory footprint +in comparison to previous state-of-the-art methods. + +&#x0D;
+
+ comment: WACV2024 +
+
+
+
+
+ + ☆ HCA-Net: Hierarchical Context Attention Network for Intervertebral Disc + Semantic Labeling + + +
+ Accurate and automated segmentation of intervertebral discs (IVDs) in medical +images is crucial for assessing spine-related disorders, such as osteoporosis, +vertebral fractures, or IVD herniation. We present HCA-Net, a novel contextual +attention network architecture for semantic labeling of IVDs, with a special +focus on exploiting prior geometric information. Our approach excels at +processing features across different scales and effectively consolidating them +to capture the intricate spatial relationships within the spinal cord. To +achieve this, HCA-Net models IVD labeling as a pose estimation problem, aiming +to minimize the discrepancy between each predicted IVD location and its +corresponding actual joint location. In addition, we introduce a skeletal loss +term to reinforce the model's geometric dependence on the spine. This loss +function is designed to constrain the model's predictions to a range that +matches the general structure of the human vertebral skeleton. As a result, the +network learns to reduce the occurrence of false predictions and adaptively +improves the accuracy of IVD location estimation. Through extensive +experimental evaluation on multi-center spine datasets, our approach +consistently outperforms previous state-of-the-art methods on both MRI T1w and +T2w modalities. The codebase is accessible to the public on +\href{https://github.com/xmindflow/HCA-Net}{GitHub}. + +
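The skeletal loss is described as constraining predictions to the general geometry of the spine. A minimal interpretation, assumed here since the exact formulation is not given in the abstract, penalizes consecutive inter-disc distances that fall outside a prior range:

```python
import torch

def skeletal_loss(pred_locs, d_min=15.0, d_max=45.0):
    """Penalize consecutive intervertebral-disc distances outside a prior range.

    pred_locs: (B, N, 2) predicted (y, x) locations of N discs, ordered
               cranio-caudally; d_min/d_max are assumed bounds in pixels.
    """
    diffs = pred_locs[:, 1:] - pred_locs[:, :-1]   # (B, N-1, 2)
    dists = diffs.norm(dim=-1)                     # (B, N-1)
    too_small = (d_min - dists).clamp(min=0.0)
    too_large = (dists - d_max).clamp(min=0.0)
    return (too_small + too_large).mean()

locs = torch.rand(4, 7, 2, requires_grad=True) * 300
skeletal_loss(locs).backward()
```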
+
+
+
+
+ + ☆ Speaker-Adapted End-to-End Visual Speech Recognition for Continuous + Spanish + + +
+ Different studies have shown the importance of visual cues throughout the +speech perception process. In fact, the development of audiovisual approaches +has led to advances in the field of speech technologies. However, although +noticeable results have recently been achieved, visual speech recognition +remains an open research problem. It is a task in which, by dispensing with the +auditory sense, challenges such as visual ambiguities and the complexity of +modeling silence must be faced. Nonetheless, some of these challenges can be +alleviated when the problem is approached from a speaker-dependent perspective. +Thus, this paper studies, using the Spanish LIP-RTVE database, how the +estimation of specialized end-to-end systems for a specific person could affect +the quality of speech recognition. First, different adaptation strategies based +on the fine-tuning technique were proposed. Then, a pre-trained CTC/Attention +architecture was used as a baseline throughout our experiments. Our findings +showed that a two-step fine-tuning process, where the VSR system is first +adapted to the task domain, provided significant improvements when the speaker +adaptation was addressed. Furthermore, results comparable to the current state +of the art were reached even when only a limited amount of data was available. + +
+
+ comment: Accepted in Proceedings of IberSpeech 2022 ( + https://www.isca-speech.org/archive/iberspeech_2022/gimenogomez22_iberspeech.html + ) +
+
+
+
+
+ + ☆ MaskFlow: Object-Aware Motion Estimation + + +
+ We introduce a novel motion estimation method, MaskFlow, that is capable of +estimating accurate motion fields, even in very challenging cases with small +objects, large displacements and drastic appearance changes. In addition to +lower-level features, that are used in other Deep Neural Network (DNN)-based +motion estimation methods, MaskFlow draws from object-level features and +segmentations. These features and segmentations are used to approximate the +objects' translation motion field. We propose a novel and effective way of +incorporating the incomplete translation motion field into a subsequent motion +estimation network for refinement and completion. We also produced a new +challenging synthetic dataset with motion field ground truth, and also provide +extra ground truth for the object-instance matchings and corresponding +segmentation masks. We demonstrate that MaskFlow outperforms state of the art +methods when evaluated on our new challenging dataset, whilst still producing +comparable results on the popular FlyingThings3D benchmark dataset. + +
+
+
+
+
+ + ☆ Analysis of Visual Features for Continuous Lipreading in Spanish + + +
+ During a conversation, our brain is responsible for combining information +obtained from multiple senses in order to improve our ability to understand the +message we are perceiving. Different studies have shown the importance of +presenting visual information in these situations. Nevertheless, lipreading is +a complex task whose objective is to interpret speech when audio is not +available. By dispensing with a sense as crucial as hearing, it will be +necessary to be aware of the challenge that this lack presents. In this paper, +we propose an analysis of different speech visual features with the intention +of identifying which of them is the best approach to capture the nature of lip +movements for natural Spanish and, in this way, dealing with the automatic +visual speech recognition task. In order to estimate our system, we present an +audiovisual corpus compiled from a subset of the RTVE database, which has been +used in the Albayz\'in evaluations. We employ a traditional system based on +Hidden Markov Models with Gaussian Mixture Models. Results show that, although +the task is difficult, in restricted conditions we obtain recognition results +which determine that using eigenlips in combination with deep features is the +best visual approach. + +
+
+ comment: Accepted in Proceedings of IberSpeech 2020 ( + https://www.isca-speech.org/archive/iberspeech_2021/gimenogomez21_iberspeech.html + ) +
+
+
+
+
+ + ☆ GLAD: Global-Local View Alignment and Background Debiasing for + Unsupervised Video Domain Adaptation with Large Domain Gap WACV 2024 + + +
+ In this work, we tackle the challenging problem of unsupervised video domain +adaptation (UVDA) for action recognition. We specifically focus on scenarios +with a substantial domain gap, in contrast to existing works that primarily +deal with small domain gaps between labeled source domains and unlabeled target +domains. To establish a more realistic setting, we introduce a novel UVDA +scenario, denoted as Kinetics-&gt;BABEL, with a more considerable domain gap in +terms of both temporal dynamics and background shifts. To tackle the temporal +shift, i.e., action duration difference between the source and target domains, +we propose a global-local view alignment approach. To mitigate the background +shift, we propose to learn temporal order sensitive representations by temporal +order learning and background invariant representations by background +augmentation. We empirically validate that the proposed method shows +significant improvement over the existing methods on the Kinetics-&gt;BABEL +dataset with a large domain gap. The code is available at +https://github.com/KHUVLL/GLAD. + +&#x0D;
+
+ comment: This is an accepted WACV 2024 paper +
+
+
+
+
+ + ☆ HiFi-Syn: Hierarchical Granularity Discrimination for High-Fidelity + Synthesis of MR Images with Structure Preservation + + +
+ Synthesizing medical images while preserving their structural information is +crucial in medical research. In such scenarios, the preservation of anatomical +content becomes especially important. Although recent advances have been made +by incorporating instance-level information to guide translation, these methods +overlook the spatial coherence of structural-level representation and the +anatomical invariance of content during translation. To address these issues, +we introduce hierarchical granularity discrimination, which exploits various +levels of semantic information present in medical images. Our strategy utilizes +three levels of discrimination granularity: pixel-level discrimination using a +Brain Memory Bank, structure-level discrimination on each brain structure with +a re-weighting strategy to focus on hard samples, and global-level +discrimination to ensure anatomical consistency during translation. The image +translation performance of our strategy has been evaluated on three independent +datasets (UK Biobank, IXI, and BraTS 2018), and it has outperformed +state-of-the-art algorithms. Particularly, our model excels not only in +synthesizing normal structures but also in handling abnormal (pathological) +structures, such as brain tumors, despite the variations in contrast observed +across different imaging modalities due to their pathological characteristics. +The diagnostic value of synthesized MR images containing brain tumors has been +evaluated by radiologists. This indicates that our model may offer an +alternative solution in scenarios where specific MR modalities of patients are +unavailable. Extensive experiments further demonstrate the versatility of our +method, providing unique insights into medical image translation. + +
+
+
+
+
+ + ☆ LIP-RTVE: An Audiovisual Database for Continuous Spanish in the Wild LREC 2022 + + +
+ Speech is considered to be a multi-modal process where hearing and vision are +two fundamental pillars. In fact, several studies have demonstrated that the +robustness of Automatic Speech Recognition systems can be improved when audio +and visual cues are combined to represent the nature of speech. In addition, +Visual Speech Recognition, an open research problem whose purpose is to +interpret speech by reading the lips of the speaker, has been a focus of +interest in recent decades. Nevertheless, in order to estimate these systems +in the current Deep Learning era, large-scale databases are required. On the +other hand, while most of these databases are dedicated to English, other +languages lack sufficient resources. Thus, this paper presents a +semi-automatically annotated audiovisual database to deal with unconstrained +natural Spanish, providing 13 hours of data extracted from Spanish television. +Furthermore, baseline results for both speaker-dependent and +speaker-independent scenarios are reported using Hidden Markov Models, a +traditional paradigm that has been widely used in the field of Speech +Technologies. + +&#x0D;
+
+ comment: Accepted in Proceedings of LREC 2022 ( + https://aclanthology.org/2022.lrec-1.294 ) +
+
+
+
+
+ + ☆ Learning Site-specific Styles for Multi-institutional Unsupervised + Cross-modality Domain Adaptation + + +
+ Unsupervised cross-modality domain adaptation is a challenging task in +medical image analysis, and it becomes more challenging when source and target +domain data are collected from multiple institutions. In this paper, we present +our solution to tackle the multi-institutional unsupervised domain adaptation +for the crossMoDA 2023 challenge. First, we perform unpaired image translation +to translate the source domain images to the target domain, where we design a +dynamic network to generate synthetic target domain images with controllable, +site-specific styles. Afterwards, we train a segmentation model using the +synthetic images and further reduce the domain gap by self-training. Our +solution achieved the 1st place during both the validation and testing phases +of the challenge. + +
+
+ comment: crossMoDA 2023 challenge 1st place solution +
+
+
+
+
+ + ☆ AR Visualization System for Ship Detection and Recognition Based on AI + + +
+ Augmented reality technology has been widely used in industrial design +interaction, exhibition guidance, information retrieval and other fields. The +combination of artificial intelligence and augmented reality technology has +also become a future development trend. This project is an AR visualization +system for ship detection and recognition based on AI, which mainly includes +three parts: an artificial intelligence module, a Unity development module and +a Hololens2 AR module. The project uses the R3Det algorithm to complete the +detection and recognition of ships in remote sensing images. The detection +model trained on an RTX 2080Ti reaches a recognition rate of 96%. Then, the 3D +model of the ship is obtained from the detected ship category and information +and generated in the virtual scene. At the same time, a voice module and a UI +interaction module are added. Finally, we completed the deployment of the +project on Hololens2 through MRTK. The system realizes the fusion of computer +vision and augmented reality technology, mapping the results of object +detection into the AR field, and takes a step toward future technological +trends and intelligent applications. + +&#x0D;
+
+ comment: 4 pages,7 figures,IEEE International Conference on Virtual Reality + and Visualization +
+
+
+
+
+ + ☆ Two Views Are Better than One: Monocular 3D Pose Estimation with + Multiview Consistency + + +
+ Deducing a 3D human pose from a single 2D image or 2D keypoints is inherently +challenging, given the fundamental ambiguity wherein multiple 3D poses can +correspond to the same 2D representation. The acquisition of 3D data, while +invaluable for resolving pose ambiguity, is expensive and requires an intricate +setup, often restricting its applicability to controlled lab environments. We +improve performance of monocular human pose estimation models using multiview +data for fine-tuning. We propose a novel loss function, multiview consistency, +to enable adding additional training data with only 2D supervision. This loss +enforces that the inferred 3D pose from one view aligns with the inferred 3D +pose from another view under similarity transformations. Our consistency loss +substantially improves performance for fine-tuning with no available 3D data. +Our experiments demonstrate that two views offset by 90 degrees are enough to +obtain good performance, with only marginal improvements by adding more views. +Thus, we enable the acquisition of domain-specific data by capturing activities +with off-the-shelf cameras, eliminating the need for elaborate calibration +procedures. This research introduces new possibilities for domain adaptation in +3D pose estimation, providing a practical and cost-effective solution to +customize models for specific applications. The used dataset, featuring +additional views, will be made publicly available. + +
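The multiview consistency loss asks the 3D pose inferred from one view to agree with the pose inferred from another view up to a similarity transformation. One compact way to realize this (a sketch; the paper's implementation may differ) is a Procrustes/Umeyama alignment followed by an L2 penalty:

```python
import torch

def similarity_align(X, Y):
    """Align X (J, 3) onto Y (J, 3) with the best scale, rotation, translation."""
    muX, muY = X.mean(0), Y.mean(0)
    Xc, Yc = X - muX, Y - muY
    # Optimal rotation via SVD of the cross-covariance (Kabsch/Umeyama).
    U, S, Vt = torch.linalg.svd(Xc.t() @ Yc)
    d = torch.sign(torch.det(Vt.t() @ U.t())).item()   # reflection guard
    D = torch.diag(torch.tensor([1.0, 1.0, d]))
    R = Vt.t() @ D @ U.t()
    scale = (S * torch.tensor([1.0, 1.0, d])).sum() / (Xc ** 2).sum()
    return scale * Xc @ R.t() + muY

def multiview_consistency_loss(pose_view1, pose_view2):
    """Both inputs are (J, 3) joint predictions for the same person and frame."""
    aligned = similarity_align(pose_view1, pose_view2)
    return ((aligned - pose_view2) ** 2).mean()

p1 = torch.randn(17, 3, requires_grad=True)
p2 = torch.randn(17, 3)
multiview_consistency_loss(p1, p2).backward()
```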
+
+
+
+
+ + ☆ Board-to-Board: Evaluating Moonboard Grade Prediction Generalization + + +
+ Bouldering is a sport where athletes aim to climb up an obstacle using a set +of defined holds called a route. Typically routes are assigned a grade to +inform climbers of its difficulty and allow them to more easily track their +progression. However, the variation in individual climbers technical and +physical attributes and many nuances of an individual route make grading a +difficult and often biased task. In this work, we apply classical and +deep-learning modelling techniques to the 2016, 2017 and 2019 Moonboard +datasets, achieving state of the art grade prediction performance with 0.87 MAE +and 1.12 RMSE. We achieve this performance on a feature-set that does not +require decomposing routes into individual moves, which is a method common in +literature and introduces bias. We also demonstrate the generalization +capability of this model between editions and introduce a novel vision-based +method of grade prediction. While the generalization performance of these +techniques is below human level performance currently, we propose these methods +as a basis for future work. Such a tool could be implemented in pre-existing +mobile applications and would allow climbers to better track their progress and +assess new routes with reduced bias. + +
+
+
+
+
+ + ☆ Learning Part Motion of Articulated Objects Using Spatially Continuous + Neural Implicit Representations BMVC 2023 + + +
+ Articulated objects (e.g., doors and drawers) exist everywhere in our life. +Different from rigid objects, articulated objects have higher degrees of +freedom and are rich in geometries, semantics, and part functions. Modeling +different kinds of parts and articulations with neural networks plays an +essential role in articulated object understanding and manipulation, and will +further benefit the 3D vision and robotics communities. To model articulated +objects, most previous works directly encode articulated objects into feature +representations, without specific designs for parts, articulations and part +motions. In this paper, we introduce a novel framework that explicitly +disentangles the part motion of articulated objects by predicting the +transformation matrix of points on the part surface, using spatially continuous +neural implicit representations to model the part motion smoothly in space. +More importantly, while many methods can only model a certain kind of joint +motion (such as revolution in the clockwise order), our proposed framework +is generic to different kinds of joint motions in that the transformation +matrix can model diverse kinds of joint motions in space. Quantitative and +qualitative results of experiments over diverse categories of articulated +objects demonstrate the effectiveness of our proposed framework. + +&#x0D;
+
+ comment: 10 pages, 6 figures. Accepted by BMVC 2023 +
+
+
+
+
+ + ☆ CASR: Refining Action Segmentation via Marginalizing Frame-level Causal + Relationships + + +&#x0D;
+ Integrating deep learning and causal discovery has increased the +interpretability of Temporal Action Segmentation (TAS) tasks. However, +frame-level causal relationships contain many complicated noise factors outside +the segment level, making it infeasible to directly express macro action +semantics. Thus, we propose \textit{\textbf{Causal Abstraction Segmentation +Refiner (CASR)}}, which can refine TAS results from various models by enhancing +video causality through marginalizing frame-level causal relationships. +Specifically, we define equivalent frame-level and segment-level causal models, +so that the causal adjacency matrix constructed from marginalized frame-level +causal relationships has the ability to represent the segment-level causal +relationships. CASR works by reducing the difference between the causal +adjacency matrix we construct and that of the pre-segmentation results of +backbone models. In addition, we propose a novel evaluation metric, Causal Edit +Distance (CED), to evaluate causal interpretability. Extensive experimental +results on mainstream datasets indicate that CASR significantly surpasses +various existing methods in action segmentation performance, as well as in +causal explainability and generalization. Our code will be available soon. + +&#x0D;
+
+
+
+
+ + ☆ RFTrans: Leveraging Refractive Flow of Transparent Objects for Surface + Normal Estimation and Manipulation + + +
+ Transparent objects are widely used in our daily lives, making it important +to teach robots to interact with them. However, it's not easy because the +reflective and refractive effects can make RGB-D cameras fail to give accurate +geometry measurements. To solve this problem, this paper introduces RFTrans, an +RGB-D-based method for surface normal estimation and manipulation of +transparent objects. By leveraging refractive flow as an intermediate +representation, RFTrans circumvents the drawbacks of directly predicting the +geometry (e.g. surface normal) from RGB images and helps bridge the sim-to-real +gap. RFTrans integrates the RFNet, which predicts refractive flow, object mask, +and boundaries, followed by the F2Net, which estimates surface normal from the +refractive flow. To make manipulation possible, a global optimization module +will take in the predictions, refine the raw depth, and construct the point +cloud with normal. An analytical grasp planning algorithm, ISF, is followed to +generate the grasp poses. We build a synthetic dataset with physically +plausible ray-tracing rendering techniques to train the networks. Results show +that the RFTrans trained on the synthetic dataset can consistently outperform +the baseline ClearGrasp in both synthetic and real-world benchmarks by a large +margin. Finally, a real-world robot grasping task witnesses an 83% success +rate, proving that refractive flow can help enable direct sim-to-real transfer. +The code, data, and supplementary materials are available at +https://rftrans.robotflow.ai. + +
+
+
+
+
+ + ☆ Rich and Poor Texture Contrast: A Simple yet Effective Approach for + AI-generated Image Detection + + +
+ Recent generative models show impressive performance in generating +photographic images. Humans can hardly distinguish such incredibly +realistic-looking AI-generated images from real ones. AI-generated images may +lead to ubiquitous disinformation dissemination. Therefore, it is of utmost +urgency to develop a detector to identify AI-generated images. Most existing +detectors suffer from sharp performance drops over unseen generative models. In +this paper, we propose a novel AI-generated image detector capable of +identifying fake images created by a wide range of generative models. Our +approach leverages the inter-pixel correlation contrast between rich and poor +texture regions within an image. Pixels in rich texture regions exhibit more +significant fluctuations than those in poor texture regions. This discrepancy +reflects that the entropy of rich texture regions is larger than that of poor +ones. Consequently, synthesizing realistic rich texture regions proves to be +more challenging for existing generative models. Based on this principle, we +divide an image into multiple patches and reconstruct them into two images, +comprising rich-texture and poor-texture patches respectively. Subsequently, we +extract the inter-pixel correlation discrepancy feature between rich and poor +texture regions. This feature serves as a universal fingerprint used for +AI-generated image forensics across different generative models. In addition, +we build a comprehensive AI-generated image detection benchmark, which includes +16 kinds of prevalent generative models, to evaluate the effectiveness of +existing baselines and our approach. Our benchmark provides a leaderboard for +follow-up studies. Extensive experimental results show that our approach +outperforms state-of-the-art baselines by a significant margin. Our project: +https://fdmas.github.io/AIGCDetect/ + +
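The core measurement, contrasting inter-pixel fluctuation between rich- and poor-texture regions, can be sketched as below. The fluctuation measure, the patch size, and the way the two groups are summarized into a single number are illustrative assumptions.

```python
import numpy as np

def texture_contrast_feature(image, patch=32):
    """Split an image into patches, rank them by texture richness, and
    return the fluctuation gap between the rich and poor halves.

    image: (H, W) grayscale array in [0, 255]
    """
    h, w = image.shape
    diversities = []
    for y in range(0, h - patch + 1, patch):
        for x in range(0, w - patch + 1, patch):
            p = image[y:y + patch, x:x + patch].astype(np.float64)
            # Pixel fluctuation: mean absolute difference to horizontal
            # and vertical neighbours within the patch.
            dh = np.abs(np.diff(p, axis=1)).mean()
            dv = np.abs(np.diff(p, axis=0)).mean()
            diversities.append(dh + dv)
    diversities = np.sort(np.asarray(diversities))
    half = len(diversities) // 2
    poor, rich = diversities[:half], diversities[half:]
    # The claim above is that generated images tend to show a smaller gap.
    return rich.mean() - poor.mean()

img = (np.random.rand(256, 256) * 255).astype(np.uint8)
print(texture_contrast_feature(img))
```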
+
+ comment: Our project: https://fdmas.github.io/AIGCDetect/ +
+
+
+
+
+ + ☆ From Wrong To Right: A Recursive Approach Towards Vision-Language + Explanation EMNLP 2023 + + +
+ Addressing the challenge of adapting pre-trained vision-language models for +generating insightful explanations for visual reasoning tasks with limited +annotations, we present ReVisE: a $\textbf{Re}$cursive $\textbf{Vis}$ual +$\textbf{E}$xplanation algorithm. Our method iteratively computes visual +features (conditioned on the text input), an answer, and an explanation, to +improve the explanation quality step by step until the answer converges. We +find that this multi-step approach guides the model to correct its own answers +and outperforms single-step explanation generation. Furthermore, explanations +generated by ReVisE also serve as valuable annotations for few-shot +self-training. Our approach outperforms previous methods while utilizing merely +5% of the human-annotated explanations across 10 metrics, demonstrating up to a +4.2 and 1.3 increase in BLEU-1 score on the VCR and VQA-X datasets, +underscoring the efficacy and data-efficiency of our method. + +
+
+ comment: EMNLP 2023 Main +
+
+
+
+
+ + ☆ Point, Segment and Count: A Generalized Framework for Object Counting + + +
+ Class-agnostic object counting aims to count all objects in an image with +respect to example boxes or class names, \emph{a.k.a} few-shot and zero-shot +counting. Current state-of-the-art methods highly rely on density maps to +predict object counts, which lacks model interpretability. In this paper, we +propose a generalized framework for both few-shot and zero-shot object counting +based on detection. Our framework combines the superior advantages of two +foundation models without compromising their zero-shot capability: (\textbf{i}) +SAM to segment all possible objects as mask proposals, and (\textbf{ii}) CLIP +to classify proposals to obtain accurate object counts. However, this strategy +meets the obstacles of efficiency overhead and the small crowded objects that +cannot be localized and distinguished. To address these issues, our framework, +termed PseCo, follows three steps: point, segment, and count. Specifically, we +first propose a class-agnostic object localization to provide accurate but +least point prompts for SAM, which consequently not only reduces computation +costs but also avoids missing small objects. Furthermore, we propose a +generalized object classification that leverages CLIP image/text embeddings as +the classifier, following a hierarchical knowledge distillation to obtain +discriminative classifications among hierarchical mask proposals. Extensive +experimental results on FSC-147 dataset demonstrate that PseCo achieves +state-of-the-art performance in both few-shot/zero-shot object +counting/detection, with additional results on large-scale COCO and LVIS +datasets. The source code is available at +\url{https://github.com/Hzzone/PseCo}. + +
+
+
+
+
+ + ☆ Semi-supervised Medical Image Segmentation via Query Distribution + Consistency + + +
+ Semi-supervised learning is increasingly popular in medical image +segmentation due to its ability to leverage large amounts of unlabeled data to +extract additional information. However, most existing semi-supervised +segmentation methods focus only on extracting information from unlabeled data. +In this paper, we propose a novel Dual KMax UX-Net framework that leverages +labeled data to guide the extraction of information from unlabeled data. Our +approach is based on a mutual learning strategy that incorporates two modules: +3D UX-Net as our backbone meta-architecture and KMax decoder to enhance the +segmentation performance. Extensive experiments on the Atrial Segmentation +Challenge dataset have shown that our method can significantly improve +performance by merging unlabeled data. Meanwhile, our framework outperforms +state-of-the-art semi-supervised learning methods on 10\% and 20\% labeled +settings. Code located at: https://github.com/Rows21/DK-UXNet. + +
+
+ comment: Submitted to IEEE ISBI 2024 +
+
+
+
+
+ + ☆ Post-Training Quantization with Low-precision Minifloats and Integers on + FPGAs + + +
+ Post-Training Quantization (PTQ) is a powerful technique for model +compression, reducing the precision of neural networks without additional +training overhead. Recent works have investigated adopting 8-bit floating-point +quantization (FP8) in the context of PTQ for model inference. However, the +exploration of floating-point formats smaller than 8 bits and their comparison +with integer quantization remains relatively limited. In this work, we present +minifloats, which are reduced-precision floating-point formats capable of +further reducing the memory footprint, latency, and energy cost of a model +while approaching full-precision model accuracy. Our work presents a novel PTQ +design-space exploration, comparing minifloat and integer quantization schemes +across a range of 3 to 8 bits for both weights and activations. We examine the +applicability of various PTQ techniques to minifloats, including weight +equalization, bias correction, SmoothQuant, gradient-based learned rounding, +and the GPTQ method. Our experiments validate the effectiveness of +low-precision minifloats when compared to their integer counterparts across a +spectrum of accuracy-precision trade-offs on a set of reference deep learning +vision workloads. Finally, we evaluate our results against an FPGA-based +hardware cost model, showing that integer quantization often remains the +Pareto-optimal option, given its relatively smaller hardware resource +footprint. + +
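A minifloat is simply a floating-point format with very few exponent and mantissa bits. The helper below simulates quantization to such a format by rounding the mantissa within each value's binade; it is a simplified sketch (no subnormal or NaN handling, and the exponent bias is an assumed IEEE-style default), not the tooling used in the paper.

```python
import torch

def minifloat_quantize(x, exp_bits=3, man_bits=2, exp_bias=None):
    """Simulate quantization of a tensor to a small floating-point format.

    Values are rounded to man_bits of mantissa at their own binade and
    clipped to the largest representable magnitude. Subnormals, infinities
    and NaNs are not modelled.
    """
    if exp_bias is None:
        exp_bias = 2 ** (exp_bits - 1) - 1
    max_exp = 2 ** exp_bits - 1 - exp_bias
    max_val = (2.0 - 2.0 ** (-man_bits)) * 2.0 ** max_exp

    sign = torch.sign(x)
    mag = x.abs().clamp(min=1e-30)
    exp = torch.floor(torch.log2(mag))              # per-value binade
    exp = exp.clamp(min=float(1 - exp_bias))        # smallest normal binade
    step = 2.0 ** (exp - man_bits)                  # mantissa LSB at this binade
    q = torch.round(mag / step) * step
    q = torch.where(x == 0, torch.zeros_like(q), q)
    return sign * q.clamp(max=max_val)

w = torch.randn(4, 4)
print(minifloat_quantize(w, exp_bits=3, man_bits=2))
```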
+
+
+
+
+ + ☆ Stable Diffusion For Aerial Object Detection NeurIPS 2023 + + +
+ Aerial object detection is a challenging task, in which one major obstacle +lies in the limitations of large-scale data collection and the long-tail +distribution of certain classes. Synthetic data offers a promising solution, +especially with recent advances in diffusion-based methods like stable +diffusion (SD). However, the direct application of diffusion methods to aerial +domains poses unique challenges: stable diffusion's optimization for rich +ground-level semantics doesn't align with the sparse nature of aerial objects, +and the extraction of post-synthesis object coordinates remains problematic. To +address these challenges, we introduce a synthetic data augmentation framework +tailored for aerial images. It encompasses sparse-to-dense region of interest +(ROI) extraction to bridge the semantic gap, fine-tuning the diffusion model +with low-rank adaptation (LORA) to circumvent exhaustive retraining, and +finally, a Copy-Paste method to compose synthesized objects with backgrounds, +providing a nuanced approach to aerial object detection through synthetic data. + +
+
+ comment: Accepted at NeurIPS 2023 Synthetic Data Generation with Generative AI + workshop +
+
+
+
+
+ + ☆ Modality Mixer Exploiting Complementary Information for Multi-modal + Action Recognition + + +
+ Due to the distinctive characteristics of sensors, each modality exhibits +unique physical properties. For this reason, in the context of multi-modal +action recognition, it is important to consider not only the overall action +content but also the complementary nature of different modalities. In this +paper, we propose a novel network, named Modality Mixer (M-Mixer) network, +which effectively leverages and incorporates the complementary information +across modalities with the temporal context of actions for action recognition. +A key component of our proposed M-Mixer is the Multi-modal Contextualization +Unit (MCU), a simple yet effective recurrent unit. Our MCU is responsible for +temporally encoding a sequence of one modality (e.g., RGB) with action content +features of other modalities (e.g., depth and infrared modalities). This +process encourages the M-Mixer network to exploit global action content and +also to supplement complementary information of other modalities. Furthermore, +to extract appropriate complementary information with regard to the given +modality settings, we introduce a new module, named Complementary Feature +Extraction Module (CFEM). CFEM incorporates separate learnable query embeddings +for each modality, which guide CFEM to extract complementary information and +global action content from the other modalities. As a result, our proposed +method outperforms state-of-the-art methods on NTU RGB+D 60, NTU RGB+D 120, and +NW-UCLA datasets. Moreover, through comprehensive ablation studies, we further +validate the effectiveness of our proposed method. + +&#x0D;
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2208.11314 +
+
+
+
+
+ + ☆ LoCo: Locally Constrained Training-Free Layout-to-Image Synthesis + + +
+ Recent text-to-image diffusion models have reached an unprecedented level in +generating high-quality images. However, their exclusive reliance on textual +prompts often falls short in accurately conveying fine-grained spatial +compositions. In this paper, we propose LoCo, a training-free approach for +layout-to-image synthesis that excels in producing high-quality images aligned +with both textual prompts and spatial layouts. Our method introduces a +Localized Attention Constraint to refine cross-attention for individual +objects, ensuring their precise placement in designated regions. We further +propose a Padding Token Constraint to leverage the semantic information +embedded in previously neglected padding tokens, thereby preventing the +undesired fusion of synthesized objects. LoCo seamlessly integrates into +existing text-to-image and layout-to-image models, significantly amplifying +their performance and effectively addressing semantic failures observed in +prior methods. Through extensive experiments, we showcase the superiority of +our approach, surpassing existing state-of-the-art training-free +layout-to-image methods both qualitatively and quantitatively across multiple +benchmarks. + +
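The Localized Attention Constraint can be sketched as an energy that rewards the cross-attention mass of an object's text token falling inside its layout box; the formulation below is a generic, assumed version of that idea rather than LoCo's exact constraint.

```python
import torch

def localized_attention_loss(attn, boxes, token_ids):
    """Encourage each object token to attend inside its layout box.

    attn:      (T, H, W) cross-attention maps, one per text token
    boxes:     list of (x0, y0, x1, y1) in [0, 1] image coordinates
    token_ids: list of token indices, one per box
    """
    _, H, W = attn.shape
    loss = attn.new_zeros(())
    for (x0, y0, x1, y1), t in zip(boxes, token_ids):
        mask = torch.zeros(H, W, device=attn.device)
        mask[int(y0 * H):int(y1 * H), int(x0 * W):int(x1 * W)] = 1.0
        a = attn[t] / (attn[t].sum() + 1e-8)   # spatially normalized map
        inside = (a * mask).sum()
        loss = loss + (1.0 - inside)           # push attention mass into the box
    return loss / max(len(boxes), 1)

maps = torch.rand(77, 16, 16).softmax(dim=0)   # toy attention over 77 tokens
print(localized_attention_loss(maps, [(0.1, 0.1, 0.5, 0.5)], [5]))
```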
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ ViLaM: A Vision-Language Model with Enhanced Visual Grounding and + Generalization Capability + + +
+ Vision-language models have revolutionized human-computer interaction and +shown significant progress in multi-modal tasks. However, applying these models +to complex visual tasks like medical image analysis remains challenging. In +this study, we propose ViLaM, a unified Vision-Language transformer model that +integrates instruction tuning predicated on a large language model. This +approach enables us to optimally utilize the knowledge and reasoning capacities +of large pre-trained language models for an array of tasks encompassing both +language and vision. We employ frozen pre-trained encoders to encode and align +both image and text features, enabling ViLaM to handle a variety of visual +tasks following textual instructions. Besides, we've designed cycle training +for referring expressions to address the need for high-quality, paired +referring expression datasets for training large models in terms of both +quantity and quality. We evaluated ViLaM's exceptional performance on public +general datasets and further confirmed its generalizability on medical +datasets. Importantly, we've observed the model's impressive zero-shot learning +ability, indicating the potential future application of ViLaM in the medical +field. + +
+
+
+
+
+ + ☆ Overcoming Pathology Image Data Deficiency: Generating Images from + Pathological Transformation Process + + +
+ Histopathology serves as the gold standard for medical diagnosis but faces +application limitations due to the shortage of medical resources. Leveraging +deep learning, computer-aided diagnosis has the potential to alleviate the +pathologist scarcity and provide timely clinical analysis. However, developing +a reliable model generally necessitates substantial data for training, which is +challenging in the pathology field. In response, we propose an adaptive +depth-controlled bidirectional diffusion (ADBD) network for image data +generation. This domain-migration approach can work with a small training set +and overcomes diffusion overfitting through source information guidance. +Specifically, we developed a hybrid attention strategy to blend global and +local attention priorities, which guides the bidirectional diffusion and +ensures the migration success. In addition, we developed the adaptive +depth-controlled strategy to simulate physiological transformations, capable of +yielding unlimited cross-domain intermediate images with corresponding soft +labels. ADBD is effective for overcoming pathological image data deficiency and +can support further pathology-related research. + +
+
+
+
+
+ + ☆ ABFL: Angular Boundary Discontinuity Free Loss for Arbitrary Oriented + Object Detection in Aerial Images + + +
+ Arbitrary oriented object detection (AOOD) in aerial images is a highly +challenging task that has attracted wide attention and plays an important role +in many scenarios. The core of AOOD involves the representation, encoding, and +feature augmentation of oriented bounding-boxes (Bboxes). Existing methods lack +intuitive modeling of angle difference measurement in oriented Bbox +representations. Oriented Bboxes under different representations exhibit +rotational symmetry with varying periods due to angle periodicity. The angular +boundary discontinuity (ABD) problem at periodic boundary positions is caused +by rotational symmetry in measuring angular differences. Moreover, existing +methods rely on additional encoding-decoding structures for oriented Bboxes. +In this paper, we design an angular boundary free loss (ABFL) based on the von +Mises distribution. The ABFL aims to solve the ABD problem when detecting +oriented objects. Specifically, ABFL proposes to treat angles as circular data +rather than linear data when measuring angle differences, aiming to introduce +angle periodicity to alleviate the ABD problem and improve the accuracy of +angle difference measurement. In addition, ABFL provides a simple and effective +solution for various periodic boundary discontinuities caused by rotational +symmetry in AOOD tasks, as it does not require additional encoding-decoding +structures for oriented Bboxes. Extensive experiments on the DOTA and HRSC2016 +datasets show that the proposed ABFL loss outperforms some state-of-the-art +methods focused on addressing the ABD problem. + +
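As a rough illustration of the circular-data idea (not the paper's exact ABFL formulation), the snippet below contrasts a naive L1 angle error with a von Mises-style periodic penalty kappa * (1 - cos(dtheta)), which stays small across the +-pi boundary.

```python
# Illustrative circular angle loss: unlike L1/L2 on raw angles, the periodic
# penalty does not blow up when prediction and target sit on opposite sides
# of a periodic boundary (e.g. -pi vs. +pi).
import torch


def circular_angle_loss(pred: torch.Tensor, target: torch.Tensor, kappa: float = 1.0) -> torch.Tensor:
    """pred, target: angles in radians; returns the mean periodic penalty."""
    return (kappa * (1.0 - torch.cos(pred - target))).mean()


if __name__ == "__main__":
    pred = torch.tensor([3.10])     # just below +pi
    target = torch.tensor([-3.10])  # just above -pi, physically almost identical
    print(float(torch.abs(pred - target).mean()))    # ~6.2: L1 sees a huge error
    print(float(circular_angle_loss(pred, target)))  # ~0.003: circular loss treats them as near-identical
```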
+
+
+
+
+ + ☆ Challenges in Video-Based Infant Action Recognition: A Critical + Examination of the State of the Art + + +
+ Automated human action recognition, a burgeoning field within computer +vision, boasts diverse applications spanning surveillance, security, +human-computer interaction, tele-health, and sports analysis. Precise action +recognition in infants serves a multitude of pivotal purposes, encompassing +safety monitoring, developmental milestone tracking, early intervention for +developmental delays, fostering parent-infant bonds, advancing computer-aided +diagnostics, and contributing to the scientific comprehension of child +development. This paper delves into the intricacies of infant action +recognition, a domain that has remained relatively uncharted despite the +accomplishments in adult action recognition. In this study, we introduce a +groundbreaking dataset called ``InfActPrimitive'', encompassing five +significant infant milestone action categories, and we incorporate specialized +preprocessing for infant data. We conducted an extensive comparative analysis +employing cutting-edge skeleton-based action recognition models using this +dataset. Our findings reveal that, although the PoseC3D model achieves the +highest accuracy at approximately 71%, the remaining models struggle to +accurately capture the dynamics of infant actions. This highlights a +substantial knowledge gap between infant and adult action recognition domains +and the urgent need for data-efficient pipeline models. + +
+
+
+
+
+ + ☆ Instance-aware 3D Semantic Segmentation powered by Shape Generators and + Classifiers + + +
+ Existing 3D semantic segmentation methods rely on point-wise or voxel-wise +feature descriptors to output segmentation predictions. However, these +descriptors are often supervised at the point or voxel level, leading to +segmentation models that can behave poorly at the instance level. In this +paper, we propose a novel instance-aware approach for 3D semantic segmentation. +Our method combines several geometry processing tasks supervised at +instance-level to promote the consistency of the learned feature representation. +Specifically, our method uses shape generators and shape classifiers to perform +shape reconstruction and classification tasks for each shape instance. This +enforces the feature representation to faithfully encode both structural and +local shape information, with an awareness of shape instances. In the +experiments, our method significantly outperforms existing approaches in 3D +semantic segmentation on several public benchmarks, such as Waymo Open Dataset, +SemanticKITTI and ScanNetV2. + +
+
+
+
+
+ + ☆ Procedural Generation of Grain Orientations using the Wave Function + Collapse Algorithm + + +
+ Statistics of grain sizes and orientations in metals correlate to the +material's mechanical properties. Reproducing representative volume elements +for further analysis of deformation and failure in metals, like 316L stainless +steel, is particularly important due to their wide use in manufacturing goods +today. Two approaches, initially created for video games, were considered for +the procedural generation of representative grain microstructures. The first is +the Wave Function Collapse (WFC) algorithm, and the second is constraint +propagation and probabilistic inference through MarkovJunior, a free and +open-source software package. This study aimed to investigate these two +algorithms' effectiveness in using reference electron backscatter diffraction +(EBSD) maps and recreating a statistically similar one that could be used in +further research. It utilized two stainless steel EBSD maps as references to +test both algorithms. First, the WFC algorithm proved too restrictive and was +thus incapable of producing images that resembled EBSD maps. The second, +MarkovJunior, was much more effective in creating a Voronoi tessellation that +could be used to create an EBSD map in Python. When comparing the reference and +the generated EBSD maps, we discovered that the orientation and volume +fractions were extremely similar. We conclude that MarkovJunior is an effective +machine learning tool that can reproduce representative grain microstructures. + +
+
+ comment: 6 pages, 18 figures +
+
+
+
+
+ + ☆ Boosting Audio-visual Zero-shot Learning with Large Language Models + + +
+ Audio-visual zero-shot learning aims to recognize unseen categories based on +paired audio-visual sequences. Recent methods mainly focus on learning aligned +and discriminative multi-modal features to boost generalization towards unseen +categories. However, these approaches ignore the obscure action concepts in +category names and may inevitably introduce complex network structures with +difficult training objectives. In this paper, we propose a simple yet effective +framework named Knowledge-aware Distribution Adaptation (KDA) to help the model +better grasp the novel action contents with an external knowledge base. +Specifically, we first propose using large language models to generate rich +descriptions from category names, which leads to a better understanding of +unseen categories. Additionally, we propose a distribution alignment loss as +well as a knowledge-aware adaptive margin loss to further improve the +generalization ability towards unseen categories. Extensive experimental +results demonstrate that our proposed KDA can outperform state-of-the-art +methods on three popular audio-visual zero-shot learning datasets. Our code +will be available at \url{https://github.com/chenhaoxing/KDA}. + +
+
+
+
+
+ + ☆ Virtual Home Staging: Inverse Rendering and Editing an Indoor Panorama + under Natural Illumination + + +
+ We propose a novel inverse rendering method that enables the transformation +of existing indoor panoramas with new indoor furniture layouts under natural +illumination. To achieve this, we captured indoor HDR panoramas along with +real-time outdoor hemispherical HDR photographs. Indoor and outdoor HDR images +were linearly calibrated with measured absolute luminance values for accurate +scene relighting. Our method consists of three key components: (1) panoramic +furniture detection and removal, (2) automatic floor layout design, and (3) +global rendering with scene geometry, new furniture objects, and a real-time +outdoor photograph. We demonstrate the effectiveness of our workflow in +rendering indoor scenes under different outdoor illumination conditions. +Additionally, we contribute a new calibrated HDR (Cali-HDR) dataset that +consists of 137 calibrated indoor panoramas and their associated outdoor +photographs. The source code and dataset are available: +https://github.com/Gzhji/Cali-HDR-Dataset. + +
+
+
+
+
+ + ☆ Novel OCT mosaicking pipeline with Feature- and Pixel-based registration + + +
+ High-resolution Optical Coherence Tomography (OCT) images are crucial for +ophthalmology studies but are limited by their relatively narrow field of view +(FoV). Image mosaicking is a technique for aligning multiple overlapping images +to obtain a larger FoV. Current mosaicking pipelines often struggle with +substantial noise and considerable displacement between the input sub-fields. +In this paper, we propose a versatile pipeline for stitching multi-view +OCT/OCTA \textit{en face} projection images. Our method combines the strengths +of learning-based feature matching and robust pixel-based registration to align +multiple images effectively. Furthermore, we advance the application of a +trained foundational model, Segment Anything Model (SAM), to validate +mosaicking results in an unsupervised manner. The efficacy of our pipeline is +validated using an in-house dataset and a large public dataset, where our +method shows superior performance in terms of both accuracy and computational +efficiency. We also made our evaluation tool for image mosaicking and the +corresponding pipeline publicly available at +\url{https://github.com/MedICL-VU/OCT-mosaicking}. + +
+
+
+
+
+ + ☆ Camera-Independent Single Image Depth Estimation from Defocus Blur + + +
+ Monocular depth estimation is an important step in many downstream tasks in +machine vision. We address the topic of estimating monocular depth from defocus +blur, which can yield more accurate results than semantic-based depth +estimation methods. Existing monocular depth-from-defocus techniques are +sensitive to the particular camera with which the images are taken. Using +optical physics equations, we show how several camera-related parameters affect +the defocus blur, making depth estimates dependent on these parameters. We +propose a simple correction procedure that alleviates this problem and does not +require any retraining of the original model. We created a synthetic dataset +which can be used to test the camera-independent performance of depth from +defocus blur models. We evaluate our model on both synthetic and real datasets +(DDFF12 and NYU depth V2) obtained with different cameras and show that our +method is significantly more robust to changes of camera. Code: +https://github.com/sleekEagle/defocus_camind.git + +
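For intuition about this camera dependence, the snippet below evaluates the standard thin-lens circle-of-confusion relation (textbook optics, not necessarily the exact equations used in the paper) for two hypothetical cameras viewing the same object at the same depth.

```python
# Standard thin-lens circle-of-confusion relation:
#   c = A * |d - d_f| / d * f / (d_f - f),   with aperture diameter A = f / N
# where f is focal length, N the f-number, d_f the in-focus distance and d the
# object distance. Two cameras observing the same depth can therefore produce
# very different blur sizes, which is why uncorrected depth-from-defocus
# models are camera-dependent.


def blur_diameter(d: float, focal_length: float, f_number: float, focus_dist: float) -> float:
    """Blur-circle diameter on the sensor (all lengths in the same units, e.g. meters)."""
    aperture = focal_length / f_number
    return aperture * abs(d - focus_dist) / d * focal_length / (focus_dist - focal_length)


if __name__ == "__main__":
    # Same object at 3 m, same focus distance of 1.5 m, two different lenses.
    print(blur_diameter(3.0, 0.050, 1.8, 1.5))  # 50 mm f/1.8: ~0.48 mm blur circle
    print(blur_diameter(3.0, 0.026, 2.2, 1.5))  # 26 mm f/2.2 (phone-like): ~0.10 mm
```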
+
+
+
+
+ + ☆ Unsupervised Multimodal Surface Registration with Geometric Deep + Learning + + +
+ This paper introduces GeoMorph, a novel geometric deep-learning framework +designed for image registration of cortical surfaces. The registration process +consists of two main steps. First, independent feature extraction is performed +on each input surface using graph convolutions, generating low-dimensional +feature representations that capture important cortical surface +characteristics. Subsequently, features are registered in a deep-discrete +manner to optimize the overlap of common structures across surfaces by learning +displacements of a set of control points. To ensure smooth and biologically +plausible deformations, we impose regularization through a deep conditional +random field realized as a recurrent neural network. Experimental results +demonstrate that GeoMorph surpasses existing deep-learning methods by achieving +improved alignment with smoother deformations. Furthermore, GeoMorph exhibits +competitive performance compared to classical frameworks. Such versatility and +robustness suggest strong potential for various neuroscience applications. + +
+
+
+
+
+ + ☆ Attention: Large Multimodal Model is Watching your Geo-privacy + + +
+ Geographic privacy, a crucial aspect of personal security, often goes +unnoticed in daily activities. This paper addresses the underestimation of this +privacy in the context of increasing online data sharing and the advancements +in information gathering technologies. With the surge in the use of Large +Multimodal Models, such as GPT-4, for Open Source Intelligence (OSINT), the +potential risks associated with geographic privacy breaches have intensified. +This study highlights the criticality of these developments, focusing on their +implications for individual privacy. The primary objective is to demonstrate +the capabilities of advanced AI tools, specifically a GPT-4 based model named +"Dr. Watson," in identifying and potentially compromising geographic privacy +through online shared content. We developed "Dr. Watson" to analyze and extract +geographic information from publicly available data sources. The study involved +five experimental cases, each offering different perspectives on the tool's +application in extracting precise location data from partial images and social +media content. The experiments revealed that "Dr. Watson" could successfully +identify specific geographic details, thereby exposing the vulnerabilities in +current geo-privacy measures. These findings underscore the ease with which +geographic information can be unintentionally disclosed. The paper concludes +with a discussion on the broader implications of these findings for individuals +and the community at large. It emphasizes the urgency for enhanced awareness +and protective measures against geo-privacy leakage in the era of advanced AI +and widespread social media usage. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Image-Based Soil Organic Carbon Remote Sensing from Satellite Images + with Fourier Neural Operator and Structural Similarity + + +
+ Soil organic carbon (SOC) sequestration is the transfer and storage of +atmospheric carbon dioxide in soils, which plays an important role in climate +change mitigation. SOC concentration can be improved by proper land use, thus +it is beneficial if SOC can be estimated at a regional or global scale. As +multispectral satellite data can provide SOC-related information such as +vegetation and soil properties at a global scale, estimation of SOC through +satellite data has been explored as an alternative to manual soil sampling. +Although existing studies show promising results, they are mainly based on +pixel-based approaches with traditional machine learning methods, and +convolutional neural networks (CNNs) are uncommon. To study the use of CNNs on +SOC remote sensing, here we propose the FNO-DenseNet based on the Fourier +neural operator (FNO). By combining the advantages of the FNO and DenseNet, the +FNO-DenseNet outperformed the FNO in our experiments with hundreds of times +fewer parameters. The FNO-DenseNet also outperformed a pixel-based random +forest by 18% in the mean absolute percentage error. + +
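The core FNO building block referenced above can be sketched as a spectral convolution that mixes channels within a truncated set of Fourier modes. The snippet below is a minimal PyTorch illustration; mode counts and shapes are arbitrary, only one frequency corner is kept, and the DenseNet-style connections of FNO-DenseNet are not shown.

```python
# Minimal sketch of an FNO-style spectral convolution layer (illustrative,
# not the paper's FNO-DenseNet implementation).
import torch
import torch.nn as nn


class SpectralConv2d(nn.Module):
    def __init__(self, in_ch: int, out_ch: int, modes1: int, modes2: int):
        super().__init__()
        self.modes1, self.modes2 = modes1, modes2
        scale = 1.0 / (in_ch * out_ch)
        self.weight = nn.Parameter(
            scale * torch.randn(in_ch, out_ch, modes1, modes2, dtype=torch.cfloat)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (B, C, H, W), e.g. multispectral bands on a regular grid
        B, C, H, W = x.shape
        x_ft = torch.fft.rfft2(x)                       # (B, C, H, W//2 + 1)
        out_ft = torch.zeros(
            B, self.weight.shape[1], H, W // 2 + 1, dtype=torch.cfloat, device=x.device
        )
        # Mix channels in the retained low-frequency modes only.
        out_ft[:, :, : self.modes1, : self.modes2] = torch.einsum(
            "bixy,ioxy->boxy", x_ft[:, :, : self.modes1, : self.modes2], self.weight
        )
        return torch.fft.irfft2(out_ft, s=(H, W))       # back to (B, out_ch, H, W)


if __name__ == "__main__":
    layer = SpectralConv2d(in_ch=4, out_ch=8, modes1=12, modes2=12)
    x = torch.randn(2, 4, 64, 64)
    print(layer(x).shape)                               # torch.Size([2, 8, 64, 64])
```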
+
+ comment: This paper was accepted by the 2023 IEEE International Geoscience and + Remote Sensing Symposium (IGARSS 2023) +
+
+
+
+
+ + ☆ 3D Compression Using Neural Fields + + +
+ Neural Fields (NFs) have gained momentum as a tool for compressing various +data modalities - e.g. images and videos. This work leverages previous advances +and proposes a novel NF-based compression algorithm for 3D data. We derive two +versions of our approach - one tailored to watertight shapes based on Signed +Distance Fields (SDFs) and, more generally, one for arbitrary non-watertight +shapes using Unsigned Distance Fields (UDFs). We demonstrate that our method +excels at geometry compression on 3D point clouds as well as meshes. Moreover, +we show that, due to the NF formulation, it is straightforward to extend our +compression algorithm to compress both geometry and attribute (e.g. color) of +3D data. + +
+
+
+
+
+ + ☆ AI for Agriculture: the Comparison of Semantic Segmentation Methods for + Crop Mapping with Sentinel-2 Imagery + + +
+ Crop mapping is one of the most common tasks in artificial intelligence for +agriculture due to higher food demands from a growing population and increased +awareness of climate change. In the case of vineyards, the texture is very +important for crop segmentation: with higher resolution satellite imagery the +texture is easily detected by the majority of state-of-the-art algorithms. +However, this task becomes increasingly more difficult as the resolution of +satellite imagery decreases and the information about the texture becomes +unavailable. In this paper we aim to explore the main machine learning methods +that can be used with freely available satellite imagery and discuss how and +when they can be applied to the vineyard segmentation problem. We assess the +effectiveness of various widely-used machine learning techniques and offer +guidance on selecting the most suitable model for specific scenarios. + +
+
+
+
+
+ + ☆ FollowMe: a Robust Person Following Framework Based on Re-Identification + and Gestures + + +
+ Human-robot interaction (HRI) has become a crucial enabler in domestic and +industrial settings for facilitating operational flexibility. When it comes to +mobile collaborative robots, this flexibility can be further increased due to +the autonomous mobility and navigation capacity of the robotic agents, +expanding their workspace and, consequently, the personalized assistance they +can provide to human operators. This, however, requires that the robot be +capable of detecting and identifying its human counterpart in all stages of the +collaborative task, and in particular while following a human in crowded +workplaces. To respond to this need, we developed a unified perception and +navigation framework, which enables the robot to identify and follow a target +person using a combination of visual Re-Identification (Re-ID), hand gesture +detection, and collision-free navigation. The Re-ID module can autonomously +learn the features of a target person and use the acquired knowledge to +visually re-identify the target. The navigation stack is used to follow the +target while avoiding obstacles and other individuals in the environment. +Experiments are conducted with a few subjects in a laboratory setting where +some unknown dynamic obstacles are introduced. + +
+
+ comment: published in "2023 IEEE International Conference on Advanced Robotics + and Its Social Impacts (ARSO)" +
+
+
+
+
+ + ☆ SD-NAE: Generating Natural Adversarial Examples with Stable Diffusion + + +
+ Robustly evaluating deep learning image classifiers is challenging due to +some limitations of standard datasets. Natural Adversarial Examples (NAEs), +arising naturally from the environment and capable of deceiving classifiers, +are instrumental in identifying vulnerabilities in trained models. Existing +works collect such NAEs by filtering from a huge set of real images, a process +that is passive and lacks control. In this work, we propose to actively +synthesize NAEs with the state-of-the-art Stable Diffusion. Specifically, our +method formulates a controlled optimization process, where we perturb the token +embedding that corresponds to a specified class to synthesize NAEs. The +generation is guided by the gradient of loss from the target classifier so that +the created image closely mimics the ground-truth class yet fools the +classifier. Named SD-NAE (Stable Diffusion for Natural Adversarial Examples), +our innovative method is effective in producing valid and useful NAEs, which is +demonstrated through a meticulously designed experiment. Our work thereby +provides a valuable method for obtaining challenging evaluation data, which in +turn can potentially advance the development of more robust deep learning +models. Code is available at https://github.com/linyueqian/SD-NAE. + +
+
+
+
+
+ + ☆ Robustifying Generalizable Implicit Shape Networks with a Tunable + Non-Parametric Model NeurIPS 2023 + + +
+ Feedforward generalizable models for implicit shape reconstruction from +unoriented point clouds present multiple advantages, including high performance +and inference speed. However, they still suffer from generalization issues, +ranging from underfitting the input point cloud, to misrepresenting samples +outside of the training data distribution, or with topologies unseen at +training. We propose here an efficient mechanism to remedy some of these +limitations at test time. We combine the inter-shape data prior of the network +with an intra-shape regularization prior of a Nystr\"om Kernel Ridge +Regression, that we further adapt by fitting its hyperparameters to the current +shape. The resulting shape function defined in a shape-specific Reproducing +Kernel Hilbert Space benefits from desirable stability and efficiency +properties and grants a shape adaptive expressiveness-robustness trade-off. We +demonstrate the improvement obtained through our method with respect to +baselines and the state-of-the-art using synthetic and real data. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ Innovative Horizons in Aerial Imagery: LSKNet Meets DiffusionDet for + Advanced Object Detection + + +
+ In the realm of aerial image analysis, object detection plays a pivotal role, +with significant implications for areas such as remote sensing, urban planning, +and disaster management. This study addresses the inherent challenges in this +domain, notably the detection of small objects, managing densely packed +elements, and accounting for diverse orientations. We present an in-depth +evaluation of an object detection model that integrates the Large Selective +Kernel Network (LSKNet) as its backbone with the DiffusionDet head, utilizing +the iSAID dataset for empirical analysis. Our approach encompasses the +introduction of novel methodologies and extensive ablation studies. These +studies critically assess various aspects such as loss functions, box +regression techniques, and classification strategies to refine the model's +precision in object detection. The paper details the experimental application +of the LSKNet backbone in synergy with the DiffusionDet head, a combination +tailored to meet the specific challenges in aerial image object detection. The +findings of this research indicate a substantial enhancement in the model's +performance, especially in the accuracy-time tradeoff. The proposed model +achieves a mean average precision (mAP) of approximately 45.7%, which is a +significant improvement, outperforming the RCNN model by 4.7% on the same +dataset. This advancement underscores the effectiveness of the proposed +modifications and sets a new benchmark in aerial image analysis, paving the way +for more accurate and efficient object detection methodologies. The code is +publicly available at https://github.com/SashaMatsun/LSKDiffDet + +
+
+
+
+
+ + ☆ SPOT! Revisiting Video-Language Models for Event Understanding + + +
+ Understanding videos is an important research topic for multimodal learning. +Leveraging large-scale datasets of web-crawled video-text pairs as weak +supervision has become a pre-training paradigm for learning joint +representations and showcased remarkable potential in video understanding +tasks. However, videos can be multi-event and multi-grained, while these +video-text pairs usually contain only broad-level video captions. This raises a +question: with such weak supervision, can video representation in +video-language models gain the ability to distinguish even factual +discrepancies in textual description and understand fine-grained events? To +address this, we introduce SPOT Prober to benchmark existing video-language +models' capacity for distinguishing event-level discrepancies as an indicator +of models' event understanding ability. Our approach involves extracting events +from videos as structured tuples and generating false event tuples by +systematically manipulating tuple components. We reevaluate the existing +video-language models with these positive and negative captions and find they +fail to distinguish most of the manipulated events. Based on our findings, we +propose to plug in these manipulated event captions as hard negative samples +and find them effective in enhancing models for event understanding. + +
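As a toy illustration of the negative-caption construction (the actual tuple schema and manipulation rules of SPOT Prober are not reproduced here; the fields and vocabulary below are hypothetical):

```python
# Toy sketch: build a hard-negative event by replacing one component of an
# extracted event tuple with a different value from a small vocabulary.
import random

EVENT_FIELDS = ("subject", "verb", "object", "attribute")  # illustrative schema


def make_negative(event: dict, vocab: dict, rng: random.Random) -> dict:
    """Return a copy of `event` with one randomly chosen field replaced."""
    neg = dict(event)
    field = rng.choice(EVENT_FIELDS)
    candidates = [w for w in vocab[field] if w != event[field]]
    neg[field] = rng.choice(candidates)
    return neg


if __name__ == "__main__":
    rng = random.Random(0)
    event = {"subject": "a chef", "verb": "slices", "object": "an onion", "attribute": "quickly"}
    vocab = {
        "subject": ["a chef", "a child"],
        "verb": ["slices", "washes"],
        "object": ["an onion", "a carrot"],
        "attribute": ["quickly", "carefully"],
    }
    print(make_negative(event, vocab, rng))   # one field is swapped, the rest kept
```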
+
+ comment: 9 pages +
+
+
+
+
+ + ☆ Attention Deficit is Ordered! Fooling Deformable Vision Transformers + with Collaborative Adversarial Patches + + +
+ The latest generation of transformer-based vision models has proven to be +superior to Convolutional Neural Network (CNN)-based models across several +vision tasks, largely attributed to their remarkable prowess in relation +modeling. Deformable vision transformers significantly reduce the quadratic +complexity of modeling attention by using sparse attention structures, enabling +them to be used in larger scale applications such as multi-view vision systems. +Recent work demonstrated adversarial attacks against transformers; we show that +these attacks do not transfer to deformable transformers due to their sparse +attention structure. Specifically, attention in deformable transformers is +modeled using pointers to the most relevant other tokens. In this work, we +contribute for the first time adversarial attacks that manipulate the attention +of deformable transformers, distracting them to focus on irrelevant parts of +the image. We also develop new collaborative attacks where a source patch +manipulates attention to point to a target patch that adversarially attacks the +system. In our experiments, we find that patching only 1% of the input field's +area can drive AP to 0%. We also show that the attacks provide substantial +versatility to support different attacker scenarios because of their ability to +redirect attention under the attacker's control. + +
+
+ comment: 9 pages, 10 figures +
+
+
+
+
+ + ☆ Q-Seg: Quantum Annealing-based Unsupervised Image Segmentation + + +
+ In this study, we present Q-Seg, a novel unsupervised image segmentation +method based on quantum annealing, tailored for existing quantum hardware. We +formulate the pixel-wise segmentation problem, which assimilates spectral and +spatial information of the image, as a graph-cut optimization task. Our method +efficiently leverages the interconnected qubit topology of the D-Wave Advantage +device, offering superior scalability over existing quantum approaches and +outperforming state-of-the-art classical methods. Our empirical evaluations on +synthetic datasets reveal that Q-Seg offers better runtime performance against +the classical optimizer Gurobi. Furthermore, we evaluate our method on +segmentation of Earth Observation images, an area of application where the +amount of labeled data is usually very limited. In this case, Q-Seg +demonstrates near-optimal results in flood mapping detection with respect to +classical supervised state-of-the-art machine learning methods. Also, Q-Seg +provides enhanced segmentation for forest coverage compared to existing +annotated masks. Thus, Q-Seg emerges as a viable alternative for real-world +applications using available quantum hardware, particularly in scenarios where +the lack of labeled data and computational runtime are critical. + +
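To illustrate the kind of formulation this implies (the generic min-cut-to-QUBO construction, not Q-Seg's exact encoding), the sketch below builds a small QUBO for a four-pixel graph and minimizes it by brute force instead of on a quantum annealer, so it stays self-contained.

```python
# Graph cut as a QUBO: x^T Q x = sum_i unary_i * x_i + sum_(i,j) w_ij * (x_i XOR x_j),
# using x_i XOR x_j = x_i + x_j - 2 * x_i * x_j for binary variables.
import itertools
import numpy as np


def graph_cut_qubo(n_nodes, edges, unary):
    Q = np.zeros((n_nodes, n_nodes))
    Q[np.diag_indices(n_nodes)] += unary
    for i, j, w in edges:
        Q[i, i] += w
        Q[j, j] += w
        Q[i, j] -= 2.0 * w
    return Q


def brute_force_minimize(Q):
    n = Q.shape[0]
    best_x, best_e = None, float("inf")
    for bits in itertools.product([0, 1], repeat=n):
        x = np.array(bits, dtype=float)
        e = float(x @ Q @ x)
        if e < best_e:
            best_x, best_e = x, e
    return best_x, best_e


if __name__ == "__main__":
    # 4 "pixels": the first two prefer label 0, the last two prefer label 1,
    # with smoothness edges encouraging neighbours to agree.
    unary = np.array([+1.0, +1.0, -1.0, -1.0])
    edges = [(0, 1, 0.5), (1, 2, 0.5), (2, 3, 0.5)]
    x, e = brute_force_minimize(graph_cut_qubo(4, edges, unary))
    print(x, e)   # expected labelling [0, 0, 1, 1] with energy -1.5
```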
+
+ comment: 12 pages, 9 figures, 1 table +
+
+
+
+
+ + ☆ Diffusion Model Alignment Using Direct Preference Optimization + + +
+ Large language models (LLMs) are fine-tuned using human comparison data with +Reinforcement Learning from Human Feedback (RLHF) methods to make them better +aligned with users' preferences. In contrast to LLMs, human preference learning +has not been widely explored in text-to-image diffusion models; the best +existing approach is to fine-tune a pretrained model using carefully curated +high quality images and captions to improve visual appeal and text alignment. +We propose Diffusion-DPO, a method to align diffusion models to human +preferences by directly optimizing on human comparison data. Diffusion-DPO is +adapted from the recently developed Direct Preference Optimization (DPO), a +simpler alternative to RLHF which directly optimizes a policy that best +satisfies human preferences under a classification objective. We re-formulate +DPO to account for a diffusion model notion of likelihood, utilizing the +evidence lower bound to derive a differentiable objective. Using the Pick-a-Pic +dataset of 851K crowdsourced pairwise preferences, we fine-tune the base model +of the state-of-the-art Stable Diffusion XL (SDXL)-1.0 model with +Diffusion-DPO. Our fine-tuned base model significantly outperforms both base +SDXL-1.0 and the larger SDXL-1.0 model consisting of an additional refinement +model in human evaluation, improving visual appeal and prompt alignment. We +also develop a variant that uses AI feedback and has comparable performance to +training on human preferences, opening the door for scaling of diffusion model +alignment methods. + +
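A simplified sketch of what such a DPO-style objective looks like when adapted to diffusion denoising errors is shown below; timestep weighting and other details of Diffusion-DPO are omitted, so this should be read as illustrative rather than the paper's exact loss.

```python
# Simplified DPO-style preference loss on denoising errors: the policy model
# should improve over the reference model more on the preferred (winning)
# image than on the losing one. Timestep weighting is dropped for clarity.
import torch
import torch.nn.functional as F


def diffusion_dpo_loss(err_theta_w, err_ref_w, err_theta_l, err_ref_l, beta: float = 10.0):
    """Each argument is a per-sample squared denoising error, shape (B,)."""
    inside = (err_theta_w - err_ref_w) - (err_theta_l - err_ref_l)
    return -F.logsigmoid(-beta * inside).mean()


if __name__ == "__main__":
    B = 4
    err_ref_w, err_ref_l = torch.rand(B), torch.rand(B)
    # A policy that denoises the winner better and the loser worse than the
    # reference gets a small loss.
    loss = diffusion_dpo_loss(err_ref_w - 0.1, err_ref_w, err_ref_l + 0.1, err_ref_l)
    print(float(loss))
```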
+
+
+
+
+ + ♻ ☆ MaGIC: Multi-modality Guided Image Completion + + +
+ Vanilla image completion approaches exhibit sensitivity to large missing +regions, attributed to the limited availability of reference information for +plausible generation. To mitigate this, existing methods incorporate an extra +cue as guidance for image completion. Despite improvements, these approaches +are often restricted to employing a single modality (e.g., segmentation or +sketch maps), which lacks scalability in leveraging multi-modality for more +plausible completion. In this paper, we propose a novel, simple yet effective +method for Multi-modal Guided Image Completion, dubbed MaGIC, which not only +supports a wide range of single-modality guidance signals (e.g., text, canny +edge, sketch, segmentation, depth, and pose), but also adapts to arbitrarily +customized combinations of these modalities (i.e., arbitrary multi-modality) +for image completion. For building MaGIC, we first introduce a modality-specific +conditional U-Net (MCU-Net) that injects a single-modal signal into a U-Net +denoiser for single-modal guided image completion. Then, we devise a consistent +modality blending (CMB) method to leverage modality signals encoded in multiple +learned MCU-Nets through gradient guidance in latent space. Our CMB is +training-free, thereby avoiding the cumbersome joint re-training of different +modalities, which is the key to MaGIC's exceptional flexibility in accommodating +new modalities for completion. Experiments show the superiority of MaGIC over +state-of-the-art methods and its generalization to various completion tasks. +Our project with code and models is available at yeates.github.io/MaGIC-Page/. + +
+
+ comment: 23 pages, 15 figures +
+
+
+
+
+ + ♻ ☆ GEFF: Improving Any Clothes-Changing Person ReID Model using Gallery + Enrichment with Face Features + + +
+ In the Clothes-Changing Re-Identification (CC-ReID) problem, given a query +sample of a person, the goal is to determine the correct identity based on a +labeled gallery in which the person appears in different clothes. Several +models tackle this challenge by extracting clothes-independent features. +However, the performance of these models is still lower for the +clothes-changing setting compared to the same-clothes setting in which the +person appears with the same clothes in the labeled gallery. As +clothing-related features are often dominant features in the data, we propose a +new process we call Gallery Enrichment, to utilize these features. In this +process, we enrich the original gallery by adding to it query samples based on +their face features, using an unsupervised algorithm. Additionally, we show +that combining ReID and face feature extraction modules alongside an enriched +gallery results in a more accurate ReID model, even for query samples with new +outfits that do not include faces. Moreover, we claim that existing CC-ReID +benchmarks do not fully represent real-world scenarios, and propose a new video +CC-ReID dataset called 42Street, based on a theater play that includes crowded +scenes and numerous clothes changes. When applied to multiple ReID models, our +method (GEFF) achieves an average improvement of 33.5% and 6.7% in the Top-1 +clothes-changing metric on the PRCC and LTCC benchmarks. Combined with the +latest ReID models, our method achieves new SOTA results on the PRCC, LTCC, +CCVID, LaST and VC-Clothes benchmarks and the proposed 42Street dataset. + +
+
+
+
+
+ + ♻ ☆ Task Arithmetic in the Tangent Space: Improved Editing of Pre-Trained + Models + + +
+ Task arithmetic has recently emerged as a cost-effective and scalable +approach to edit pre-trained models directly in weight space: By adding the +fine-tuned weights of different tasks, the model's performance can be improved +on these tasks, while negating them leads to task forgetting. Yet, our +understanding of the effectiveness of task arithmetic and its underlying +principles remains limited. We present a comprehensive study of task arithmetic +in vision-language models and show that weight disentanglement is the crucial +factor that makes it effective. This property arises during pre-training and +manifests when distinct directions in weight space govern separate, localized +regions in function space associated with the tasks. Notably, we show that +fine-tuning models in their tangent space by linearizing them amplifies weight +disentanglement. This leads to substantial performance improvements across +multiple task arithmetic benchmarks and diverse models. Building on these +findings, we provide theoretical and empirical analyses of the neural tangent +kernel (NTK) of these models and establish a compelling link between task +arithmetic and the spatial localization of the NTK eigenfunctions. Overall, our +work uncovers novel insights into the fundamental mechanisms of task arithmetic +and offers a more reliable and effective approach to edit pre-trained models +through the NTK linearization. + +
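A minimal sketch of the basic task-arithmetic operation described above (plain weight-space addition; the paper's tangent-space/NTK linearization refinement is not shown) might look like:

```python
# Task arithmetic sketch: task vector = (fine-tuned - pre-trained) weights;
# add a scaled sum of task vectors to the pre-trained weights to gain tasks,
# or subtract to forget them.
from typing import Dict, List
import torch


def task_vector(pretrained: Dict[str, torch.Tensor], finetuned: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
    return {k: finetuned[k] - pretrained[k] for k in pretrained}


def apply_task_arithmetic(
    pretrained: Dict[str, torch.Tensor],
    task_vectors: List[Dict[str, torch.Tensor]],
    alpha: float = 0.4,
) -> Dict[str, torch.Tensor]:
    edited = {k: v.clone() for k, v in pretrained.items()}
    for tv in task_vectors:
        for k in edited:
            edited[k] += alpha * tv[k]       # use a negative alpha to "forget" a task
    return edited


if __name__ == "__main__":
    torch.manual_seed(0)
    pre = {"w": torch.randn(3, 3)}
    ft_a = {"w": pre["w"] + 0.1}
    ft_b = {"w": pre["w"] - 0.2}
    new_sd = apply_task_arithmetic(pre, [task_vector(pre, ft_a), task_vector(pre, ft_b)])
    print(new_sd["w"] - pre["w"])            # 0.4 * (0.1 - 0.2) = -0.04 everywhere
```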
+
+
+
+
+ + ♻ ☆ Learning to Aggregate Multi-Scale Context for Instance Segmentation in + Remote Sensing Images + + +
+ The task of instance segmentation in remote sensing images, aiming at +performing per-pixel labeling of objects at instance level, is of great +importance for various civil applications. Despite previous successes, most +existing instance segmentation methods designed for natural images encounter +sharp performance degradations when they are directly applied to top-view +remote sensing images. Through careful analysis, we observe that the challenges +mainly come from the lack of discriminative object features due to severe scale +variations, low contrasts, and clustered distributions. In order to address +these problems, a novel context aggregation network (CATNet) is proposed to +improve the feature extraction process. The proposed model exploits three +lightweight plug-and-play modules, namely dense feature pyramid network +(DenseFPN), spatial context pyramid (SCP), and hierarchical region of interest +extractor (HRoIE), to aggregate global visual context at feature, spatial, and +instance domains, respectively. DenseFPN is a multi-scale feature propagation +module that establishes more flexible information flows by adopting inter-level +residual connections, cross-level dense connections, and a feature re-weighting +strategy. Leveraging the attention mechanism, SCP further augments the features +by aggregating global spatial context into local regions. For each instance, +HRoIE adaptively generates RoI features for different downstream tasks. +Extensive evaluations of the proposed scheme on iSAID, DIOR, NWPU VHR-10, and +HRSID datasets demonstrate that the proposed approach outperforms +state-of-the-art methods under similar computational costs. Source code and +pre-trained models are available at https://github.com/yeliudev/CATNet. + +
+
+ comment: Accepted to IEEE Transactions on Neural Networks and Learning Systems + (TNNLS), 2023 +
+
+
+
+
+ + ♻ ☆ VeriCompress: A Tool to Streamline the Synthesis of Verified Robust + Compressed Neural Networks from Scratch + + +
+ AI's widespread integration has led to neural networks (NNs) deployment on +edge and similar limited-resource platforms for safety-critical scenarios. Yet, +NN's fragility raises concerns about reliable inference. Moreover, constrained +platforms demand compact networks. This study introduces VeriCompress, a tool +that automates the search and training of compressed models with robustness +guarantees. These models are well-suited for safety-critical applications and +adhere to predefined architecture and size limitations, making them deployable +on resource-restricted platforms. The method trains models 2-3 times faster +than the state-of-the-art approaches, surpassing relevant baseline approaches +by average accuracy and robustness gains of 15.1 and 9.8 percentage points, +respectively. When deployed on a resource-restricted generic platform, these +models require 5-8 times less memory and 2-4 times less inference time than +models used in verified robustness literature. Our comprehensive evaluation +across various model architectures and datasets, including MNIST, CIFAR, SVHN, +and a relevant pedestrian detection dataset, showcases VeriCompress's capacity +to identify compressed verified robust models with reduced computation overhead +compared to current standards. This underscores its potential as a valuable +tool for end users, such as developers of safety-critical applications on edge +or Internet of Things platforms, empowering them to create suitable models for +safety-critical, resource-constrained platforms in their respective domains. + +
+
+ comment: 9 pages, 5 tables, 2 figures +
+
+
+
+
+ + ♻ ☆ Unveiling the Pitfalls of Knowledge Editing for Large Language Models + + +
+ As the cost associated with fine-tuning Large Language Models (LLMs) +continues to rise, recent research efforts have pivoted towards developing +methodologies to edit implicit knowledge embedded within LLMs. Yet a dark cloud +still lingers overhead: will knowledge editing trigger a butterfly effect? It +remains unclear whether knowledge editing might introduce side effects that +pose potential risks. This paper pioneers the investigation into the potential +pitfalls associated with knowledge editing for LLMs. To achieve this, we +introduce new benchmark datasets and propose innovative evaluation metrics. Our +results underline two pivotal concerns: (1) Knowledge Conflict: Editing groups +of facts that logically clash can magnify the inherent inconsistencies in +LLMs, a facet neglected by previous methods. (2) Knowledge Distortion: Altering +parameters with the aim of editing factual knowledge can irrevocably warp the +innate knowledge structure of LLMs. Experimental results vividly demonstrate +that knowledge editing might inadvertently cast a shadow of unintended +consequences on LLMs, which warrants attention and effort in future work. Code +is available at https://github.com/zjunlp/PitfallsKnowledgeEditing. + +
+
+ comment: Work in progress, add more experiments +
+
+
+
+
+ + ♻ ☆ An Automated Pipeline for Tumour-Infiltrating Lymphocyte Scoring in + Breast Cancer + + +
+ Tumour-infiltrating lymphocytes (TILs) are considered valuable prognostic +markers in both triple-negative and human epidermal growth factor receptor 2 +(HER2) positive breast cancer. In this study, we introduce an innovative deep +learning pipeline based on the Efficient-UNet architecture to predict the TILs +score for breast cancer whole-slide images (WSIs). We first segment tumour and +stromal regions in order to compute a tumour bulk mask. We then detect TILs +within the tumour-associated stroma, generating a TILs score by closely +mirroring the pathologist's workflow. Our method exhibits state-of-the-art +performance in tumour/stroma segmentation and TILs detection, as demonstrated +by internal cross-validation on the TiGER Challenge training dataset and +evaluation on the final leaderboards. Additionally, our TILs score proves +competitive in predicting survival outcomes within the same challenge, +underscoring the clinical relevance and potential of our automated TILs scoring +pipeline as a breast cancer prognostic tool. + +
+
+ comment: 5 pages, 1 figure, 2 tables +
+
+
+
+
+ + ♻ ☆ Using Scalable Computer Vision to Automate High-throughput Semiconductor + Characterization + + +
+ High-throughput materials synthesis methods have risen in popularity due to +their potential to accelerate the design and discovery of novel functional +materials, such as solution-processed semiconductors. After synthesis, key +material properties must be measured and characterized to validate discovery +and provide feedback to optimization cycles. However, with the boom in +development of high-throughput synthesis tools that champion production rates +up to $10^4$ samples per hour with flexible form factors, most sample +characterization methods are either slow (conventional rates of $10^1$ samples +per hour, approximately 1000x slower) or rigid (e.g., designed for +standard-size microplates), resulting in a bottleneck that impedes the +materials-design process. To overcome this challenge, we propose a set of +automated material property characterization (autocharacterization) tools that +leverage the adaptive, parallelizable, and scalable nature of computer vision +to accelerate the throughput of characterization by 85x compared to the +non-automated workflow. We demonstrate a generalizable composition mapping tool +for high-throughput synthesized binary material systems as well as two scalable +autocharacterization algorithms that (1) autonomously compute the band gap of +200 unique compositions in 6 minutes and (2) autonomously compute the degree of +degradation in 200 unique compositions in 20 minutes, generating ultra-high +compositional resolution trends of band gap and stability. We demonstrate that +the developed band gap and degradation detection autocharacterization methods +achieve 98.5% accuracy and 96.9% accuracy, respectively, on the +FA$_{1-x}$MA$_{x}$PbI$_3$, $0\leq x \leq 1$ perovskite semiconductor system. + +
+
+ comment: Manuscript 18 pages; Supplemental 20 pages +
+
+
+
+
+ + ♻ ☆ XPert: Peripheral Circuit & Neural Architecture Co-search for Area and + Energy-efficient Xbar-based Computing + + +
+ The hardware-efficiency and accuracy of Deep Neural Networks (DNNs) +implemented on In-memory Computing (IMC) architectures primarily depend on the +DNN architecture and the peripheral circuit parameters. It is therefore +essential to holistically co-search the network and peripheral parameters to +achieve optimal performance. To this end, we propose XPert, which co-searches +network architecture in tandem with peripheral parameters such as the type and +precision of analog-to-digital converters, crossbar column sharing and the +layer-specific input precision using an optimization-based design space +exploration. Compared to VGG16 baselines, XPert achieves 10.24x (4.7x) lower +EDAP, 1.72x (1.62x) higher TOPS/W, and 1.93x (3x) higher TOPS/mm2 at 92.46% +(56.7%) accuracy for the CIFAR10 (TinyImagenet) datasets. The code for this +paper is available at https://github.com/Intelligent-Computing-Lab-Yale/XPert. + +
+
+ comment: Accepted to Design and Automation Conference (DAC) +
+
+
+
+
+ + ♻ ☆ WEAR: An Outdoor Sports Dataset for Wearable and Egocentric Activity + Recognition + + +
+ Though research has shown the complementarity of camera- and inertial-based +data, datasets which offer both egocentric video and inertial-based sensor data +remain scarce. In this paper, we introduce WEAR, an outdoor sports dataset for +both vision- and inertial-based human activity recognition (HAR). The dataset +comprises data from 18 participants performing a total of 18 different workout +activities with untrimmed inertial (acceleration) and camera (egocentric video) +data recorded at 10 different outside locations. Unlike previous egocentric +datasets, WEAR provides a challenging prediction scenario marked by purposely +introduced activity variations as well as an overall small information overlap +across modalities. Benchmark results obtained using each modality separately +show that each modality offers complementary strengths and weaknesses in its +prediction performance. Further, in light of the recent success of temporal +action localization models following the architecture design of the +ActionFormer, we demonstrate their versatility by applying them in a plain +fashion using vision, inertial and combined (vision + inertial) features as +input. Results demonstrate both the applicability of vision-based temporal +action localization models to inertial data and the benefit of fusing both +modalities by means of simple concatenation, with the combined approach (vision ++ inertial features) producing the highest mean average precision and a +close-to-best F1-score. The dataset and code to reproduce experiments are +publicly available at https://mariusbock.github.io/wear/ + +
+
+ comment: 15 pages, 3 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Detect Every Thing with Few Examples + + +
+ Open-set object detection aims at detecting arbitrary categories beyond those +seen during training. Most recent advancements have adopted the open-vocabulary +paradigm, utilizing vision-language backbones to represent categories with +language. In this paper, we introduce DE-ViT, an open-set object detector that +employs vision-only DINOv2 backbones and learns new categories through example +images instead of language. To improve general detection ability, we transform +multi-classification tasks into binary classification tasks while bypassing +per-class inference, and propose a novel region propagation technique for +localization. We evaluate DE-ViT on open-vocabulary, few-shot, and one-shot +object detection benchmark with COCO and LVIS. For COCO, DE-ViT outperforms the +open-vocabulary SoTA by 6.9 AP50 and achieves 50 AP50 in novel classes. DE-ViT +surpasses the few-shot SoTA by 15 mAP on 10-shot and 7.2 mAP on 30-shot and +one-shot SoTA by 2.8 AP50. For LVIS, DE-ViT outperforms the open-vocabulary +SoTA by 2.2 mask AP and reaches 34.3 mask APr. Code is available at +https://github.com/mlzxy/devit. + +
+
+
+
+
+ + ♻ ☆ Differentially Private Optimizers Can Learn Adversarially Robust Models + + +
+ Machine learning models have shone in a variety of domains and attracted +increasing attention from both the security and the privacy communities. One +important yet worrying question is: Will training models under the differential +privacy (DP) constraint have an unfavorable impact on their adversarial +robustness? While previous works have postulated that privacy comes at the cost +of worse robustness, we give the first theoretical analysis to show that DP +models can indeed be robust and accurate, even sometimes more robust than their +naturally-trained non-private counterparts. We observe three key factors that +influence the privacy-robustness-accuracy tradeoff: (1) hyper-parameters for DP +optimizers are critical; (2) pre-training on public data significantly +mitigates the accuracy and robustness drop; (3) choice of DP optimizers makes a +difference. With these factors set properly, we achieve 90\% natural accuracy, +72\% robust accuracy ($+9\%$ than the non-private model) under $l_2(0.5)$ +attack, and 69\% robust accuracy ($+16\%$ than the non-private model) with +pre-trained SimCLRv2 model under $l_\infty(4/255)$ attack on CIFAR10 with +$\epsilon=2$. In fact, we show both theoretically and empirically that DP +models are Pareto optimal on the accuracy-robustness tradeoff. Empirically, the +robustness of DP models is consistently observed across various datasets and +models. We believe our encouraging results are a significant step towards +training models that are private as well as robust. + +
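For readers unfamiliar with DP training, the following is a bare-bones sketch of the DP-SGD step that such DP optimizers build on (per-sample gradient clipping plus Gaussian noise), written in plain PyTorch for illustration; the clipping norm and noise multiplier are exactly the kind of hyper-parameters the paper flags as critical to the privacy-robustness-accuracy tradeoff.

```python
# Sketch of one DP-SGD step: clip each sample's gradient to a fixed norm,
# sum, add Gaussian noise, then take an averaged gradient step.
import torch
import torch.nn as nn


def dp_sgd_step(model: nn.Module, batch_x, batch_y, loss_fn, lr=0.1, clip=1.0, noise_mult=1.0):
    params = [p for p in model.parameters() if p.requires_grad]
    summed = [torch.zeros_like(p) for p in params]
    batch_size = batch_x.shape[0]
    for i in range(batch_size):                               # per-sample gradients
        loss = loss_fn(model(batch_x[i:i + 1]), batch_y[i:i + 1])
        grads = torch.autograd.grad(loss, params)
        norm = torch.sqrt(sum(g.pow(2).sum() for g in grads))
        scale = torch.clamp(clip / (norm + 1e-6), max=1.0)    # clip to norm <= clip
        for s, g in zip(summed, grads):
            s += g * scale
    with torch.no_grad():
        for p, s in zip(params, summed):
            noise = torch.randn_like(p) * noise_mult * clip   # Gaussian mechanism
            p -= lr * (s + noise) / batch_size


if __name__ == "__main__":
    model = nn.Linear(10, 2)
    x, y = torch.randn(8, 10), torch.randint(0, 2, (8,))
    dp_sgd_step(model, x, y, nn.CrossEntropyLoss())
    print("one DP-SGD step done")
```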
+
+
+
+
+ + ♻ ☆ Influencer Videos: Unboxing the Mystique + + +
+ Influencer marketing has become a very popular tool to reach customers. +Despite the rapid growth in influencer videos, there has been little research +on the effectiveness of their constituent features in explaining video +engagement. We study YouTube influencers and analyze their unstructured video +data across text, audio and images using an "interpretable deep learning" +framework that accomplishes both goals of prediction and interpretation. Our +prediction-based approach analyzes unstructured data and finds that "what is +said" in words (text) is more influential than "how it is said" in imagery +(images) or acoustics (audio). Our novel interpretation-based approach is +implemented after completion of model prediction by analyzing the same source +of unstructured data to measure importance attributed to the video features. We +eliminate several spurious relationships in two steps, identifying a subset of +relationships which are confirmed using theory. We uncover novel findings that +establish distinct associations for measures of shallow and deep engagement +based on the dual-system framework of human thinking. Our approach is validated +using simulated data, and we discuss the learnings from our findings for +influencers and brands. + +
+
+ comment: 45 pages, Online Appendix +
+
+
+
+
+ + ♻ ☆ Decodable and Sample Invariant Continuous Object Encoder + + +
+ We propose Hyper-Dimensional Function Encoding (HDFE). Given samples of a +continuous object (e.g. a function), HDFE produces an explicit vector +representation of the given object, invariant to the sample distribution and +density. Sample distribution and density invariance enables HDFE to +consistently encode continuous objects regardless of their sampling, and +therefore allows neural networks to receive continuous objects as inputs for +machine learning tasks, such as classification and regression. Moreover, HDFE +does not require any training and is proven to map the object into an organized +embedding space, which facilitates the training of the downstream tasks. In +addition, the encoding is decodable, which enables neural networks to regress +continuous objects by regressing their encodings. Therefore, HDFE serves as an +interface for processing continuous objects. + We apply HDFE to function-to-function mapping, where vanilla HDFE achieves +performance competitive with the state-of-the-art algorithm. We apply HDFE to +point cloud surface normal estimation, where a simple replacement from PointNet +to HDFE leads to immediate 12% and 15% error reductions in two benchmarks. In +addition, by integrating HDFE into the PointNet-based SOTA network, we improve +the SOTA baseline by 2.5% and 1.7% in the same benchmarks. + +
+
+
+
+
+ + ♻ ☆ Shortcut Learning in Deep Neural Networks + + +
+ Deep learning has triggered the current rise of artificial intelligence and +is the workhorse of today's machine intelligence. Numerous success stories have +rapidly spread all over science, industry and society, but its limitations have +only recently come into focus. In this perspective we seek to distill how many +of deep learning's problems can be seen as different symptoms of the same +underlying problem: shortcut learning. Shortcuts are decision rules that +perform well on standard benchmarks but fail to transfer to more challenging +testing conditions, such as real-world scenarios. Related issues are known in +Comparative Psychology, Education and Linguistics, suggesting that shortcut +learning may be a common characteristic of learning systems, biological and +artificial alike. Based on these observations, we develop a set of +recommendations for model interpretation and benchmarking, highlighting recent +advances in machine learning to improve robustness and transferability from the +lab to real-world applications. + +
+
+ comment: perspective article published at Nature Machine Intelligence + (https://doi.org/10.1038/s42256-020-00257-z) +
+
+
+
+
+ + ♻ ☆ Continual Learning: Applications and the Road Forward + + +
+ Continual learning is a sub-field of machine learning, which aims to allow +machine learning models to continuously learn on new data, by accumulating +knowledge without forgetting what was learned in the past. In this work, we +take a step back, and ask: "Why should one care about continual learning in the +first place?". We set the stage by surveying recent continual learning papers +published at three major machine learning conferences, and show that +memory-constrained settings dominate the field. Then, we discuss five open +problems in machine learning, and even though they seem unrelated to continual +learning at first sight, we show that continual learning will inevitably be +part of their solution. These problems are model-editing, personalization, +on-device learning, faster (re-)training and reinforcement learning. Finally, +by comparing the desiderata from these unsolved problems and the current +assumptions in continual learning, we highlight and discuss four future +directions for continual learning research. We hope that this work offers an +interesting perspective on the future of continual learning, while displaying +its potential value and the paths we have to pursue in order to make it +successful. This work is the result of the many discussions the authors had at +the Dagstuhl seminar on Deep Continual Learning, in March 2023. + +
+
+
+
+
+ + ♻ ☆ SCL-VI: Self-supervised Context Learning for Visual Inspection of + Industrial Defects + + +
+ The unsupervised visual inspection of defects in industrial products poses a +significant challenge due to substantial variations in product surfaces. +Current unsupervised models struggle to strike a balance between detecting +texture and object defects, lacking the capacity to discern latent +representations and intricate features. In this paper, we present a novel +self-supervised learning algorithm designed to derive an optimal encoder by +tackling the renowned jigsaw puzzle. Our approach involves dividing the target +image into nine patches, tasking the encoder with predicting the relative +position relationships between any two patches to extract rich semantics. +Subsequently, we introduce an affinity-augmentation method to accentuate +differences between normal and abnormal latent representations. Leveraging the +classic support vector data description algorithm yields final detection +results. Experimental outcomes demonstrate that our proposed method achieves +outstanding detection and segmentation performance on the widely used MVTec AD +dataset, with rates of 95.8% and 96.8%, respectively, establishing a +state-of-the-art benchmark for both texture and object defects. Comprehensive +experimentation underscores the effectiveness of our approach in diverse +industrial applications. + +
+
+
+
+
+ + ♻ ☆ Video-LLaVA: Learning United Visual Representation by Alignment Before + Projection + + +
+ The Large Vision-Language Model (LVLM) has enhanced the performance of +various downstream tasks in visual-language understanding. Most existing +approaches encode images and videos into separate feature spaces, which are +then fed as inputs to large language models. However, due to the lack of +unified tokenization for images and videos, namely misalignment before +projection, it becomes challenging for a Large Language Model (LLM) to learn +multi-modal interactions from several poor projection layers. In this work, we +unify visual representation into the language feature space to advance the +foundational LLM towards a unified LVLM. As a result, we establish a simple but +robust LVLM baseline, Video-LLaVA, which learns from a mixed dataset of images +and videos, mutually enhancing each other. Video-LLaVA achieves superior +performance across a broad range of 9 image benchmarks, spanning 5 image +question-answering datasets and 4 image benchmark toolkits. Additionally, our +Video-LLaVA also outperforms Video-ChatGPT by 5.8%, 9.9%, 18.6%, and 10.1% on +MSRVTT, MSVD, TGIF, and ActivityNet, respectively. Notably, extensive +experiments demonstrate that Video-LLaVA mutually benefits images and videos +within a unified visual representation, outperforming models designed +specifically for images or videos. We aim for this work to provide modest +insights into the multi-modal inputs for the LLM. + +
+
+
+
+
+ + ♻ ☆ Open Sesame! Universal Black Box Jailbreaking of Large Language Models + + +
+ Large language models (LLMs), designed to provide helpful and safe responses, +often rely on alignment techniques to align with user intent and social +guidelines. Unfortunately, this alignment can be exploited by malicious actors +seeking to manipulate an LLM's outputs for unintended purposes. In this paper +we introduce a novel approach that employs a genetic algorithm (GA) to +manipulate LLMs when model architecture and parameters are inaccessible. The GA +attack works by optimizing a universal adversarial prompt that -- when combined +with a user's query -- disrupts the attacked model's alignment, resulting in +unintended and potentially harmful outputs. Our novel approach systematically +reveals a model's limitations and vulnerabilities by uncovering instances where +its responses deviate from expected behavior. Through extensive experiments we +demonstrate the efficacy of our technique, thus contributing to the ongoing +discussion on responsible AI development by providing a diagnostic tool for +evaluating and enhancing alignment of LLMs with human intent. To our knowledge +this is the first automated universal black box jailbreak attack. + +
+
+
+
+
+ + ♻ ☆ UMAAF: Unveiling Aesthetics via Multifarious Attributes of Images + + +
+ With the increasing prevalence of smartphones and websites, Image Aesthetic +Assessment (IAA) has become increasingly crucial. While the significance of +attributes in IAA is widely recognized, many attribute-based methods lack +consideration for the selection and utilization of aesthetic attributes. Our +initial step involves the acquisition of aesthetic attributes from both intra- +and inter-perspectives. Within the intra-perspective, we extract the direct +visual attributes of images, constituting the absolute attribute. In the +inter-perspective, our focus lies in modeling the relative score relationships +between images within the same sequence, forming the relative attribute. Then, +to better utilize image attributes in aesthetic assessment, we propose the +Unified Multi-attribute Aesthetic Assessment Framework (UMAAF) to model both +absolute and relative attributes of images. For absolute attributes, we +leverage multiple absolute-attribute perception modules and an +absolute-attribute interacting network. The absolute-attribute perception +modules are first pre-trained on several absolute-attribute learning tasks and +then used to extract corresponding absolute attribute features. The +absolute-attribute interacting network adaptively learns the weight of diverse +absolute-attribute features, effectively integrating them with generic +aesthetic features from various absolute-attribute perspectives and generating +the aesthetic prediction. To model the relative attribute of images, we +consider the relative ranking and relative distance relationships between +images in a Relative-Relation Loss function, which boosts the robustness of the +UMAAF. Furthermore, UMAAF achieves state-of-the-art performance on TAD66K and +AVA datasets, and multiple experiments demonstrate the effectiveness of each +module and the model's alignment with human preference. + +
+
+
+
+
+ + ♻ ☆ Layer-wise Auto-Weighting for Non-Stationary Test-Time Adaptation WACV 2024 + + +
+ Given the inevitability of domain shifts during inference in real-world +applications, test-time adaptation (TTA) is essential for model adaptation +after deployment. However, the real-world scenario of continuously changing +target distributions presents challenges including catastrophic forgetting and +error accumulation. Existing TTA methods for non-stationary domain shifts, +while effective, incur excessive computational load, making them impractical +for on-device settings. In this paper, we introduce a layer-wise auto-weighting +algorithm for continual and gradual TTA that autonomously identifies layers for +preservation or concentrated adaptation. By leveraging the Fisher Information +Matrix (FIM), we first design the learning weight to selectively focus on +layers associated with log-likelihood changes while preserving unrelated ones. +Then, we further propose an exponential min-max scaler to make certain layers +nearly frozen while mitigating outliers. This minimizes forgetting and error +accumulation, leading to efficient adaptation to non-stationary target +distribution. Experiments on CIFAR-10C, CIFAR-100C, and ImageNet-C show our +method outperforms conventional continual and gradual TTA approaches while +significantly reducing computational load, highlighting the importance of +FIM-based learning weight in adapting to continuously or gradually shifting +target domains. + +
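A rough sketch of the layer-weighting idea described above, under my own reading of the abstract rather than the paper's implementation: a diagonal Fisher-information proxy scores each parameter tensor, and an exponential min-max scaler turns the scores into learning weights so that low-scoring layers are nearly frozen. The temperature value is an assumption.

```python
import torch
import torch.nn as nn

# Score each parameter tensor with a diagonal Fisher-information proxy (squared gradients),
# then squash the scores with an exponential min-max scaler: the most informative layer
# keeps weight ~1 while the others decay towards zero (i.e. are almost frozen).
def layer_fisher_proxy(model: nn.Module, loss: torch.Tensor) -> dict:
    params = [p for p in model.parameters() if p.requires_grad]
    names = [n for n, p in model.named_parameters() if p.requires_grad]
    grads = torch.autograd.grad(loss, params)
    return {n: (g.detach() ** 2).mean().item() for n, g in zip(names, grads)}

def exp_min_max_scale(scores: dict, temperature: float = 5.0) -> dict:
    vals = torch.tensor(list(scores.values()))
    norm = (vals - vals.min()) / (vals.max() - vals.min() + 1e-12)
    weights = torch.exp(temperature * (norm - 1.0))
    return dict(zip(scores.keys(), weights.tolist()))

# Toy usage with a tiny model and a classification loss standing in for the log-likelihood.
model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 2))
x, y = torch.randn(16, 4), torch.randint(0, 2, (16,))
loss = nn.CrossEntropyLoss()(model(x), y)
learning_weights = exp_min_max_scale(layer_fisher_proxy(model, loss))
```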
+
+ comment: Accepted to WACV 2024 +
+
+
+
+
+ + ♻ ☆ Multi-level Geometric Optimization for Regularised Constrained Linear + Inverse Problems + + +
+ We present a geometric multilevel optimization approach that smoothly incorporates box constraints.&#13;
+Given a box-constrained optimization problem, we consider a hierarchy of models with varying discretization levels.&#13;
+Finer models are accurate but expensive to compute, while coarser models are less accurate but cheaper to compute.&#13;
+When working at the fine level, multilevel optimization computes the search direction based on a coarser model, which speeds up updates at the fine level.&#13;
+Moreover, by exploiting the geometry induced by the hierarchy, the feasibility of the updates is preserved.&#13;
+In particular, our approach extends classical components of multigrid methods, such as restriction and prolongation, to the Riemannian structure of our constraints.&#13;
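For intuition only, here is a toy two-level step under box constraints, loosely in the spirit of the restriction and prolongation components mentioned above; it is not the paper's Riemannian construction, and the averaging restriction, piecewise-constant prolongation, and coarse step are all illustrative assumptions.

```python
import numpy as np

# Toy two-level step for a box-constrained problem: restrict the fine variable and gradient,
# take a cheap coarse step, prolongate the correction, and project back onto the box.
def restrict(x):
    return 0.5 * (x[0::2] + x[1::2])          # average neighbouring fine variables

def prolongate(xc):
    return np.repeat(xc, 2)                   # piecewise-constant interpolation

def multilevel_step(x, grad_fine, coarse_step_fn, lo, hi, alpha=1.0):
    correction = coarse_step_fn(restrict(x), restrict(grad_fine))
    return np.clip(x + alpha * prolongate(correction), lo, hi)   # keep the update feasible

# Toy usage: one step towards minimising ||x - 1||^2 subject to 0 <= x <= 0.8.
x = np.zeros(8)
grad = 2.0 * (x - 1.0)
x_next = multilevel_step(x, grad, lambda xc, gc: -0.25 * gc, lo=0.0, hi=0.8)
```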
+
+ comment: 25 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ YolOOD: Utilizing Object Detection Concepts for Multi-Label + Out-of-Distribution Detection + + +
+ Out-of-distribution (OOD) detection has attracted a large amount of attention +from the machine learning research community in recent years due to its +importance in deployed systems. Most of the previous studies focused on the +detection of OOD samples in the multi-class classification task. However, OOD +detection in the multi-label classification task, a more common real-world use +case, remains an underexplored domain. In this research, we propose YolOOD - a +method that utilizes concepts from the object detection domain to perform OOD +detection in the multi-label classification task. Object detection models have +an inherent ability to distinguish between objects of interest +(in-distribution) and irrelevant objects (e.g., OOD objects) in images that +contain multiple objects belonging to different class categories. These +abilities allow us to convert a regular object detection model into an image +classifier with inherent OOD detection capabilities with just minor changes. We +compare our approach to state-of-the-art OOD detection methods and demonstrate +YolOOD's ability to outperform these methods on a comprehensive suite of +in-distribution and OOD benchmark datasets. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Decoupling Dynamic Monocular Videos for Dynamic View Synthesis + + +
+ The challenge of dynamic view synthesis from dynamic monocular videos, i.e., +synthesizing novel views for free viewpoints given a monocular video of a +dynamic scene captured by a moving camera, mainly lies in accurately modeling +the dynamic objects of a scene using limited 2D frames, each with a varying +timestamp and viewpoint. Existing methods usually require pre-processed 2D +optical flow and depth maps by off-the-shelf methods to supervise the network, +making them suffer from the inaccuracy of the pre-processed supervision and the +ambiguity when lifting the 2D information to 3D. In this paper, we tackle this +challenge in an unsupervised fashion. Specifically, we decouple the motion of +the dynamic objects into object motion and camera motion, respectively +regularized by proposed unsupervised surface consistency and patch-based +multi-view constraints. The former enforces the 3D geometric surfaces of moving +objects to be consistent over time, while the latter regularizes their +appearances to be consistent across different viewpoints. Such a fine-grained +motion formulation can alleviate the learning difficulty for the network, thus +enabling it to produce not only novel views with higher quality but also more +accurate scene flows and depth than existing methods requiring extra +supervision. + +
+
+
+
+
+ + ♻ ☆ Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation + + +
+ One primary topic of multi-modal learning is to jointly incorporate heterogeneous information from different modalities.&#13;
+However, most models often suffer from unsatisfactory multi-modal cooperation and fail to jointly utilize all modalities well.&#13;
+Some methods have been proposed to identify and enhance the worst-learnt modality, but they are often unable to provide a fine-grained, theoretically supported observation of multi-modal cooperation at the sample level.&#13;
+Hence, it is essential to reasonably observe and improve the fine-grained cooperation between modalities, especially when facing realistic scenarios where the modality discrepancy could vary across different samples.&#13;
+To this end, we introduce a fine-grained modality valuation metric to evaluate the contribution of each modality at the sample level.&#13;
+Via modality valuation, we observe that multi-modal models tend to rely on one specific modality, resulting in other modalities being low-contributing.&#13;
+We further analyze this issue and improve cooperation between modalities by enhancing the discriminative ability of low-contributing modalities in a targeted manner.&#13;
+Overall, our method reasonably observes the fine-grained uni-modal contribution at the sample level and achieves considerable improvements on different multi-modal models.&#13;
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ Assessing Domain Gap for Continual Domain Adaptation in Object Detection + + +
+ To ensure reliable object detection in autonomous systems, the detector must +be able to adapt to changes in appearance caused by environmental factors such +as time of day, weather, and seasons. Continually adapting the detector to +incorporate these changes is a promising solution, but it can be +computationally costly. Our proposed approach is to selectively adapt the +detector only when necessary, using new data that does not have the same +distribution as the current training data. To this end, we investigate three +popular metrics for domain gap evaluation and find that there is a correlation +between the domain gap and detection accuracy. Therefore, we apply the domain +gap as a criterion to decide when to adapt the detector. Our experiments show +that our approach has the potential to improve the efficiency of the detector's +operation in real-world scenarios, where environmental conditions change in a +cyclical manner, without sacrificing the overall performance of the detector. +Our code is publicly available at https://github.com/dadung/DGE-CDA. + +
+
+ comment: Accepted to CVIU +
+
+
+
+
+ + ♻ ☆ Pink: Unveiling the Power of Referential Comprehension for Multi-modal + LLMs + + +
+ Multi-modal Large Language Models (MLLMs) have shown remarkable capabilities +in various multi-modal tasks. Nevertheless, their performance in fine-grained +image understanding tasks is still limited. To address this issue, this paper +proposes a new framework to enhance the fine-grained image understanding +abilities of MLLMs. Specifically, we present a new method for constructing the +instruction tuning dataset at a low cost by leveraging annotations in existing +datasets. A self-consistent bootstrapping method is also introduced to extend +existing dense object annotations into high-quality +referring-expression-bounding-box pairs. These methods enable the generation of +high-quality instruction data which includes a wide range of fundamental +abilities essential for fine-grained image perception. Moreover, we argue that +the visual encoder should be tuned during instruction tuning to mitigate the +gap between full image perception and fine-grained image perception. +Experimental results demonstrate the superior performance of our method. For +instance, our model exhibits a 5.2% accuracy improvement over Qwen-VL on GQA +and surpasses the accuracy of Kosmos-2 by 24.7% on RefCOCO_val. We also attain +the top rank on the leaderboard of MMBench. This promising performance is +achieved by training on only publicly available data, making it easily +reproducible. The models, datasets, and codes are publicly available at +https://github.com/SY-Xuan/Pink. + +
+
+
+
+
+ + ♻ ☆ Image augmentation with conformal mappings for a convolutional neural + network + + +
+ For augmentation of the square-shaped image data of a convolutional neural network (CNN), we introduce a new method in which the original images are mapped onto a disk with a conformal mapping, rotated around the center of this disk, mapped under a M\"obius transformation that preserves the disk, and then mapped back onto their original square shape.&#13;
+Unlike the typical transformations used in data augmentation for a CNN, this process does not result in the loss of information caused by removing areas near the edges of the original images.&#13;
+We provide the formulas for all the mappings needed, together with detailed instructions on how to write code for transforming the images.&#13;
+The new method is also tested with simulated data and, according to the results, using this method to augment a training set of 10 images into 40 images decreases the prediction error of a CNN on a test set of 160 images in a statistically significant way (p-value=0.0360).&#13;
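The disk-preserving part of the transformation chain described above can be written in a few lines. The sketch below shows only the rotation and the Möbius map of the unit disk onto itself; the conformal square-to-disk map and its inverse are left out, and the parameter values are chosen arbitrarily.

```python
import numpy as np

# Rotation and disk-preserving Moebius transformation z -> (z - a) / (1 - conj(a) z),
# applied to pixel coordinates viewed as complex numbers inside the unit disk.
def rotate(z, theta):
    return z * np.exp(1j * theta)

def moebius_disk(z, a):
    assert abs(a) < 1.0, "the parameter a must lie inside the unit disk"
    return (z - a) / (1.0 - np.conj(a) * z)

# Toy usage on a grid of normalised coordinates; only points inside the disk are transformed.
ys, xs = np.mgrid[-1:1:64j, -1:1:64j]
z = xs + 1j * ys
inside = np.abs(z) < 1.0
z_aug = moebius_disk(rotate(z[inside], np.pi / 6), a=0.2 + 0.1j)
```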
+
+ comment: 14 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ ColonMapper: topological mapping and localization for colonoscopy ICRA 2024 + + +
+ We propose a topological mapping and localization system able to operate on +real human colonoscopies, despite significant shape and illumination changes. +The map is a graph where each node codes a colon location by a set of real +images, while edges represent traversability between nodes. For close-in-time +images, where scene changes are minor, place recognition can be successfully +managed with the recent transformers-based local feature matching algorithms. +However, under long-term changes -- such as different colonoscopies of the same +patient -- feature-based matching fails. To address this, we train on real +colonoscopies a deep global descriptor achieving high recall with significant +changes in the scene. The addition of a Bayesian filter boosts the accuracy of +long-term place recognition, enabling relocalization in a previously built map. +Our experiments show that ColonMapper is able to autonomously build a map and +localize against it in two important use cases: localization within the same +colonoscopy or within different colonoscopies of the same patient. Code will be +available upon acceptance. + +
+
+ comment: Under review. ICRA 2024 +
+
+
+
+
+ + ♻ ☆ Energy-Calibrated VAE with Test Time Free Lunch + + +
+ In this paper, we propose a novel generative model that utilizes a +conditional Energy-Based Model (EBM) for enhancing Variational Autoencoder +(VAE), termed Energy-Calibrated VAE (EC-VAE). Specifically, VAEs often suffer +from blurry generated samples due to the lack of a tailored training on the +samples generated in the generative direction. On the other hand, EBMs can +generate high-quality samples but require expensive Markov Chain Monte Carlo +(MCMC) sampling. To address these issues, we introduce a conditional EBM for +calibrating the generative direction of VAE during training, without requiring +it for the generation at test time. In particular, we train EC-VAE upon both +the input data and the calibrated samples with adaptive weight to enhance +efficacy while avoiding MCMC sampling at test time. Furthermore, we extend the +calibration idea of EC-VAE to variational learning and normalizing flows, and +apply EC-VAE to an additional application of zero-shot image restoration via +neural transport prior and range-null theory. We evaluate the proposed method +with two applications, including image generation and zero-shot image +restoration, and the experimental results show that our method achieves the +state-of-the-art performance over single-step non-adversarial generation. + +
+
+ comment: work in progress +
+
+
+
+
+ + ♻ ☆ Composite Score for Anomaly Detection in Imbalanced Real-World + Industrial Dataset + + +
+ In recent years, the industrial sector has evolved towards its fourth revolution.&#13;
+The quality control domain is particularly interested in advanced machine learning for computer vision anomaly detection.&#13;
+Nevertheless, several challenges have to be faced, including imbalanced datasets, image complexity, and the zero-false-negative (ZFN) constraint to guarantee the high-quality requirement.&#13;
+This paper illustrates a use case for an industrial partner, where Printed Circuit Board Assembly (PCBA) images are first reconstructed with a Vector Quantized Generative Adversarial Network (VQGAN) trained on normal products.&#13;
+Then, several multi-level metrics are extracted on a few normal and abnormal images, highlighting anomalies through reconstruction differences.&#13;
+Finally, a classifier is trained on the extracted metrics to build a composite anomaly score.&#13;
+This three-step approach is performed on the public MVTec-AD dataset and on the partner PCBA dataset, where it achieves a regular accuracy of 95.69% and 87.93% under the ZFN constraint.&#13;
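To make the three-step recipe above concrete, here is a small sketch with a mocked reconstruction model and illustrative metrics; the real pipeline uses a VQGAN and its own multi-level metrics, so treat every choice below as an assumption.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

# (1) reconstruct images with a model trained on normal products (mocked here as a zero
# reconstruction), (2) extract a few reconstruction-difference metrics, (3) fit a classifier
# whose decision function serves as the composite anomaly score.
def metrics(img, recon):
    diff = np.abs(img - recon)
    return np.array([diff.mean(), diff.max(), (diff ** 2).mean()])

rng = np.random.default_rng(0)
normal = rng.normal(0.0, 0.05, size=(20, 32, 32))      # small residuals after reconstruction
abnormal = rng.normal(0.0, 0.25, size=(20, 32, 32))    # larger residuals
images = np.concatenate([normal, abnormal])
X = np.stack([metrics(x, np.zeros_like(x)) for x in images])
y = np.array([0] * 20 + [1] * 20)

clf = LogisticRegression().fit(X, y)
composite_score = clf.decision_function(X)             # higher -> more anomalous
# For a zero-false-negative operating point, place the threshold below the lowest abnormal score.
threshold = composite_score[y == 1].min() - 1e-6
```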
+
+ comment: This version of the article has been accepted for publication, after + peer review and is subject to Springer Nature AM terms of use, but is not the + Version of Record and does not reflect post-acceptance improvements, or any + corrections. The Version of Record is available online at: + https://doi.org/10.1007/s10994-023-06415-9 +
+
+
+
+
+ + ♻ ☆ Vocabulary-free Image Classification NeurIPS2023 + + +
+ Recent advances in large vision-language models have revolutionized the image classification paradigm.&#13;
+Despite these models showing impressive zero-shot capabilities, a pre-defined set of categories, a.k.a. the vocabulary, is assumed at test time for composing the textual prompts.&#13;
+However, such an assumption can be impractical when the semantic context is unknown and evolving.&#13;
+We thus formalize a novel task, termed Vocabulary-free Image Classification (VIC), where we aim to assign to an input image a class that resides in an unconstrained language-induced semantic space, without the prerequisite of a known vocabulary.&#13;
+VIC is a challenging task as the semantic space is extremely large, containing millions of concepts, with hard-to-discriminate fine-grained categories.&#13;
+In this work, we first empirically verify that representing this semantic space by means of an external vision-language database is the most effective way to obtain semantically relevant content for classifying the image.&#13;
+We then propose Category Search from External Databases (CaSED), a method that exploits a pre-trained vision-language model and an external vision-language database to address VIC in a training-free manner.&#13;
+CaSED first extracts a set of candidate categories from captions retrieved from the database based on their semantic similarity to the image, and then assigns to the image the best matching candidate category according to the same vision-language model.&#13;
+Experiments on benchmark datasets validate that CaSED outperforms other complex vision-language frameworks, while being more efficient with far fewer parameters, paving the way for future research in this direction.&#13;
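As a rough illustration of the training-free retrieve-and-score idea above (not the CaSED implementation), the sketch below uses stand-in embedding functions in place of a real vision-language model; the candidate extraction by simple word splitting is also a placeholder.

```python
import numpy as np

def cosine(a, b):
    return float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-12))

def classify_vocabulary_free(image_emb, caption_db, embed_text, top_k=5):
    # 1) retrieve the captions closest to the image, 2) pool candidate words from them,
    # 3) return the candidate that best matches the image in the same embedding space.
    closest = sorted(caption_db, key=lambda c: -cosine(image_emb, embed_text(c)))[:top_k]
    candidates = {w for c in closest for w in c.lower().split() if len(w) > 3}
    return max(candidates, key=lambda w: cosine(image_emb, embed_text(w)))

# Toy usage with random stand-in embeddings (so the prediction here is arbitrary);
# plugging in a real vision-language model would make the retrieval meaningful.
def embed_text(text):
    return np.random.default_rng(abs(hash(text)) % (2**32)).normal(size=64)

captions = ["a golden retriever playing fetch", "a bowl of ramen", "a mountain at sunset"]
prediction = classify_vocabulary_free(embed_text("a photo of a dog"), captions, embed_text)
```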
+
+ comment: Accepted at NeurIPS2023, 19 pages, 8 figures, code is available at + https://github.com/altndrr/vic +
+
+
+
+
+ + ♻ ☆ Unsupervised discovery of Interpretable Visual Concepts + + +
+ Providing interpretability of deep-learning models to non-experts, while fundamental for responsible real-world usage, is challenging.&#13;
+Attribution maps from xAI techniques, such as Integrated Gradients, are a typical example of a visualization technique that contains a high level of information but is difficult to interpret.&#13;
+In this paper, we propose two methods, Maximum Activation Groups Extraction (MAGE) and Multiscale Interpretable Visualization (Ms-IV), to explain the model's decision, enhancing global interpretability.&#13;
+MAGE finds, for a given CNN, combinations of features which, globally, form a semantic meaning that we call concepts.&#13;
+We group these similar feature patterns by clustering into "concepts", which we visualize through Ms-IV.&#13;
+This last method is inspired by Occlusion and Sensitivity analysis (incorporating causality), and uses a novel metric, called Class-aware Order Correlation (CaOC), to globally evaluate the most important image regions according to the model's decision space.&#13;
+We compare our approach to xAI methods such as LIME and Integrated Gradients.&#13;
+Experimental results show that Ms-IV achieves higher localization and faithfulness values.&#13;
+Finally, qualitative evaluation of combined MAGE and Ms-IV demonstrates humans' ability to agree, based on the visualization, with the decisions associated with the clusters' concepts, and to detect, among a given set of networks, the existence of bias.&#13;
+
+
+
+
+ + ♻ ☆ Contextual Hourglass Network for Semantic Segmentation of High + Resolution Aerial Imagery ICIP 2019 + + +
+ Semantic segmentation for aerial imagery is a challenging and important problem in remotely sensed imagery analysis.&#13;
+In recent years, with the success of deep learning, various convolutional neural network (CNN) based models have been developed.&#13;
+However, due to the varying sizes of the objects and imbalanced class labels, it can be challenging to obtain accurate pixel-wise semantic segmentation results.&#13;
+To address those challenges, we develop a novel semantic segmentation method and call it Contextual Hourglass Network.&#13;
+In our method, in order to improve the robustness of the prediction, we design a new contextual hourglass module which incorporates an attention mechanism on processed low-resolution feature maps to exploit the contextual semantics.&#13;
+We further exploit the stacked encoder-decoder structure by connecting multiple contextual hourglass modules from end to end.&#13;
+This architecture can effectively extract rich multi-scale features and add more feedback loops for better learning contextual semantics through intermediate supervision.&#13;
+To demonstrate the efficacy of our semantic segmentation method, we test it on the Potsdam and Vaihingen datasets.&#13;
+Through comparisons to other baseline methods, our method yields the best overall performance.&#13;
+
+ comment: Accepted by ICIP 2019, + https://cmsworkshops.com/ICIP2019/Papers/AcceptedPapers.asp +
+
+
+
+
+ + ♻ ☆ Online Class-Incremental Learning For Real-World Food Classification WACV 2024 + + +
+ Food image classification is essential for monitoring health and tracking dietary intake in image-based dietary assessment methods.&#13;
+However, conventional systems often rely on static datasets with fixed classes and uniform distribution.&#13;
+In contrast, real-world food consumption patterns, shaped by cultural, economic, and personal influences, involve dynamic and evolving data.&#13;
+They thus require the classification system to cope with continuously evolving data.&#13;
+Online Class Incremental Learning (OCIL) addresses the challenge of learning continuously from a single-pass data stream while adapting to new knowledge and reducing catastrophic forgetting.&#13;
+Experience Replay (ER) based OCIL methods store a small portion of previous data and have shown encouraging performance.&#13;
+However, most existing OCIL works assume that the distribution of encountered data is perfectly balanced, which rarely happens in real-world scenarios.&#13;
+In this work, we explore OCIL for real-world food image classification by first introducing a probabilistic framework to simulate realistic food consumption scenarios.&#13;
+Subsequently, we present an attachable Dynamic Model Update (DMU) module designed for existing ER methods, which enables the selection of relevant images for model training, addressing challenges arising from data repetition and imbalanced sample occurrences inherent in realistic food consumption patterns within the OCIL framework.&#13;
+Our performance evaluation demonstrates significant enhancements compared to established ER methods, showing great potential for lifelong learning in real-world food image classification scenarios.&#13;
+The code of our method is publicly accessible at https://gitlab.com/viper-purdue/OCIL-real-world-food-image-classification&#13;
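For context, the experience-replay backbone that methods like the one above build on can be sketched in a few lines; the reservoir buffer below is a generic ER component, and the paper's DMU selection logic is not reproduced here.

```python
import random

# Generic reservoir-sampling replay buffer for online class-incremental learning:
# past samples are kept with uniform probability and replayed alongside new batches.
class ReservoirBuffer:
    def __init__(self, capacity=200):
        self.capacity, self.data, self.seen = capacity, [], 0

    def add(self, sample):
        self.seen += 1
        if len(self.data) < self.capacity:
            self.data.append(sample)
        else:
            j = random.randrange(self.seen)
            if j < self.capacity:
                self.data[j] = sample

    def replay(self, batch_size):
        return random.sample(self.data, min(batch_size, len(self.data)))

# Toy usage: stream (input, label) pairs, then mix a replayed batch into each update.
buffer = ReservoirBuffer(capacity=5)
for step in range(20):
    buffer.add((f"x_{step}", step % 3))
replayed = buffer.replay(batch_size=4)
```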
+
+ comment: Accepted at IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV 2024) +
+
+
+
+
+ + ♻ ☆ Robot Hand-Eye Calibration using Structure-from-Motion + + +
+ In this paper we propose a new flexible method for hand-eye calibration.&#13;
+The vast majority of existing hand-eye calibration techniques require a calibration rig, which is used in conjunction with camera pose estimation methods.&#13;
+Instead, we combine structure-from-motion with known robot motions and we show that the solution can be obtained in linear form.&#13;
+The latter solves for both the hand-eye parameters and the unknown scale factor inherent to structure-from-motion methods.&#13;
+The algebraic analysis made possible by such a linear formulation allows us to investigate not only the well-known case of general screw motions but also such singular motions as pure translations, pure rotations, and planar motions.&#13;
+In essence, the robot-mounted camera looks at an unknown rigid layout, tracks points over an image sequence and estimates the camera-to-robot relationship.&#13;
+Such a self-calibration process is relevant for unmanned vehicles, robots working in remote places, and so forth.&#13;
+We conduct a large number of experiments that validate the quality of the method by comparing it with existing ones.&#13;
+
+
+
+
+ + ♻ ☆ On Counterfactual Data Augmentation Under Confounding + + +
+ Counterfactual data augmentation has recently emerged as a method to mitigate +confounding biases in the training data. These biases, such as spurious +correlations, arise due to various observed and unobserved confounding +variables in the data generation process. In this paper, we formally analyze +how confounding biases impact downstream classifiers and present a causal +viewpoint to the solutions based on counterfactual data augmentation. We +explore how removing confounding biases serves as a means to learn invariant +features, ultimately aiding in generalization beyond the observed data +distribution. Additionally, we present a straightforward yet powerful algorithm +for generating counterfactual images, which effectively mitigates the influence +of confounding effects on downstream classifiers. Through experiments on MNIST +variants and the CelebA datasets, we demonstrate how our simple augmentation +method helps existing state-of-the-art methods achieve good results. + +
+
+
+
+
+ + ♻ ☆ PFENet++: Boosting Few-shot Semantic Segmentation with the + Noise-filtered Context-aware Prior Mask + + +
+ In this work, we revisit the prior mask guidance proposed in "Prior Guided Feature Enrichment Network for Few-Shot Segmentation".&#13;
+The prior mask serves as an indicator that highlights the regions of interest of unseen categories, and it is effective in achieving better performance on different frameworks of recent studies.&#13;
+However, the current method directly takes the maximum element-to-element correspondence between the query and support features to indicate the probability of belonging to the target class, so the broader contextual information is seldom exploited during the prior mask generation.&#13;
+To address this issue, first, we propose the Context-aware Prior Mask (CAPM) that leverages additional nearby semantic cues for better locating the objects in query images.&#13;
+Second, since the maximum correlation value is vulnerable to noisy features, we take one step further by incorporating a lightweight Noise Suppression Module (NSM) to screen out the unnecessary responses, yielding high-quality masks for providing the prior knowledge.&#13;
+Both contributions are experimentally shown to have substantial practical merit, and the new model named PFENet++ significantly outperforms the baseline PFENet as well as all other competitors on three challenging benchmarks PASCAL-5$^i$, COCO-20$^i$ and FSS-1000.&#13;
+The new state-of-the-art performance is achieved without compromising efficiency, manifesting the potential for being a new strong baseline in few-shot semantic segmentation.&#13;
+Our code will be available at https://github.com/luoxiaoliu/PFENet2Plus.&#13;
+
+ comment: The first two authors contribute equally and are listed in + alphabetical order +
+
+
+
+
+ + ♻ ☆ Self supervised convolutional kernel based handcrafted feature + harmonization: Enhanced left ventricle hypertension disease phenotyping on + echocardiography + + +
+ Radiomics, a medical imaging technique, extracts quantitative handcrafted features from images to predict diseases.&#13;
+Harmonization of these features ensures consistent feature extraction across various imaging devices and protocols.&#13;
+Methods for harmonization include standardized imaging protocols, statistical adjustments, and evaluating feature robustness.&#13;
+Myocardial diseases such as Left Ventricular Hypertrophy (LVH) and Hypertensive Heart Disease (HHD) are diagnosed via echocardiography, but variable imaging settings pose challenges.&#13;
+Harmonization techniques are crucial for applying handcrafted features in disease diagnosis in such scenarios.&#13;
+Self-supervised learning (SSL) enhances data understanding within limited datasets and adapts to diverse data settings.&#13;
+ConvNeXt-V2 integrates convolutional layers into SSL, displaying superior performance in various tasks.&#13;
+This study focuses on convolutional filters within SSL, using them as preprocessing to convert images into feature maps for handcrafted feature harmonization.&#13;
+Our proposed method excelled in harmonization evaluation and exhibited superior LVH classification performance compared to existing methods.&#13;
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ DistNet2D: Leveraging long-range temporal information for efficient + segmentation and tracking + + +
+ Extracting long tracks and lineages from videomicroscopy requires an extremely low error rate, which is challenging on complex datasets of dense or deforming cells.&#13;
+Leveraging temporal context is key to overcoming this challenge.&#13;
+We propose DistNet2D, a new deep neural network (DNN) architecture for 2D cell segmentation and tracking that leverages both mid- and long-term temporal information.&#13;
+DistNet2D takes seven frames as input and uses a post-processing procedure that exploits information from the entire video to correct segmentation errors.&#13;
+DistNet2D outperforms two recent methods on two experimental datasets, one containing densely packed bacterial cells and the other containing eukaryotic cells.&#13;
+It is integrated into an ImageJ-based graphical user interface for 2D data visualization, curation, and training.&#13;
+Finally, we use DistNet2D to correlate the size and shape of cells with their transport properties over large statistics, for both bacterial and eukaryotic cells.&#13;
+
+ comment: 40 pages, 5 figures, 18 supp figures +
+
+
+
+
+ + ♻ ☆ A Data-Free Approach to Mitigate Catastrophic Forgetting in Federated + Class Incremental Learning for Vision Tasks NeurIPS 2023 + + +
+ Deep learning models often suffer from forgetting previously learned +information when trained on new data. This problem is exacerbated in federated +learning (FL), where the data is distributed and can change independently for +each user. Many solutions are proposed to resolve this catastrophic forgetting +in a centralized setting. However, they do not apply directly to FL because of +its unique complexities, such as privacy concerns and resource limitations. To +overcome these challenges, this paper presents a framework for +$\textbf{federated class incremental learning}$ that utilizes a generative +model to synthesize samples from past distributions. This data can be later +exploited alongside the training data to mitigate catastrophic forgetting. To +preserve privacy, the generative model is trained on the server using data-free +methods at the end of each task without requesting data from clients. Moreover, +our solution does not demand the users to store old data or models, which gives +them the freedom to join/leave the training at any time. Additionally, we +introduce SuperImageNet, a new regrouping of the ImageNet dataset specifically +tailored for federated continual learning. We demonstrate significant +improvements compared to existing baselines through extensive experiments on +multiple datasets. + +
+
+ comment: Accepted in NeurIPS 2023. arXiv admin note: text overlap with + arXiv:2307.00497 +
+
+
+
+
+ + ♻ ☆ NU-MCC: Multiview Compressive Coding with Neighborhood Decoder and + Repulsive UDF NeurIPS 2023 + + +
+ Remarkable progress has been made in 3D reconstruction from single-view RGB-D inputs.&#13;
+MCC is the current state-of-the-art method in this field, which achieves unprecedented success by combining vision Transformers with large-scale training.&#13;
+However, we identified two key limitations of MCC: 1) The Transformer decoder is inefficient in handling a large number of query points; 2) The 3D representation struggles to recover high-fidelity details.&#13;
+In this paper, we propose a new approach called NU-MCC that addresses these limitations.&#13;
+NU-MCC includes two key innovations: a Neighborhood decoder and a Repulsive Unsigned Distance Function (Repulsive UDF).&#13;
+First, our Neighborhood decoder introduces center points as an efficient proxy of input visual features, allowing each query point to only attend to a small neighborhood.&#13;
+This design not only results in much faster inference speed but also enables the exploitation of finer-scale visual features for improved recovery of 3D textures.&#13;
+Second, our Repulsive UDF is a novel alternative to the occupancy field used in MCC, significantly improving the quality of 3D object reconstruction.&#13;
+Compared to standard UDFs that suffer from holes in the results, our proposed Repulsive UDF can achieve more complete surface reconstruction.&#13;
+Experimental results demonstrate that NU-MCC is able to learn a strong 3D representation, significantly advancing the state of the art in single-view 3D reconstruction.&#13;
+Particularly, it outperforms MCC by 9.7% in terms of the F1-score on the CO3D-v2 dataset with more than 5x faster running speed.&#13;
+
+ comment: NeurIPS 2023. Project page: https://numcc.github.io/ Code: + https://github.com/sail-sg/numcc +
+
+
+
+
+ + ♻ ☆ GS-SLAM: Dense Visual SLAM with 3D Gaussian Splatting + + +
+ In this paper, we introduce $\textbf{GS-SLAM}$, which is the first to utilize a 3D Gaussian representation in a Simultaneous Localization and Mapping (SLAM) system.&#13;
+It facilitates a better balance between efficiency and accuracy.&#13;
+Compared to recent SLAM methods employing neural implicit representations, our method utilizes a real-time differentiable splatting rendering pipeline that offers significant speedup to map optimization and RGB-D re-rendering.&#13;
+Specifically, we propose an adaptive expansion strategy that adds new 3D Gaussians or deletes noisy ones in order to efficiently reconstruct newly observed scene geometry and improve the mapping of previously observed areas.&#13;
+This strategy is essential for extending the 3D Gaussian representation to reconstruct the whole scene rather than synthesizing a static object, as in existing methods.&#13;
+Moreover, in the pose tracking process, an effective coarse-to-fine technique is designed to select reliable 3D Gaussian representations to optimize camera pose, resulting in runtime reduction and robust estimation.&#13;
+Our method achieves competitive performance compared with existing state-of-the-art real-time methods on the Replica and TUM-RGBD datasets.&#13;
+The source code will be released soon.&#13;
+
+
+
+
+ + ♻ ☆ Formalizing and Evaluating Requirements of Perception Systems for + Automated Vehicles using Spatio-Temporal Perception Logic + + +
+ Automated vehicles (AV) heavily depend on robust perception systems. Current +methods for evaluating vision systems focus mainly on frame-by-frame +performance. Such evaluation methods appear to be inadequate in assessing the +performance of a perception subsystem when used within an AV. In this paper, we +present a logic -- referred to as Spatio-Temporal Perception Logic (STPL) -- +which utilizes both spatial and temporal modalities. STPL enables reasoning +over perception data using spatial and temporal operators. One major advantage +of STPL is that it facilitates basic sanity checks on the functional +performance of the perception system, even without ground-truth data in some +cases. We identify a fragment of STPL which is efficiently monitorable offline +in polynomial time. Finally, we present a range of specifications for AV +perception systems to highlight the types of requirements that can be expressed +and analyzed through offline monitoring with STPL. + +
+
+ comment: 32 pages, 11 figures, 6 tables, 4 algorithms, 2 appendixes +
+
+
+
+
+ + ♻ ☆ LASER: A Neuro-Symbolic Framework for Learning Spatial-Temporal Scene + Graphs with Weak Supervision + + +
+ We propose LASER, a neuro-symbolic approach to learn semantic video +representations that capture rich spatial and temporal properties in video data +by leveraging high-level logic specifications. In particular, we formulate the +problem in terms of alignment between raw videos and spatio-temporal logic +specifications. The alignment algorithm leverages a differentiable symbolic +reasoner and a combination of contrastive, temporal, and semantics losses. It +effectively and efficiently trains low-level perception models to extract +fine-grained video representation in the form of a spatio-temporal scene graph +that conforms to the desired high-level specification. In doing so, we explore +a novel methodology that weakly supervises the learning of video semantic +representations through logic specifications. We evaluate our method on two +datasets with rich spatial and temporal specifications: +20BN-Something-Something and MUGEN. We demonstrate that our method learns +better fine-grained video semantics than existing baselines. + +
+
+
+
+
+ + ♻ ☆ Learning Sub-Pixel Disparity Distribution for Light Field Depth + Estimation + + +
+ Light field (LF) depth estimation plays a crucial role in many LF-based applications.&#13;
+Existing LF depth estimation methods consider depth estimation as a regression problem, where a pixel-wise L1 loss is employed to supervise the training process.&#13;
+However, the disparity map is only a sub-space projection (i.e., an expectation) of the disparity distribution, which is essential for models to learn.&#13;
+In this paper, we propose a simple yet effective method to learn the sub-pixel disparity distribution by fully utilizing the power of deep networks, especially for LF of narrow baselines.&#13;
+We construct the cost volume at the sub-pixel level to produce a finer disparity distribution and design an uncertainty-aware focal loss to supervise the predicted disparity distribution toward the ground truth.&#13;
+Extensive experimental results demonstrate the effectiveness of our method.&#13;
+Our method significantly outperforms recent state-of-the-art LF depth algorithms on the HCI 4D LF Benchmark in terms of all four accuracy metrics (i.e., BadPix 0.01, BadPix 0.03, BadPix 0.07, and MSE $\times$100).&#13;
+The code and model of the proposed method are available at \url{https://github.com/chaowentao/SubFocal}.&#13;
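The distribution-learning idea described above can be illustrated with a few lines of PyTorch: the ground-truth disparity is turned into a soft target over fine disparity bins, and the expectation of the predicted distribution recovers a sub-pixel estimate. The bin range, resolution, and the plain cross-entropy (in place of the paper's uncertainty-aware focal loss) are assumptions of this sketch.

```python
import torch
import torch.nn.functional as F

# Build a soft two-bin target for a ground-truth disparity, apply cross-entropy against
# the predicted distribution, and take the distribution's expectation as the regressed value.
def soft_target(gt, bins):
    step = bins[1] - bins[0]
    idx = int(torch.clamp((torch.tensor(gt) - bins[0]) // step, 0, len(bins) - 2))
    w = (gt - bins[idx]) / step                       # fractional (sub-pixel) offset
    target = torch.zeros(len(bins))
    target[idx], target[idx + 1] = 1.0 - w, w
    return target

bins = torch.linspace(-4.0, 4.0, steps=81)            # 0.1-pixel bin resolution (illustrative)
logits = torch.randn(81, requires_grad=True)           # per-pixel network output, mocked here
target = soft_target(1.23, bins)
loss = -(target * F.log_softmax(logits, dim=0)).sum()
pred_disparity = (F.softmax(logits, dim=0) * bins).sum()
loss.backward()
```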
+
+ comment: Accepted by IEEE Transactions on Computational Imaging +
+
+
+
+
+ + ♻ ☆ LDP: Language-driven Dual-Pixel Image Defocus Deblurring Network + + +
+ Recovering sharp images from dual-pixel (DP) pairs with disparity-dependent blur is a challenging task.&#13;
+Existing blur-map-based deblurring methods have demonstrated promising results.&#13;
+In this paper, we propose, to the best of our knowledge, the first framework that introduces the contrastive language-image pre-training framework (CLIP) to accurately estimate the blur map from a DP pair in an unsupervised manner.&#13;
+To achieve this, we first carefully design text prompts to enable CLIP to understand blur-related geometric prior knowledge from the DP pair.&#13;
+Then, we propose a format to input a stereo DP pair to CLIP without any fine-tuning, despite the fact that CLIP is pre-trained on monocular images.&#13;
+Given the estimated blur map, we introduce a blur-prior attention block, a blur-weighting loss, and a blur-aware loss to recover the all-in-focus image.&#13;
+Our method achieves state-of-the-art performance in extensive experiments.&#13;
+
+
+
+
+ + ♻ ☆ Improved Defect Detection and Classification Method for Advanced IC + Nodes by Using Slicing Aided Hyper Inference with Refinement Strategy + + +
+ In semiconductor manufacturing, lithography has often been the manufacturing step defining the smallest possible pattern dimensions.&#13;
+In recent years, progress has been made towards the high-NA (Numerical Aperture) EUVL (Extreme-Ultraviolet Lithography) paradigm, which promises to advance pattern shrinking (2 nm node and beyond).&#13;
+However, with high-NA, stochastic defects increase significantly and defect detection becomes more complex.&#13;
+Present defect inspection techniques (both non-machine learning and machine learning based) fail to achieve satisfactory performance at high-NA dimensions.&#13;
+In this work, we investigate the use of the Slicing Aided Hyper Inference (SAHI) framework for improving upon current techniques.&#13;
+Using SAHI, inference is performed on size-increased slices of the SEM images.&#13;
+This leads to the object detector's receptive field being more effective in capturing small defect instances.&#13;
+First, the performance on previously investigated semiconductor datasets is benchmarked across various configurations, and the SAHI approach is demonstrated to substantially enhance the detection of small defects, by approx. 2x.&#13;
+We then demonstrate that applying SAHI leads to flawless detection rates on a new test dataset containing scenarios not encountered during training, where previously trained models failed.&#13;
+Finally, we formulate an extension of SAHI that eliminates false-positive predictions without significantly reducing true-positive predictions.&#13;
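The core of slicing-aided inference is easy to sketch: run the detector on overlapping slices and shift the boxes back to full-image coordinates. The code below is a generic illustration with a stand-in detector, not the SAHI library API; the slice size, overlap, and the omitted NMS merge step are all simplifications.

```python
import numpy as np

def sliced_inference(image, detector, slice_size=256, overlap=0.25):
    # Run `detector` on overlapping slices and map boxes (x1, y1, x2, y2, score)
    # back to full-image coordinates. A full pipeline would merge duplicates with NMS.
    h, w = image.shape[:2]
    step = max(int(slice_size * (1 - overlap)), 1)

    def offsets(length):
        offs = list(range(0, max(length - slice_size, 0) + 1, step))
        return sorted(set(offs + [max(length - slice_size, 0)]))   # also cover the far edge

    boxes = []
    for y in offsets(h):
        for x in offsets(w):
            tile = image[y:y + slice_size, x:x + slice_size]
            for (x1, y1, x2, y2, score) in detector(tile):
                boxes.append((x1 + x, y1 + y, x2 + x, y2 + y, score))
    return boxes

# Toy usage with a stand-in detector that flags the brightest pixel of each slice.
def toy_detector(tile):
    y, x = np.unravel_index(np.argmax(tile), tile.shape)
    return [(x - 2, y - 2, x + 2, y + 2, float(tile[y, x]))]

detections = sliced_inference(np.random.rand(512, 512), toy_detector)
```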
+
+ comment: 12 pages, 9 figures, to be presented at International Conference on + Machine Intelligence with Applications (ICMIA), and to be published in + conference proceedings by AIP +
+
+
+
+
+ + ♻ ☆ Implicit Event-RGBD Neural SLAM + + +
+ Implicit neural SLAM has achieved remarkable progress recently. Nevertheless, +existing methods face significant challenges in non-ideal scenarios, such as +motion blur or lighting variation, which often leads to issues like convergence +failures, localization drifts, and distorted mapping. To address these +challenges, we propose $\textbf{EN-SLAM}$, the first event-RGBD implicit neural +SLAM framework, which effectively leverages the high rate and high dynamic +range advantages of event data for tracking and mapping. Specifically, EN-SLAM +proposes a differentiable CRF (Camera Response Function) rendering technique to +generate distinct RGB and event camera data via a shared radiance field, which +is optimized by learning a unified implicit representation with the captured +event and RGBD supervision. Moreover, based on the temporal difference property +of events, we propose a temporal aggregating optimization strategy for the +event joint tracking and global bundle adjustment, capitalizing on the +consecutive difference constraints of events, significantly enhancing tracking +accuracy and robustness. Finally, we construct the simulated dataset +$\textbf{DEV-Indoors}$ and real captured dataset $\textbf{DEV-Reals}$ +containing 6 scenes, 17 sequences with practical motion blur and lighting +changes for evaluations. Experimental results show that our method outperforms +the SOTA methods in both tracking ATE and mapping ACC with a real-time $17$ FPS +in various challenging environments. The code and dataset will be released +soon. + +
+
+
+
+
+ + ♻ ☆ Instance Segmentation under Occlusions via Location-aware Copy-Paste + Data Augmentation + + +
+ Occlusion is a long-standing problem in computer vision, particularly in +instance segmentation. ACM MMSports 2023 DeepSportRadar has introduced a +dataset that focuses on segmenting human subjects within a basketball context +and a specialized evaluation metric for occlusion scenarios. Given the modest +size of the dataset and the highly deformable nature of the objects to be +segmented, this challenge demands the application of robust data augmentation +techniques and wisely-chosen deep learning architectures. Our work (ranked 1st +in the competition) first proposes a novel data augmentation technique, capable +of generating more training samples with wider distribution. Then, we adopt a +new architecture - Hybrid Task Cascade (HTC) framework with CBNetV2 as backbone +and MaskIoU head to improve segmentation performance. Furthermore, we employ a +Stochastic Weight Averaging (SWA) training strategy to improve the model's +generalization. As a result, we achieve a remarkable occlusion score (OM) of +0.533 on the challenge dataset, securing the top-1 position on the leaderboard. +Source code is available at this +https://github.com/nguyendinhson-kaist/MMSports23-Seg-AutoID. + +
+
+
+
+
+ + ♻ ☆ ProtoCLIP: Prototypical Contrastive Language Image Pretraining + + +
+ Contrastive Language Image Pretraining (CLIP) has received widespread attention, since its learned representations can be transferred well to various downstream tasks.&#13;
+During the training process of the CLIP model, the InfoNCE objective aligns positive image-text pairs and separates negative ones.&#13;
+We show an underlying representation grouping effect during this process: the InfoNCE objective indirectly groups semantically similar representations together via randomly emerged within-modal anchors.&#13;
+Based on this understanding, in this paper, Prototypical Contrastive Language Image Pretraining (ProtoCLIP) is introduced to enhance such grouping by boosting its efficiency and increasing its robustness against the modality gap.&#13;
+Specifically, ProtoCLIP sets up prototype-level discrimination between image and text spaces, which efficiently transfers higher-level structural knowledge.&#13;
+Further, Prototypical Back Translation (PBT) is proposed to decouple representation grouping from representation alignment, resulting in effective learning of meaningful representations under a large modality gap.&#13;
+The PBT also enables us to introduce additional external teachers with richer prior language knowledge.&#13;
+ProtoCLIP is trained with an online episodic training strategy, which allows it to be scaled up to unlimited amounts of data.&#13;
+We train our ProtoCLIP on Conceptual Captions and achieve a +5.81% ImageNet linear probing improvement and a +2.01% ImageNet zero-shot classification improvement.&#13;
+On the larger YFCC-15M dataset, ProtoCLIP matches the performance of CLIP with 33% of the training time.&#13;
+Codes are available at https://github.com/megvii-research/protoclip.&#13;
+
+ comment: Accepted by IEEE Transactions on Neural Networks and Learning Systems + (TNNLS) +
+
+
+
+
+ + ♻ ☆ Revealing the preference for correcting separated aberrations in joint + optic-image design + + +
+ The joint design of the optical system and the downstream algorithm is a challenging and promising task.&#13;
+Due to the need to balance the global optimum of imaging systems against the computational cost of physical simulation, existing methods cannot achieve efficient joint design of complex systems such as smartphones and drones.&#13;
+In this work, starting from the perspective of the optical design, we characterize the optics with separated aberrations.&#13;
+Additionally, to bridge the hardware and software without gradients, an image simulation system is presented to reproduce the genuine imaging procedure of lenses with large fields of view.&#13;
+As for aberration correction, we propose a network to perceive and correct the spatially varying aberrations and validate its superiority over state-of-the-art methods.&#13;
+Comprehensive experiments reveal that the preference for correcting separated aberrations in joint design is as follows: longitudinal chromatic aberration, lateral chromatic aberration, spherical aberration, field curvature, and coma, with astigmatism coming last.&#13;
+Drawing on this preference, a 10% reduction in the total track length of the consumer-level mobile phone lens module is accomplished.&#13;
+Moreover, this procedure spares more space for manufacturing deviations, enabling extreme-quality computational photography.&#13;
+The optimization paradigm provides innovative insight into the practical joint design of sophisticated optical systems and post-processing algorithms.&#13;
+
+ comment: 19 pages +
+
+
+
+
+ + ♻ ☆ Scale-aware competition network for palmprint recognition + + +
+ Palmprint biometrics garner heightened attention in palm-scanning payment and +social security due to their distinctive attributes. However, prevailing +methodologies singularly prioritize texture orientation, neglecting the +significant texture scale dimension. We design an innovative network for +concurrently extracting intra-scale and inter-scale features to redress this +limitation. This paper proposes a scale-aware competitive network (SAC-Net), +which includes the Inner-Scale Competition Module (ISCM) and the Across-Scale +Competition Module (ASCM) to capture texture characteristics related to +orientation and scale. ISCM efficiently integrates learnable Gabor filters and +a self-attention mechanism to extract rich orientation data and discern +textures with long-range discriminative properties. Subsequently, ASCM +leverages a competitive strategy across various scales to effectively +encapsulate the competitive texture scale elements. By synergizing ISCM and +ASCM, our method adeptly characterizes palmprint features. Rigorous +experimentation across three benchmark datasets unequivocally demonstrates our +proposed approach's exceptional recognition performance and resilience relative +to state-of-the-art alternatives. + +
+
+
+
+
+ + ♻ ☆ Towards Plastic and Stable Exemplar-Free Incremental Learning: A + Dual-Learner Framework with Cumulative Parameter Averaging + + +
+ The dilemma between plasticity and stability presents a significant challenge +in Incremental Learning (IL), especially in the exemplar-free scenario where +accessing old-task samples is strictly prohibited during the learning of a new +task. A straightforward solution to this issue is learning and storing an +independent model for each task, known as Single Task Learning (STL). Despite +the linear growth in model storage with the number of tasks in STL, we +empirically discover that averaging these model parameters can potentially +preserve knowledge across all tasks. Inspired by this observation, we propose a +Dual-Learner framework with Cumulative Parameter Averaging (DLCPA). DLCPA +employs a dual-learner design: a plastic learner focused on acquiring new-task +knowledge and a stable learner responsible for accumulating all learned +knowledge. The knowledge from the plastic learner is transferred to the stable +learner via cumulative parameter averaging. Additionally, several task-specific +classifiers work in cooperation with the stable learner to yield the final +prediction. Specifically, when learning a new task, these modules are updated +in a cyclic manner: i) the plastic learner is initially optimized using a +self-supervised loss besides the supervised loss to enhance the feature +extraction robustness; ii) the stable learner is then updated with respect to +the plastic learner in a cumulative parameter averaging manner to maintain its +task-wise generalization; iii) the task-specific classifier is accordingly +optimized to align with the stable learner. Experimental results on CIFAR-100 +and Tiny-ImageNet show that DLCPA outperforms several state-of-the-art +exemplar-free baselines in both Task-IL and Class-IL settings. + +
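The cumulative parameter averaging at the heart of the framework above can be written compactly; the sketch below folds the plastic learner into the stable learner after each task and leaves out the task-specific classifiers and the self-supervised loss. The update follows the standard running-average formula and reflects my reading of the abstract, not the authors' code.

```python
import copy
import torch
import torch.nn as nn

@torch.no_grad()
def cumulative_average(stable: nn.Module, plastic: nn.Module, num_tasks_done: int):
    # Running average over tasks: stable <- ((n - 1) * stable + plastic) / n after task n.
    for p_s, p_p in zip(stable.parameters(), plastic.parameters()):
        p_s.mul_((num_tasks_done - 1) / num_tasks_done).add_(p_p, alpha=1.0 / num_tasks_done)

plastic = nn.Linear(8, 4)                 # stand-in for the plastic learner's backbone
stable = copy.deepcopy(plastic)           # stable learner starts from the same weights
# ... train `plastic` on task 1, then fold it into the stable learner:
cumulative_average(stable, plastic, num_tasks_done=1)   # equals `plastic` after the first task
```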
+
+
+
+
+ + ♻ ☆ Rethinking the Backward Propagation for Adversarial Transferability NeurIPS 2023 + + +
+ Transfer-based attacks generate adversarial examples on the surrogate model, +which can mislead other black-box models without access, making it promising to +attack real-world applications. Recently, several works have been proposed to +boost adversarial transferability, in which the surrogate model is usually +overlooked. In this work, we identify that non-linear layers (e.g., ReLU, +max-pooling, etc.) truncate the gradient during backward propagation, making +the gradient w.r.t. input image imprecise to the loss function. We hypothesize +and empirically validate that such truncation undermines the transferability of +adversarial examples. Based on these findings, we propose a novel method called +Backward Propagation Attack (BPA) to increase the relevance between the +gradient w.r.t. input image and loss function so as to generate adversarial +examples with higher transferability. Specifically, BPA adopts a non-monotonic +function as the derivative of ReLU and incorporates softmax with temperature to +smooth the derivative of max-pooling, thereby mitigating the information loss +during the backward propagation of gradients. Empirical results on the ImageNet +dataset demonstrate that not only does our method substantially boost the +adversarial transferability, but it is also general to existing transfer-based +attacks. Code is available at https://github.com/Trustworthy-AI-Group/RPA. + +
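To make the two ingredients concrete, here is a small sketch of (i) a ReLU whose backward pass uses a smooth surrogate derivative and (ii) a temperature-softmax pooling whose gradient is a softened version of max-pooling's. The sigmoid surrogate and the temperature values are assumptions of this sketch, not the exact functions used in BPA.

```python
import torch
import torch.nn.functional as F

# Keep the ReLU forward pass intact but backpropagate through a smooth surrogate derivative.
class SoftBackwardReLU(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return F.relu(x)

    @staticmethod
    def backward(ctx, grad_out):
        (x,) = ctx.saved_tensors
        surrogate = torch.sigmoid(4.0 * x)        # smooth stand-in for the 0/1 ReLU mask
        return grad_out * surrogate

# Replace hard max-pooling with a temperature-softmax weighted sum over each pooling window,
# so gradients are spread over the window instead of hitting only the argmax element.
def soft_maxpool2d(x, kernel_size=2, temperature=10.0):
    patches = F.unfold(x, kernel_size, stride=kernel_size)          # (N, C*k*k, L)
    n, _, l = patches.shape
    patches = patches.view(n, x.shape[1], kernel_size * kernel_size, l)
    weights = F.softmax(temperature * patches, dim=2)               # soft version of argmax
    pooled = (weights * patches).sum(dim=2)
    return pooled.view(n, x.shape[1], x.shape[2] // kernel_size, x.shape[3] // kernel_size)

# Toy usage: gradients now flow through the smoothed backward passes.
x = torch.randn(1, 3, 8, 8, requires_grad=True)
y = soft_maxpool2d(SoftBackwardReLU.apply(x))
y.sum().backward()
```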
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Extraction and Summarization of Explicit Video Content using Multi-Modal + Deep Learning + + +
+ With the increase in video-sharing platforms across the internet, it is +difficult for humans to moderate the data for explicit content. Hence, an +automated pipeline to scan through video data for explicit content has become +the need of the hour. We propose a novel pipeline that uses multi-modal deep +learning to first extract the explicit segments of input videos and then +summarize their content using text to determine its age appropriateness and age +rating. We also evaluate our pipeline's effectiveness in the end using standard +metrics. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Visual Attention-Prompted Prediction and Learning + + +
+ Explanation(attention)-guided learning is a method that enhances a model's +predictive power by incorporating human understanding during the training +phase. While attention-guided learning has shown promising results, it often +involves time-consuming and computationally expensive model retraining. To +address this issue, we introduce the attention-prompted prediction technique, +which enables direct prediction guided by the attention prompt without the need +for model retraining. However, this approach presents several challenges, +including: 1) How to incorporate the visual attention prompt into the model's +decision-making process and leverage it for future predictions even in the +absence of a prompt? and 2) How to handle the incomplete information from the +visual attention prompt? To tackle these challenges, we propose a novel +framework called Visual Attention-Prompted Prediction and Learning, which +seamlessly integrates visual attention prompts into the model's decision-making +process and adapts to images both with and without attention prompts for +prediction. To address the incomplete information of the visual attention +prompt, we introduce a perturbation-based attention map modification method. +Additionally, we propose an optimization-based mask aggregation method with a +new weight learning function for adaptive perturbed annotation aggregation in +the attention map modification process. Our overall framework is designed to +learn in an attention-prompt guided multi-task manner to enhance future +predictions even for samples without attention prompts and trained in an +alternating manner for better convergence. Extensive experiments conducted on +two datasets demonstrate the effectiveness of our proposed framework in +enhancing predictions for samples, both with and without provided prompts. + +
+
+
+
+
+ + ♻ ☆ GeoCLIP: Clip-Inspired Alignment between Locations and Images for + Effective Worldwide Geo-localization NeurIPS 2023 + + +
+ Worldwide Geo-localization aims to pinpoint the precise location of images
+taken anywhere on Earth. This task has considerable challenges due to the
+immense variation in geographic landscapes. Image-to-image retrieval-based
+approaches fail to solve this problem on a global scale as it is not feasible
+to construct a large gallery of images covering the entire world. Instead,
+existing approaches divide the globe into discrete geographic cells,
+transforming the problem into a classification task. However, their performance
+is limited by the predefined classes and often results in inaccurate
+localizations when an image's location significantly deviates from its class
+center. To overcome these limitations, we propose GeoCLIP, a novel
+CLIP-inspired Image-to-GPS retrieval approach that enforces alignment between
+the image and its corresponding GPS locations. GeoCLIP's location encoder
+models the Earth as a continuous function by employing positional encoding
+through random Fourier features and constructing a hierarchical representation
+that captures information at varying resolutions to yield a semantically rich
+high-dimensional feature suitable for use even beyond geo-localization. To the
+best of our knowledge, this is the first work employing GPS encoding for
+geo-localization. We demonstrate the efficacy of our method via extensive
+experiments and ablations on benchmark datasets. We achieve competitive
+performance with just 20% of the training data, highlighting its effectiveness
+even in limited-data settings. Furthermore, we qualitatively demonstrate
+geo-localization using a text query by leveraging the CLIP backbone of our
+image encoder. The project webpage is available at:
+https://vicentevivan.github.io/GeoCLIP
+
+&#13;
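+ A rough sketch of the two ingredients highlighted above, a random Fourier
+feature GPS encoder and a CLIP-style contrastive alignment loss (dimensions,
+frequencies, the single-scale encoder and the temperature are illustrative
+assumptions, not the paper's configuration), in PyTorch:
+
+import math
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class RFFLocationEncoder(nn.Module):
+    def __init__(self, num_freqs=256, sigma=1.0, dim=512):
+        super().__init__()
+        self.register_buffer("B", torch.randn(2, num_freqs) * sigma)
+        self.mlp = nn.Sequential(nn.Linear(2 * num_freqs, dim), nn.ReLU(),
+                                 nn.Linear(dim, dim))
+    def forward(self, latlon):                      # (N, 2) GPS coordinates, e.g. in radians
+        proj = 2 * math.pi * latlon @ self.B        # random Fourier projection
+        feats = torch.cat([torch.sin(proj), torch.cos(proj)], dim=-1)
+        return F.normalize(self.mlp(feats), dim=-1)
+
+def clip_style_loss(img_emb, gps_emb, temperature=0.07):
+    """Symmetric InfoNCE: each image should match the GPS embedding of the place
+    where it was taken and mismatch the other locations in the batch."""
+    logits = img_emb @ gps_emb.t() / temperature
+    targets = torch.arange(len(img_emb), device=img_emb.device)
+    return 0.5 * (F.cross_entropy(logits, targets) +
+                  F.cross_entropy(logits.t(), targets))
+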
+
+ comment: Accepted at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Artifacts Mapping: Multi-Modal Semantic Mapping for Object Detection and + 3D Localization + + +
+ Geometric navigation is nowadays a well-established field of robotics and the +research focus is shifting towards higher-level scene understanding, such as +Semantic Mapping. When a robot needs to interact with its environment, it must +be able to comprehend the contextual information of its surroundings. This work +focuses on classifying and localising objects within a map, which is under +construction (SLAM) or already built. To further explore this direction, we +propose a framework that can autonomously detect and localize predefined +objects in a known environment using a multi-modal sensor fusion approach +(combining RGB and depth data from an RGB-D camera and a lidar). The framework +consists of three key elements: understanding the environment through RGB data, +estimating depth through multi-modal sensor fusion, and managing artifacts +(i.e., filtering and stabilizing measurements). The experiments show that the +proposed framework can accurately detect 98% of the objects in the real sample +environment, without post-processing, while 85% and 80% of the objects were +mapped using the single RGBD camera or RGB + lidar setup respectively. The +comparison with single-sensor (camera or lidar) experiments is performed to +show that sensor fusion allows the robot to accurately detect near and far +obstacles, which would have been noisy or imprecise in a purely visual or +laser-based approach. + +
+
+ comment: Accepted to the 11th European Conference on Mobile Robots (ECMR) 2023 +
+
+
+
+
+ + ♻ ☆ Generalized super-resolution 4D Flow MRI $\unicode{x2013}$ using + ensemble learning to extend across the cardiovascular system + + +
+ 4D Flow Magnetic Resonance Imaging (4D Flow MRI) is a non-invasive +measurement technique capable of quantifying blood flow across the +cardiovascular system. While practical use is limited by spatial resolution and +image noise, incorporation of trained super-resolution (SR) networks has +potential to enhance image quality post-scan. However, these efforts have +predominantly been restricted to narrowly defined cardiovascular domains, with +limited exploration of how SR performance extends across the cardiovascular +system; a task aggravated by contrasting hemodynamic conditions apparent across +the cardiovasculature. The aim of our study was to explore the generalizability +of SR 4D Flow MRI using a combination of heterogeneous training sets and +dedicated ensemble learning. With synthetic training data generated across +three disparate domains (cardiac, aortic, cerebrovascular), varying +convolutional base and ensemble learners were evaluated as a function of domain +and architecture, quantifying performance on both in-silico and acquired +in-vivo data from the same three domains. Results show that both bagging and +stacking ensembling enhance SR performance across domains, accurately +predicting high-resolution velocities from low-resolution input data in-silico. +Likewise, optimized networks successfully recover native resolution velocities +from downsampled in-vivo data, as well as show qualitative potential in +generating denoised SR-images from clinical level input data. In conclusion, +our work presents a viable approach for generalized SR 4D Flow MRI, with +ensemble learning extending utility across various clinical areas of interest. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ On the Challenges and Perspectives of Foundation Models for Medical + Image Analysis + + +
+ This article discusses the opportunities, applications and future directions
+of large-scale pre-trained models, i.e., foundation models, for analyzing
+medical images. Medical foundation models have immense potential in solving a
+wide range of downstream tasks, as they can help to accelerate the development
+of accurate and robust models, reduce the large amounts of labeled data
+required, and preserve the privacy and confidentiality of patient data.
+Specifically, we illustrate the "spectrum" of medical foundation models,
+ranging from general vision models and modality-specific models to
+organ/task-specific models, highlighting their challenges, opportunities and
+applications. We also discuss how foundation models can be leveraged in
+downstream medical tasks to enhance the accuracy and efficiency of medical
+image analysis, leading to more precise diagnosis and treatment decisions.
+
+&#13;
+
+
+
+
+
+
+
+ + Information Retrieval 15 + +
+
+
+ + ☆ CSMeD: Bridging the Dataset Gap in Automated Citation Screening for + Systematic Literature Reviews NeurIPS 2023 + + +
+ Systematic literature reviews (SLRs) play an essential role in summarising, +synthesising and validating scientific evidence. In recent years, there has +been a growing interest in using machine learning techniques to automate the +identification of relevant studies for SLRs. However, the lack of standardised +evaluation datasets makes comparing the performance of such automated +literature screening systems difficult. In this paper, we analyse the citation +screening evaluation datasets, revealing that many of the available datasets +are either too small, suffer from data leakage or have limited applicability to +systems treating automated literature screening as a classification task, as +opposed to, for example, a retrieval or question-answering task. To address +these challenges, we introduce CSMeD, a meta-dataset consolidating nine +publicly released collections, providing unified access to 325 SLRs from the +fields of medicine and computer science. CSMeD serves as a comprehensive +resource for training and evaluating the performance of automated citation +screening models. Additionally, we introduce CSMeD-FT, a new dataset designed +explicitly for evaluating the full text publication screening task. To +demonstrate the utility of CSMeD, we conduct experiments and establish +baselines on new datasets. + +
+
+ comment: Accepted at NeurIPS 2023 Datasets and Benchmarks Track +
+
+
+
+
+ + ☆ InterPrompt: Interpretable Prompting for Interrelated Interpersonal Risk + Factors in Reddit Posts + + +
+ Mental health professionals and clinicians have observed the upsurge of
+mental disorders due to Interpersonal Risk Factors (IRFs). To simulate the
+human-in-the-loop triaging scenario for early detection of mental health
+disorders, we recognized textual indications to ascertain these IRFs: Thwarted
+Belongingness (TBe) and Perceived Burdensomeness (PBu) within personal
+narratives. In light of this, we use N-shot learning with the GPT-3 model on
+the IRF dataset and underscore the importance of fine-tuning the GPT-3 model to
+incorporate the context-specific sensitivity and the interconnectedness of
+textual cues that represent both IRFs.
+ In this paper, we introduce an Interpretable Prompting (InterPrompt) method
+to boost the attention mechanism by fine-tuning the GPT-3 model. This allows a
+more sophisticated level of language modification by adjusting the pre-trained
+weights. Our model learns to detect usual patterns and underlying connections
+across both the IRFs, which leads to better system-level explainability and
+trustworthiness. The results of our research demonstrate that all four variants
+of the GPT-3 model, when fine-tuned with InterPrompt, perform considerably
+better than the baseline methods, both in terms of classification and
+explanation generation.
+
+&#13;
+
+ comment: 5 pages +
+
+
+
+
+ + ☆ Linear-time online visibility graph transformation algorithm: for both + natural and horizontal visibility criteria + + +
+ Visibility graph (VG) transformation is a technique used to convert a time +series into a graph based on specific visibility criteria. It has attracted +increasing interest in the fields of time series analysis, forecasting, and +classification. Optimizing the VG transformation algorithm to accelerate the +process is a critical aspect of VG-related research, as it enhances the +applicability of VG transformation in latency-sensitive areas and conserves +computational resources. In the real world, many time series are presented in +the form of data streams. Despite the proposal of the concept of VG's online +functionality, previous studies have not thoroughly explored the acceleration +of VG transformation by leveraging the characteristics of data streams. In this +paper, we propose that an efficient online VG algorithm should adhere to two +criteria and develop a linear-time method, termed the LOT framework, for both +natural and horizontal visibility graph transformations in data stream +scenarios. Experiments are conducted on two datasets, comparing our approach +with five existing methods as baselines. The results demonstrate the validity +and promising computational efficiency of our framework. + +
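+ A compact illustration of how a horizontal visibility graph can be built
+online in linear amortised time with a monotonic stack (a generic sketch of the
+stack idea, not the LOT framework itself, and it covers only the horizontal
+criterion):
+
+def horizontal_visibility_edges(stream):
+    """Yield HVG edges (i, j) as soon as point j arrives.  Every index is pushed
+    and popped at most once, so the total cost is linear in the series length."""
+    y, stack = [], []                       # values seen so far; still-visible indices
+    for j, yj in enumerate(stream):
+        y.append(yj)
+        while stack and y[stack[-1]] < yj:  # strictly lower points are visible from j
+            yield stack.pop(), j
+        if stack:                           # the first point at least as high is visible too
+            yield stack[-1], j
+            if y[stack[-1]] == yj:          # ... but an equally high point is now blocked
+                stack.pop()
+        stack.append(j)
+
+# list(horizontal_visibility_edges([3, 1, 2, 4]))
+#   -> [(0, 1), (1, 2), (0, 2), (2, 3), (0, 3)]
+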
+
+
+
+
+ + ☆ Utilizing Language Models for Tour Itinerary Recommendation IJCAI 2023 + + +
+ Tour itinerary recommendation involves planning a sequence of relevant
+Points-of-Interest (POIs), which combines challenges from the fields of both
+Operations Research (OR) and Recommendation Systems (RS). As an OR problem,
+there is the need to maximize a certain utility (e.g., popularity of POIs in
+the tour) while adhering to some constraints (e.g., maximum time for the tour).
+As an RS problem, it is heavily related to the problem of filtering or ranking
+a subset of POIs that are relevant to a user and recommending them as part of
+an itinerary. In this paper, we explore the use of language models for the task
+of tour itinerary recommendation and planning. This task has the unique
+requirement of recommending personalized POIs relevant to users and planning
+these POIs as an itinerary that satisfies various constraints. We discuss some
+approaches in this area, such as using word embedding techniques like Word2Vec
+and GloVe for learning POI embeddings and transformer-based techniques like
+BERT for generating itineraries.
+
+&#13;
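+ As a small illustration of the embedding idea mentioned above, POI visit
+sequences can be fed to Word2Vec exactly like sentences (toy, hypothetical
+data; assumes gensim is installed):
+
+from gensim.models import Word2Vec
+
+trajectories = [                                   # one tourist's ordered POI visits per list
+    ["museum", "cathedral", "old_town", "riverfront"],
+    ["museum", "art_gallery", "old_town", "cafe"],
+    ["theme_park", "aquarium", "zoo"],
+    ["cathedral", "old_town", "riverfront", "cafe"],
+]
+
+# Skip-gram Word2Vec treats each trajectory as a "sentence" and each POI as a
+# "word", so POIs visited in similar contexts end up with nearby embeddings.
+model = Word2Vec(sentences=trajectories, vector_size=32, window=2,
+                 min_count=1, sg=1, epochs=200)
+
+print(model.wv.most_similar("museum", topn=3))     # candidate POIs for an itinerary
+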
+
+ comment: PMAI23 @IJCAI 2023 2nd International Workshop on Process Management + in the AI era +
+
+
+
+
+ + ☆ A Survey on Large Language Models for Personalized and Explainable + Recommendations + + +
+ In recent years, Recommender Systems (RS) have witnessed a transformative
+shift with the advent of Large Language Models (LLMs) in the field of Natural
+Language Processing (NLP). These models, such as OpenAI's GPT-3.5/4 and Llama
+from Meta, have demonstrated unprecedented capabilities in understanding and
+generating human-like text. This has led to a paradigm shift in the realm of
+personalized and explainable recommendations, as LLMs offer a versatile toolset
+for processing vast amounts of textual data to enhance user experiences. To
+provide a comprehensive understanding of the existing LLM-based recommendation
+systems, this survey aims to analyze how RS can benefit from LLM-based
+methodologies. Furthermore, we describe major challenges in Personalized
+Explanation Generating (PEG) tasks, namely the cold-start, unfairness, and bias
+problems in RS.
+
+&#13;
+
+
+
+
+ + ☆ Graph Neural Ordinary Differential Equations-based method for + Collaborative Filtering ICDM 2023 + + +
+ Graph Convolution Networks (GCNs) are widely considered state-of-the-art for +collaborative filtering. Although several GCN-based methods have been proposed +and achieved state-of-the-art performance in various tasks, they can be +computationally expensive and time-consuming to train if too many layers are +created. However, since the linear GCN model can be interpreted as a +differential equation, it is possible to transfer it to an ODE problem. This +inspired us to address the computational limitations of GCN-based models by +designing a simple and efficient NODE-based model that can skip some GCN layers +to reach the final state, thus avoiding the need to create many layers. In this +work, we propose a Graph Neural Ordinary Differential Equation-based method for +Collaborative Filtering (GODE-CF). This method estimates the final embedding by +utilizing the information captured by one or two GCN layers. To validate our +approach, we conducted experiments on multiple datasets. The results +demonstrate that our model outperforms competitive baselines, including +GCN-based models and other state-of-the-art CF methods. Notably, our proposed +GODE-CF model has several advantages over traditional GCN-based models. It is +simple, efficient, and has a fast training time, making it a practical choice +for real-world situations. + +
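+ A generic sketch of the neural-ODE view of linear GCN propagation described
+above (the paper's exact formulation, solver and embedding setup may differ;
+this only illustrates the general idea of replacing stacked layers with a few
+integration steps):
+
+import torch
+
+def normalized_adj(adj):
+    """Symmetrically normalised adjacency D^{-1/2} A D^{-1/2} (dense, for illustration)."""
+    deg = adj.sum(dim=1).clamp(min=1.0)
+    d = deg.pow(-0.5)
+    return d.unsqueeze(1) * adj * d.unsqueeze(0)
+
+def ode_propagate(emb0, adj_norm, t_end=1.0, steps=4):
+    """Treat layer stacking as the ODE dE/dt = (A_hat - I) E and integrate it with
+    a few explicit Euler steps, so the final state is reached without building
+    many discrete GCN layers."""
+    e, h = emb0, t_end / steps
+    for _ in range(steps):
+        e = e + h * (adj_norm @ e - e)
+    return e
+
+# emb0 stacks the user and item embeddings row-wise; adj is the (dense) adjacency
+# of the user-item interaction graph.
+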
+
+ comment: Accepted by ICDM 2023 +
+
+
+
+
+ + ☆ Adapting LLMs for Efficient, Personalized Information Retrieval: Methods + and Implications + + +
+ The advent of Large Language Models (LLMs) heralds a pivotal shift in online
+user interactions with information. Traditional Information Retrieval (IR)
+systems primarily relied on query-document matching, whereas LLMs excel in
+comprehending and generating human-like text, thereby enriching the IR
+experience significantly. While LLMs are often associated with chatbot
+functionalities, this paper extends the discussion to their explicit
+application in information retrieval. We explore methodologies to optimize the
+retrieval process, select optimal models, and effectively scale and orchestrate
+LLMs, aiming for cost-efficiency and enhanced result accuracy. A notable
+challenge, model hallucination (where the model yields inaccurate or
+misinterpreted data), is addressed alongside other model-specific hurdles. Our
+discourse extends to crucial considerations including user privacy, data
+optimization, and the necessity for system clarity and interpretability.
+Through a comprehensive examination, we unveil not only innovative strategies
+for integrating LLMs with IR systems, but also the consequential considerations
+that underline the need for a balanced approach aligned with user-centric
+principles.
+
+&#13;
+
+
+
+
+ + ☆ Equipping Pretrained Unconditional Music Transformers with Instrument + and Genre Controls + + +
+ The ''pretraining-and-finetuning'' paradigm has become a norm for training +domain-specific models in natural language processing and computer vision. In +this work, we aim to examine this paradigm for symbolic music generation +through leveraging the largest ever symbolic music dataset sourced from the +MuseScore forum. We first pretrain a large unconditional transformer model +using 1.5 million songs. We then propose a simple technique to equip this +pretrained unconditional music transformer model with instrument and genre +controls by finetuning the model with additional control tokens. Our proposed +representation offers improved high-level controllability and expressiveness +against two existing representations. The experimental results show that the +proposed model can successfully generate music with user-specified instruments +and genre. In a subjective listening test, the proposed model outperforms the +pretrained baseline model in terms of coherence, harmony, arrangement and +overall quality. + +
+
+
+
+
+ + ☆ Don't forget private retrieval: distributed private similarity search + for large language models + + +
+ While the flexible capabilities of large language models (LLMs) allow them to
+answer a range of queries based on existing learned knowledge, information
+retrieval to augment generation is an important tool to allow LLMs to answer
+questions on information not included in pre-training data. Such private
+information is increasingly being generated in a wide array of distributed
+contexts by organizations and individuals. Performing such information
+retrieval using neural embeddings of queries and documents leaks information
+about queries and database content unless both are stored locally. We present
+Private Retrieval Augmented Generation (PRAG), an approach that uses
+multi-party computation (MPC) to securely transmit queries to a distributed set
+of servers containing a privately constructed database to return top-k and
+approximate top-k documents. This is a first-of-its-kind approach to dense
+information retrieval that ensures no server observes a client's query or can
+see the database content. The approach introduces a novel MPC-friendly protocol
+for inverted file approximate search (IVF) that allows for fast document search
+over distributed and private data in sublinear communication complexity. This
+work presents new avenues through which data for use in LLMs can be accessed
+and used without needing to centralize or forgo privacy.
+
+&#13;
+
+
+
+
+ + ☆ Attribute-Aware Deep Hashing with Self-Consistency for Large-Scale + Fine-Grained Image Retrieval + + +
+ Our work focuses on tackling large-scale fine-grained image retrieval as
+ranking the images depicting the concept of interest (i.e., the same
+sub-category labels) highest based on the fine-grained details in the query. It
+is desirable to alleviate the challenges of both the fine-grained nature of
+small inter-class variations combined with large intra-class variations and the
+explosive growth of fine-grained data for such a practical task. In this paper,
+we propose attribute-aware hashing networks with self-consistency for
+generating attribute-aware hash codes to not only make the retrieval process
+efficient, but also establish explicit correspondences between hash codes and
+visual attributes. Specifically, based on the visual representations captured
+by attention, we develop an encoder-decoder structure network of a
+reconstruction task to distill high-level attribute-specific vectors from the
+appearance-specific visual representations in an unsupervised manner, without
+attribute annotations. Our models are also equipped with a feature
+decorrelation constraint upon these attribute vectors to strengthen their
+representative abilities. Then, driven by preserving original entities'
+similarity, the required hash codes can be generated from these
+attribute-specific vectors and thus become attribute-aware. Furthermore, to
+combat simplicity bias in deep hashing, we consider the model design from the
+perspective of the self-consistency principle and propose to further enhance
+models' self-consistency by equipping an additional image reconstruction path.
+Comprehensive quantitative experiments under diverse empirical settings on six
+fine-grained retrieval datasets and two generic retrieval datasets show the
+superiority of our models over competing methods.
+
+&#13;
+
+ comment: Accepted by IEEE TPAMI +
+
+
+
+
+ + ♻ ☆ Relphormer: Relational Graph Transformer for Knowledge Graph + Representations + + +
+ Transformers have achieved remarkable performance in widespread fields, +including natural language processing, computer vision and graph mining. +However, vanilla Transformer architectures have not yielded promising +improvements in the Knowledge Graph (KG) representations, where the +translational distance paradigm dominates this area. Note that vanilla +Transformer architectures struggle to capture the intrinsically heterogeneous +structural and semantic information of knowledge graphs. To this end, we +propose a new variant of Transformer for knowledge graph representations dubbed +Relphormer. Specifically, we introduce Triple2Seq which can dynamically sample +contextualized sub-graph sequences as the input to alleviate the heterogeneity +issue. We propose a novel structure-enhanced self-attention mechanism to encode +the relational information and keep the semantic information within entities +and relations. Moreover, we utilize masked knowledge modeling for general +knowledge graph representation learning, which can be applied to various +KG-based tasks including knowledge graph completion, question answering, and +recommendation. Experimental results on six datasets show that Relphormer can +obtain better performance compared with baselines. Code is available in +https://github.com/zjunlp/Relphormer. + +
+
+ comment: Neurocomputing 2023 +
+
+
+
+
+ + ♻ ☆ Budgeted Embedding Table For Recommender Systems WSDM 2024 + + +
+ At the heart of contemporary recommender systems (RSs) are latent factor
+models that provide a quality recommendation experience to users. These models
+use embedding vectors, which are typically of a uniform and fixed size, to
+represent users and items. As the number of users and items continues to grow,
+this design becomes inefficient and hard to scale. Recent lightweight embedding
+methods have enabled different users and items to have diverse embedding sizes,
+but are commonly subject to two major drawbacks. Firstly, they limit the
+embedding size search to optimizing a heuristic balancing the recommendation
+quality and the memory complexity, where the trade-off coefficient needs to be
+manually tuned for every memory budget requested. The implicitly enforced
+memory complexity term can even fail to cap the parameter usage, making the
+resultant embedding table fail to meet the memory budget strictly. Secondly,
+most solutions, especially reinforcement learning based ones, derive and
+optimize the embedding size for each user/item on an instance-by-instance
+basis, which impedes the search efficiency. In this paper, we propose Budgeted
+Embedding Table (BET), a novel method that generates table-level actions (i.e.,
+embedding sizes for all users and items) that are guaranteed to meet
+pre-specified memory budgets. Furthermore, by leveraging a set-based action
+formulation and engaging set representation learning, we present an innovative
+action search strategy powered by an action fitness predictor that efficiently
+evaluates each table-level action. Experiments have shown state-of-the-art
+performance on two real-world datasets when BET is paired with three popular
+recommender models under different memory budgets.
+
+&#13;
+
+ comment: Accepted by WSDM 2024 +
+
+
+
+
+ + ♻ ☆ Meta-optimized Contrastive Learning for Sequential Recommendation SIGIR2023 + + +
+ Contrastive Learning (CL) has emerged as a promising approach to address the
+challenge of sparse and noisy recommendation data. Although promising results
+have been achieved, most existing CL methods only perform either hand-crafted
+data augmentation or model augmentation to generate contrastive pairs, and a
+proper augmentation operation has to be found for each dataset, which makes the
+model hard to generalize. Additionally, since insufficient input data may lead
+the encoder to learn collapsed embeddings, these CL methods expect a relatively
+large amount of training data (e.g., a large batch size or memory bank) to
+contrast. However, not all contrastive pairs are always informative and
+discriminative enough for the training process. Therefore, a more general
+CL-based recommendation model called Meta-optimized Contrastive Learning for
+sequential Recommendation (MCLRec) is proposed in this work. By applying both
+data augmentation and learnable model augmentation operations, this work
+innovates the standard CL framework by contrasting data- and model-augmented
+views for adaptively capturing the informative features hidden in stochastic
+data augmentation. Moreover, MCLRec utilizes a meta-learning manner to guide
+the updating of the model augmenters, which helps to improve the quality of
+contrastive pairs without enlarging the amount of input data. Finally, a
+contrastive regularization term is considered to encourage the augmentation
+model to generate more informative augmented views and to avoid overly similar
+contrastive pairs within the meta updating. The experimental results on
+commonly used datasets validate the effectiveness of MCLRec.
+
+&#13;
+
+ comment: 11 Pages,8 figures,SIGIR2023 +
+
+
+
+
+ + ♻ ☆ Multi-Resolution Diffusion for Privacy-Sensitive Recommender Systems + + +
+ While recommender systems have become an integral component of the Web +experience, their heavy reliance on user data raises privacy and security +concerns. Substituting user data with synthetic data can address these +concerns, but accurately replicating these real-world datasets has been a +notoriously challenging problem. Recent advancements in generative AI have +demonstrated the impressive capabilities of diffusion models in generating +realistic data across various domains. In this work we introduce a Score-based +Diffusion Recommendation Module (SDRM), which captures the intricate patterns +of real-world datasets required for training highly accurate recommender +systems. SDRM allows for the generation of synthetic data that can replace +existing datasets to preserve user privacy, or augment existing datasets to +address excessive data sparsity. Our method outperforms competing baselines +such as generative adversarial networks, variational autoencoders, and recently +proposed diffusion models in synthesizing various datasets to replace or +augment the original data by an average improvement of 4.30% in Recall@$k$ and +4.65% in NDCG@$k$. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ ChoralSynth: Synthetic Dataset of Choral Singing + + +
+ Choral singing, a widely practiced form of ensemble singing, lacks
+comprehensive datasets in the realm of Music Information Retrieval (MIR)
+research, due to challenges arising from the requirement to curate multitrack
+recordings. To address this, we devised a novel methodology, leveraging
+state-of-the-art synthesizers to create and curate quality renditions. The
+scores were sourced from the Choral Public Domain Library (CPDL). This work is
+done in collaboration with a diverse team of musicians, software engineers and
+researchers. The resulting dataset, complete with its associated metadata, and
+the methodology are released as part of this work, opening up new avenues for
+exploration and advancement in the field of singing voice research.
+
+&#13;
+
+ comment: Dataset Link: https://doi.org/10.5281/zenodo.10137883 +
+
+
+
+
+
+
+
+ + Machine Learning 169 + +
+
+
+ + ☆ Physics-guided Shape-from-Template: Monocular Video Perception through + Neural Surrogate Models + + +
+ 3D reconstruction of dynamic scenes is a long-standing problem in computer
+graphics and becomes increasingly difficult as less information is available.
+Shape-from-Template (SfT) methods aim to reconstruct a template-based geometry
+from RGB images or video sequences, often leveraging just a single monocular
+camera without depth information, such as regular smartphone recordings.
+Unfortunately, existing reconstruction methods are either unphysical and noisy
+or slow in optimization. To solve this problem, we propose a novel SfT
+reconstruction algorithm for cloth using a pre-trained neural surrogate model
+that is fast to evaluate, stable, and produces smooth reconstructions due to a
+regularizing physics simulation. Differentiable rendering of the simulated mesh
+enables pixel-wise comparisons between the reconstruction and a target video
+sequence that can be used for a gradient-based optimization procedure to
+extract not only shape information but also physical parameters such as
+stretching, shearing, or bending stiffness of the cloth. This makes it possible
+to retain a precise, stable, and smooth reconstructed geometry while reducing
+the runtime by a factor of 400-500 compared to $\phi$-SfT, a state-of-the-art
+physics-based SfT approach.
+
+&#13;
+
+
+
+
+ + ☆ Mechanistically analyzing the effects of fine-tuning on procedurally + defined tasks + + +
+ Fine-tuning large pre-trained models has become the de facto strategy for
+developing both task-specific and general-purpose machine learning systems,
+including developing models that are safe to deploy. Despite its clear
+importance, there has been minimal work that explains how fine-tuning alters
+the underlying capabilities learned by a model during pretraining: does
+fine-tuning yield entirely novel capabilities or does it just modulate existing
+ones? We address this question empirically in synthetic, controlled settings
+where we can use mechanistic interpretability tools (e.g., network pruning and
+probing) to understand how the model's underlying capabilities are changing. We
+perform an extensive analysis of the effects of fine-tuning in these settings,
+and show that: (i) fine-tuning rarely alters the underlying model capabilities;
+(ii) a minimal transformation, which we call a 'wrapper', is typically learned
+on top of the underlying model capabilities, creating the illusion that they
+have been modified; and (iii) further fine-tuning on a task where such hidden
+capabilities are relevant leads to sample-efficient 'revival' of the
+capability, i.e., the model begins reusing this capability after only a few
+gradient steps. This indicates that practitioners can unintentionally remove a
+model's safety wrapper merely by fine-tuning it on, e.g., a superficially
+unrelated downstream task. We additionally perform analysis on language models
+trained on the TinyStories dataset to support our claims in a more realistic
+setup.
+
+&#13;
+
+
+
+
+ + ☆ Optimality in Mean Estimation: Beyond Worst-Case, Beyond Sub-Gaussian, + and Beyond $1+α$ Moments NeurIPS 2023 + + +
+ There is growing interest in improving our algorithmic understanding of +fundamental statistical problems such as mean estimation, driven by the goal of +understanding the limits of what we can extract from valuable data. The state +of the art results for mean estimation in $\mathbb{R}$ are 1) the optimal +sub-Gaussian mean estimator by [LV22], with the tight sub-Gaussian constant for +all distributions with finite but unknown variance, and 2) the analysis of the +median-of-means algorithm by [BCL13] and a lower bound by [DLLO16], +characterizing the big-O optimal errors for distributions for which only a +$1+\alpha$ moment exists for $\alpha \in (0,1)$. Both results, however, are +optimal only in the worst case. We initiate the fine-grained study of the mean +estimation problem: Can algorithms leverage useful features of the input +distribution to beat the sub-Gaussian rate, without explicit knowledge of such +features? + We resolve this question with an unexpectedly nuanced answer: "Yes in limited +regimes, but in general no". For any distribution $p$ with a finite mean, we +construct a distribution $q$ whose mean is well-separated from $p$'s, yet $p$ +and $q$ are not distinguishable with high probability, and $q$ further +preserves $p$'s moments up to constants. The main consequence is that no +reasonable estimator can asymptotically achieve better than the sub-Gaussian +error rate for any distribution, matching the worst-case result of [LV22]. More +generally, we introduce a new definitional framework to analyze the +fine-grained optimality of algorithms, which we call "neighborhood optimality", +interpolating between the unattainably strong "instance optimality" and the +trivially weak "admissibility" definitions. Applying the new framework, we show +that median-of-means is neighborhood optimal, up to constant factors. It is +open to find a neighborhood-optimal estimator without constant factor +slackness. + +
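+ For reference, the median-of-means estimator analysed above takes only a few
+lines (a textbook sketch, not tied to the paper's constructions):
+
+import numpy as np
+
+def median_of_means(samples, num_blocks, seed=0):
+    """Split the sample into blocks, average each block, and return the median of
+    the block means; the classical estimator for heavy-tailed mean estimation."""
+    x = np.random.default_rng(seed).permutation(np.asarray(samples, dtype=float))
+    return float(np.median([b.mean() for b in np.array_split(x, num_blocks)]))
+
+# e.g. heavy-tailed data:
+# median_of_means(np.random.standard_t(df=2, size=10_000), num_blocks=16)
+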
+
+ comment: 27 pages, to appear in NeurIPS 2023. Abstract shortened to fit arXiv + limit +
+
+
+
+
+ + ☆ Quantifying Impairment and Disease Severity Using AI Models Trained on + Healthy Subjects + + +
+ Automatic assessment of impairment and disease severity is a key challenge in +data-driven medicine. We propose a novel framework to address this challenge, +which leverages AI models trained exclusively on healthy individuals. The +COnfidence-Based chaRacterization of Anomalies (COBRA) score exploits the +decrease in confidence of these models when presented with impaired or diseased +patients to quantify their deviation from the healthy population. We applied +the COBRA score to address a key limitation of current clinical evaluation of +upper-body impairment in stroke patients. The gold-standard Fugl-Meyer +Assessment (FMA) requires in-person administration by a trained assessor for +30-45 minutes, which restricts monitoring frequency and precludes physicians +from adapting rehabilitation protocols to the progress of each patient. The +COBRA score, computed automatically in under one minute, is shown to be +strongly correlated with the FMA on an independent test cohort for two +different data modalities: wearable sensors ($\rho = 0.845$, 95% CI +[0.743,0.908]) and video ($\rho = 0.746$, 95% C.I [0.594, 0.847]). To +demonstrate the generalizability of the approach to other conditions, the COBRA +score was also applied to quantify severity of knee osteoarthritis from +magnetic-resonance imaging scans, again achieving significant correlation with +an independent clinical assessment ($\rho = 0.644$, 95% C.I [0.585,0.696]). + +
+
+ comment: 32 pages, 10 figures +
+
+
+
+
+ + ☆ High-resolution Image-based Malware Classification using Multiple + Instance Learning + + +
+ This paper proposes a novel method of classifying malware into families using +high-resolution greyscale images and multiple instance learning to overcome +adversarial binary enlargement. Current methods of visualisation-based malware +classification largely rely on lossy transformations of inputs such as resizing +to handle the large, variable-sized images. Through empirical analysis and +experimentation, it is shown that these approaches cause crucial information +loss that can be exploited. The proposed solution divides the images into +patches and uses embedding-based multiple instance learning with a +convolutional neural network and an attention aggregation function for +classification. The implementation is evaluated on the Microsoft Malware +Classification dataset and achieves accuracies of up to $96.6\%$ on +adversarially enlarged samples compared to the baseline of $22.8\%$. The Python +code is available online at https://github.com/timppeters/MIL-Malware-Images . + +
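+ A minimal sketch of embedding-based multiple instance learning with attention
+aggregation as described above (the patch encoder, dimensions and number of
+classes are illustrative assumptions, not the paper's configuration), in
+PyTorch:
+
+import torch
+import torch.nn as nn
+
+class AttentionMIL(nn.Module):
+    def __init__(self, emb_dim=128, attn_dim=64, num_classes=9):
+        super().__init__()
+        self.patch_encoder = nn.Sequential(          # embeds each greyscale patch
+            nn.Conv2d(1, 16, 3, padding=1), nn.ReLU(), nn.AdaptiveAvgPool2d(4),
+            nn.Flatten(), nn.Linear(16 * 4 * 4, emb_dim), nn.ReLU())
+        self.attention = nn.Sequential(              # scores each patch's relevance
+            nn.Linear(emb_dim, attn_dim), nn.Tanh(), nn.Linear(attn_dim, 1))
+        self.classifier = nn.Linear(emb_dim, num_classes)
+
+    def forward(self, patches):                      # (num_patches, 1, H, W): one image's bag
+        h = self.patch_encoder(patches)              # (num_patches, emb_dim)
+        a = torch.softmax(self.attention(h), dim=0)  # attention weights over the bag
+        bag = (a * h).sum(dim=0)                     # weighted sum -> bag embedding
+        return self.classifier(bag), a.squeeze(-1)
+
+# logits, patch_weights = AttentionMIL()(torch.randn(300, 1, 32, 32))
+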
+
+ comment: 14 pages, 13 figures, 2 tables +
+
+
+
+
+ + ☆ SelfOcc: Self-Supervised Vision-Based 3D Occupancy Prediction + + +
+ 3D occupancy prediction is an important task for the robustness of +vision-centric autonomous driving, which aims to predict whether each point is +occupied in the surrounding 3D space. Existing methods usually require 3D +occupancy labels to produce meaningful results. However, it is very laborious +to annotate the occupancy status of each voxel. In this paper, we propose +SelfOcc to explore a self-supervised way to learn 3D occupancy using only video +sequences. We first transform the images into the 3D space (e.g., bird's eye +view) to obtain 3D representation of the scene. We directly impose constraints +on the 3D representations by treating them as signed distance fields. We can +then render 2D images of previous and future frames as self-supervision signals +to learn the 3D representations. We propose an MVS-embedded strategy to +directly optimize the SDF-induced weights with multiple depth proposals. Our +SelfOcc outperforms the previous best method SceneRF by 58.7% using a single +frame as input on SemanticKITTI and is the first self-supervised work that +produces reasonable 3D occupancy for surround cameras on Occ3D. SelfOcc +produces high-quality depth and achieves state-of-the-art results on novel +depth synthesis, monocular depth estimation, and surround-view depth estimation +on the SemanticKITTI, KITTI-2015, and nuScenes, respectively. Code: +https://github.com/huang-yh/SelfOcc. + +
+
+ comment: Code is available at: https://github.com/huang-yh/SelfOcc +
+
+
+
+
+ + ☆ Learning to Optimise Wind Farms with Graph Transformers + + +
+ This work proposes a novel data-driven model capable of providing accurate +predictions for the power generation of all wind turbines in wind farms of +arbitrary layout, yaw angle configurations and wind conditions. The proposed +model functions by encoding a wind farm into a fully-connected graph and +processing the graph representation through a graph transformer. The graph +transformer surrogate is shown to generalise well and is able to uncover latent +structural patterns within the graph representation of wind farms. It is +demonstrated how the resulting surrogate model can be used to optimise yaw +angle configurations using genetic algorithms, achieving similar levels of +accuracy to industrially-standard wind farm simulation tools while only taking +a fraction of the computational cost. + +
+
+
+
+
+ + ☆ Image Transformation for IoT Time-Series Data: A Review + + +
+ In the era of the Internet of Things (IoT), where smartphones, built-in
+systems, wireless sensors, and nearly every smart device connect through local
+networks or the internet, billions of smart things communicate with each other
+and generate vast amounts of time-series data. As IoT time-series data is
+high-dimensional and high-frequency, time-series classification or regression
+has been a challenging issue in IoT. However, it is hard to explore the hidden
+dynamic patterns and trends in time series. Recent studies show that
+transforming IoT data into images improves the performance of the learning
+model. In this paper, we present a review of the studies that use image
+transformation/encoding techniques in the IoT domain. We examine the studies
+according to their encoding techniques, data types, and application areas.
+Lastly, we emphasize the challenges and future directions of image
+transformation.
+
+&#13;
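+ One widely used encoding covered by such reviews, the Gramian Angular
+Summation Field, fits in a few lines (an illustrative sketch; libraries such as
+pyts provide maintained implementations of this and related encodings):
+
+import numpy as np
+
+def gramian_angular_summation_field(series):
+    """Rescale the window to [-1, 1], map each value to an angle, and build the
+    matrix of cos(phi_i + phi_j); the result is an image a CNN can consume."""
+    x = np.asarray(series, dtype=float)
+    x = 2 * (x - x.min()) / (x.max() - x.min() + 1e-12) - 1.0
+    phi = np.arccos(np.clip(x, -1.0, 1.0))
+    return np.cos(phi[:, None] + phi[None, :])       # (N, N) GASF image
+
+# img = gramian_angular_summation_field(sensor_window)   # img.shape == (N, N)
+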
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Content Augmented Graph Neural Networks + + +
+ In recent years, graph neural networks (GNNs) have become a popular tool for
+solving various problems over graphs. In these models, the link structure of
+the graph is typically exploited and nodes' embeddings are iteratively updated
+based on adjacent nodes. Nodes' contents are used solely in the form of feature
+vectors, serving as the nodes' first-layer embeddings. However, the filters or
+convolutions applied to these initial embeddings over successive
+iterations/layers cause their impact to diminish, so they contribute
+insignificantly to the final embeddings. In order to address this issue, in
+this paper we propose augmenting nodes' embeddings with embeddings generated
+from their content at higher GNN layers. More precisely, we propose models
+wherein a structural embedding using a GNN and a content embedding are computed
+for each node. These two are combined using a combination layer to form the
+embedding of a node at a given layer. We suggest methods such as using an
+auto-encoder or building a content graph to generate content embeddings.
+Finally, through experiments on several real-world datasets, we demonstrate the
+high accuracy and performance of our models.
+
+&#13;
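+ A rough sketch of the kind of combination layer described above, mixing a
+structural (message-passing) embedding with a content embedding per node (the
+gated sum below is an assumed combination rule; the abstract mentions
+auto-encoder and content-graph based ways of producing the content embedding),
+in PyTorch:
+
+import torch
+import torch.nn as nn
+
+class ContentAugmentedLayer(nn.Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.w_struct = nn.Linear(dim, dim)
+        self.w_content = nn.Linear(dim, dim)
+        self.gate = nn.Linear(2 * dim, dim)
+
+    def forward(self, h, adj_norm, content):
+        struct = torch.relu(self.w_struct(adj_norm @ h))    # structural (GCN-style) part
+        cont = torch.relu(self.w_content(content))          # content part, fixed per node
+        g = torch.sigmoid(self.gate(torch.cat([struct, cont], dim=-1)))
+        return g * struct + (1 - g) * cont                  # learned per-dimension mix
+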
+
+
+
+
+ + ☆ Exploring Graph Classification Techniques Under Low Data Constraints: A + Comprehensive Study + + +
+ This survey paper presents a brief overview of recent research on graph data +augmentation and few-shot learning. It covers various techniques for graph data +augmentation, including node and edge perturbation, graph coarsening, and graph +generation, as well as the latest developments in few-shot learning, such as +meta-learning and model-agnostic meta-learning. The paper explores these areas +in depth and delves into further sub classifications. Rule based approaches and +learning based approaches are surveyed under graph augmentation techniques. +Few-Shot Learning on graphs is also studied in terms of metric learning +techniques and optimization-based techniques. In all, this paper provides an +extensive array of techniques that can be employed in solving graph processing +problems faced in low-data scenarios. + +
+
+
+
+
+ + ☆ Soft Random Sampling: A Theoretical and Empirical Analysis + + +
+ Soft random sampling (SRS) is a simple yet effective approach for efficient +training of large-scale deep neural networks when dealing with massive data. +SRS selects a subset uniformly at random with replacement from the full data +set in each epoch. In this paper, we conduct a theoretical and empirical +analysis of SRS. First, we analyze its sampling dynamics including data +coverage and occupancy. Next, we investigate its convergence with non-convex +objective functions and give the convergence rate. Finally, we provide its +generalization performance. We empirically evaluate SRS for image recognition +on CIFAR10 and automatic speech recognition on Librispeech and an in-house +payload dataset to demonstrate its effectiveness. Compared to existing +coreset-based data selection methods, SRS offers a better accuracy-efficiency +trade-off. Especially on real-world industrial scale data sets, it is shown to +be a powerful training strategy with significant speedup and competitive +performance with almost no additional computing cost. + +
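+ The sampling step itself is tiny; a sketch follows (the subset-size ratio is
+an assumption of this illustration, since the abstract only states that the
+subset is drawn uniformly at random with replacement each epoch):
+
+import numpy as np
+
+def soft_random_sampling_epoch(dataset_size, selection_ratio, rng=None):
+    """Draw round(selection_ratio * N) indices uniformly at random *with
+    replacement* from the full dataset; train only on this subset for the epoch."""
+    rng = rng or np.random.default_rng()
+    n_pick = int(round(selection_ratio * dataset_size))
+    return rng.integers(0, dataset_size, size=n_pick)   # duplicates allowed, many indices unseen
+
+# for epoch in range(num_epochs):
+#     idx = soft_random_sampling_epoch(len(train_set), 0.3)
+#     train_one_epoch(train_set, idx)
+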
+
+
+
+
+ + ☆ Attacking Motion Planners Using Adversarial Perception Errors + + +
+ Autonomous driving (AD) systems are often built and tested in a modular +fashion, where the performance of different modules is measured using +task-specific metrics. These metrics should be chosen so as to capture the +downstream impact of each module and the performance of the system as a whole. +For example, high perception quality should enable prediction and planning to +be performed safely. Even though this is true in general, we show here that it +is possible to construct planner inputs that score very highly on various +perception quality metrics but still lead to planning failures. In an analogy +to adversarial attacks on image classifiers, we call such inputs +\textbf{adversarial perception errors} and show they can be systematically +constructed using a simple boundary-attack algorithm. We demonstrate the +effectiveness of this algorithm by finding attacks for two different black-box +planners in several urban and highway driving scenarios using the CARLA +simulator. Finally, we analyse the properties of these attacks and show that +they are isolated in the input space of the planner, and discuss their +implications for AD system deployment and testing. + +
+
+
+
+
+ + ☆ minimax: Efficient Baselines for Autocurricula in JAX + + +
+ Unsupervised environment design (UED) is a form of automatic curriculum +learning for training robust decision-making agents to zero-shot transfer into +unseen environments. Such autocurricula have received much interest from the RL +community. However, UED experiments, based on CPU rollouts and GPU model +updates, have often required several weeks of training. This compute +requirement is a major obstacle to rapid innovation for the field. This work +introduces the minimax library for UED training on accelerated hardware. Using +JAX to implement fully-tensorized environments and autocurriculum algorithms, +minimax allows the entire training loop to be compiled for hardware +acceleration. To provide a petri dish for rapid experimentation, minimax +includes a tensorized grid-world based on MiniGrid, in addition to reusable +abstractions for conducting autocurricula in procedurally-generated +environments. With these components, minimax provides strong UED baselines, +including new parallelized variants, which achieve over 120$\times$ speedups in +wall time compared to previous implementations when training with equal batch +sizes. The minimax library is available under the Apache 2.0 license at +https://github.com/facebookresearch/minimax. + +
+
+ comment: Presented at ALOE 2023 +
+
+
+
+
+ + ☆ Attacks of fairness in Federated Learning + + +
+ Federated Learning is an important emerging distributed training paradigm +that keeps data private on clients. It is now well understood that by +controlling only a small subset of FL clients, it is possible to introduce a +backdoor to a federated learning model, in the presence of certain attributes. +In this paper, we present a new type of attack that compromises the fairness of +the trained model. Fairness is understood to be the attribute-level performance +distribution of a trained model. It is particularly salient in domains where, +for example, skewed accuracy discrimination between subpopulations could have +disastrous consequences. We find that by employing a threat model similar to +that of a backdoor attack, an attacker is able to influence the aggregated +model to have an unfair performance distribution between any given set of +attributes. Furthermore, we find that this attack is possible by controlling +only a single client. While combating naturally induced unfairness in FL has +previously been discussed in depth, its artificially induced kind has been +neglected. We show that defending against attacks on fairness should be a +critical consideration in any situation where unfairness in a trained model +could benefit a user who participated in its training. + +
+
+
+
+
+ + ☆ Regression-Based Analysis of Multimodal Single-Cell Data Integration + Strategies + + +
+ Multimodal single-cell technologies enable the simultaneous collection of
+diverse data types from individual cells, enhancing our understanding of
+cellular states. However, the integration of these data types and the modeling
+of the interrelationships between modalities present substantial computational
+and analytical challenges in disease biomarker detection and drug discovery.
+Established practices rely on isolated methodologies to investigate individual
+molecular aspects separately, often resulting in inaccurate analyses. To
+address these obstacles, distinct machine learning techniques are leveraged,
+each modeling one step of the co-variation from DNA to RNA, and finally to
+surface proteins, in single cells during hematopoietic stem cell development,
+which simplifies understanding of the underlying cellular mechanisms and immune
+responses. Experiments conducted on a curated subset of a 300,000-cell time
+course dataset highlight the exceptional performance of Echo State Networks,
+which achieve state-of-the-art correlation scores of 0.94 and 0.895 on the
+Multi-omic and CiteSeq datasets, respectively. Beyond the confines of this
+study, these findings hold promise for advancing comprehension of cellular
+differentiation and function, leveraging the potential of machine learning.
+
+&#13;
+
+
+
+
+ + ☆ Fair Text Classification with Wasserstein Independence + + +
+ Group fairness is a central research topic in text classification, where +reaching fair treatment between sensitive groups (e.g. women vs. men) remains +an open challenge. This paper presents a novel method for mitigating biases in +neural text classification, agnostic to the model architecture. Considering the +difficulty to distinguish fair from unfair information in a text encoder, we +take inspiration from adversarial training to induce Wasserstein independence +between representations learned to predict our target label and the ones +learned to predict some sensitive attribute. Our approach provides two +significant advantages. Firstly, it does not require annotations of sensitive +attributes in both testing and training data. This is more suitable for +real-life scenarios compared to existing methods that require annotations of +sensitive attributes at train time. Second, our approach exhibits a comparable +or better fairness-accuracy trade-off compared to existing methods. + +
+
+
+
+
+ + ☆ On the Out-of-Distribution Coverage of Combining Split Conformal + Prediction and Bayesian Deep Learning + + +
+ Bayesian deep learning and conformal prediction are two methods that have
+been used to convey uncertainty and increase safety in machine learning
+systems. We focus on combining Bayesian deep learning with split conformal
+prediction and how this combination affects out-of-distribution coverage,
+particularly in the case of multiclass image classification. We suggest that if
+the model is generally underconfident on the calibration set, then the
+resultant conformal sets may exhibit worse out-of-distribution coverage
+compared to simple predictive credible sets. Conversely, if the model is
+overconfident on the calibration set, the use of conformal prediction may
+improve out-of-distribution coverage. We evaluate prediction sets obtained by
+combining split conformal methods and neural networks trained with (i)
+stochastic gradient descent, (ii) deep ensembles, and (iii) mean-field
+variational inference. Our results suggest that combining Bayesian deep
+learning models with split conformal prediction can, in some cases, cause
+unintended consequences such as reducing out-of-distribution coverage.
+
+&#13;
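+ For concreteness, the split conformal construction being combined with the
+Bayesian predictors reads roughly as follows (a standard textbook recipe using
+the simple 1 - p(true class) score; the paper may use a different score
+function):
+
+import numpy as np
+
+def split_conformal_sets(cal_probs, cal_labels, test_probs, alpha=0.1):
+    """cal_probs/test_probs: (n, K) predicted class probabilities (e.g. from SGD
+    training, a deep ensemble, or variational inference); cal_labels: int labels.
+    Returns a boolean (m, K) matrix whose True entries form each prediction set."""
+    n = len(cal_labels)
+    scores = 1.0 - cal_probs[np.arange(n), cal_labels]      # nonconformity on calibration data
+    q_level = min(np.ceil((n + 1) * (1 - alpha)) / n, 1.0)  # finite-sample correction
+    qhat = np.quantile(scores, q_level, method="higher")
+    return test_probs >= 1.0 - qhat
+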
+
+ comment: 26 pages, 18 figures +
+
+
+
+
+ + ☆ Managing ML-Based Application Non-Functional Behavior: A Multi-Model + Approach + + +
+ Modern applications are increasingly driven by Machine Learning (ML) models +whose non-deterministic behavior is affecting the entire application life cycle +from design to operation. The pervasive adoption of ML is urgently calling for +approaches that guarantee a stable non-functional behavior of ML-based +applications over time and across model changes. To this aim, non-functional +properties of ML models, such as privacy, confidentiality, fairness, and +explainability, must be monitored, verified, and maintained. This need is even +more pressing when modern applications operate in the edge-cloud continuum, +increasing their complexity and dynamicity. Existing approaches mostly focus on +i) implementing classifier selection solutions according to the functional +behavior of ML models, ii) finding new algorithmic solutions to this need, such +as continuous re-training. In this paper, we propose a multi-model approach +built on dynamic classifier selection, where multiple ML models showing similar +non-functional properties are made available to the application and one model +is selected over time according to (dynamic and unpredictable) contextual +changes. Our solution goes beyond the state of the art by providing an +architectural and methodological approach that continuously guarantees a stable +non-functional behavior of ML-based applications, is applicable to different ML +models, and is driven by non-functional properties assessed on the models +themselves. It consists of a two-step process working during application +operation, where model assessment verifies non-functional properties of ML +models trained and selected at development time, and model substitution +guarantees a continuous and stable support of non-functional properties. We +experimentally evaluate our solution in a real-world scenario focusing on +non-functional property fairness. + +
+
+ comment: 13 pages, 12 figures +
+
+
+
+
+ + ☆ Adversarial Reweighting Guided by Wasserstein Distance for Bias + Mitigation + + +
+ The unequal representation of different groups in a sample population can +lead to discrimination of minority groups when machine learning models make +automated decisions. To address these issues, fairness-aware machine learning +jointly optimizes two (or more) metrics aiming at predictive effectiveness and +low unfairness. However, the inherent under-representation of minorities in the +data makes the disparate treatment of subpopulations less noticeable and +difficult to deal with during learning. In this paper, we propose a novel +adversarial reweighting method to address such \emph{representation bias}. To +balance the data distribution between the majority and the minority groups, our +approach deemphasizes samples from the majority group. To minimize empirical +risk, our method prefers samples from the majority group that are close to the +minority group as evaluated by the Wasserstein distance. Our theoretical +analysis shows the effectiveness of our adversarial reweighting approach. +Experiments demonstrate that our approach mitigates bias without sacrificing +classification accuracy, outperforming related state-of-the-art methods on +image and tabular benchmark datasets. + +
+
+
+
+
+ + ☆ BundleMoCap: Efficient, Robust and Smooth Motion Capture from Sparse + Multiview Videos + + +
+ Capturing smooth motions from videos using markerless techniques typically +involves complex processes such as temporal constraints, multiple stages with +data-driven regression and optimization, and bundle solving over temporal +windows. These processes can be inefficient and require tuning multiple +objectives across stages. In contrast, BundleMoCap introduces a novel and +efficient approach to this problem. It solves the motion capture task in a +single stage, eliminating the need for temporal smoothness objectives while +still delivering smooth motions. BundleMoCap outperforms the state-of-the-art +without increasing complexity. The key concept behind BundleMoCap is manifold +interpolation between latent keyframes. By relying on a local manifold +smoothness assumption, we can efficiently solve a bundle of frames using a +single code. Additionally, the method can be implemented as a sliding window +optimization and requires only the first frame to be properly initialized, +reducing the overall computational burden. BundleMoCap's strength lies in its +ability to achieve high-quality motion capture results with simplicity and +efficiency. More details can be found at https://moverseai.github.io/bundle/. + +
+
+ comment: Published in European Conference on Visual Media Production (CVMP + '23) +
+
+
+
+
+ + ☆ Interpretation of the Transformer and Improvement of the Extractor + + +
+ It has been over six years since the Transformer architecture was put +forward. Surprisingly, the vanilla Transformer architecture is still widely +used today. One reason is that the lack of deep understanding and comprehensive +interpretation of the Transformer architecture makes it more challenging to +improve the Transformer architecture. In this paper, we first interpret the +Transformer architecture comprehensively in plain words based on our +understanding and experiences. The interpretations are further proved and +verified. These interpretations also cover the Extractor, a family of drop-in +replacements for the multi-head self-attention in the Transformer architecture. +Then, we propose an improvement on a type of the Extractor that outperforms the +self-attention, without introducing additional trainable parameters. +Experimental results demonstrate that the improved Extractor performs even +better, showing a way to improve the Transformer architecture. + +
+
+
+
+
+ + ☆ Contrastive Left-Right Wearable Sensors (IMUs) Consistency Matching for + HAR + + +
+ Machine learning algorithms are improving rapidly, but annotating training +data remains a bottleneck for many applications. In this paper, we show how +real data can be used for self-supervised learning without any transformations +by taking advantage of the symmetry present in the activities. Our approach +involves contrastive matching of two different sensors (left and right wrist or +leg-worn IMUs) to make representations of co-occurring sensor data more similar +and those of non-co-occurring sensor data more different. We test our approach +on the Opportunity and MM-Fit datasets. In MM-Fit we show significant +improvement over the baseline supervised and self-supervised method SimCLR, +while for Opportunity there is significant improvement over the supervised +baseline and slight improvement when compared to SimCLR. Moreover, our method +improves supervised baselines even when using only a small amount of the data +for training. Future work should explore under which conditions our method is +beneficial for human activity recognition systems and other related +applications. + +
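+ One plausible instantiation of the left/right matching objective described
+above is a symmetric InfoNCE loss over a batch of simultaneously recorded
+windows (a sketch, not the paper's exact loss), in PyTorch:
+
+import torch
+import torch.nn.functional as F
+
+def left_right_contrastive_loss(z_left, z_right, temperature=0.1):
+    """z_left/z_right: (B, D) embeddings of left- and right-worn IMU windows
+    recorded at the same times.  Co-occurring pairs (the diagonal) are pulled
+    together; all other pairings in the batch are pushed apart."""
+    z_l = F.normalize(z_left, dim=-1)
+    z_r = F.normalize(z_right, dim=-1)
+    logits = z_l @ z_r.t() / temperature
+    targets = torch.arange(len(z_l), device=z_l.device)
+    return 0.5 * (F.cross_entropy(logits, targets) +
+                  F.cross_entropy(logits.t(), targets))
+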
+
+ comment: Accepted at ABC 2023. The 5th International Conference on Activity + and Behavior Computing September 7th - 9th, 2023 in Kaiserslautern, Germany + (Hybrid) +
+
+
+
+
+ + ☆ Towards a more inductive world for drug repurposing approaches + + +
+ Drug-target interaction (DTI) prediction is a challenging, albeit essential
+task in drug repurposing. Learning on graph models has drawn special attention
+as such models can significantly reduce drug repurposing costs and time
+commitment. However, many current approaches require demanding additional
+information besides DTIs, which complicates their evaluation process and
+usability. Additionally, structural differences in the learning architecture
+of current models hinder their fair benchmarking. In this work, we first
+perform an in-depth evaluation of current DTI datasets and prediction models
+through a robust benchmarking process, and show that DTI prediction methods
+based on transductive models lack generalization and lead to inflated
+performance when evaluated as previously done in the literature, and are hence
+not suited for drug repurposing approaches. We then propose a novel
+biologically-driven strategy for negative edge subsampling and show through in
+vitro validation that newly discovered interactions are indeed true. We
+envision this work as the underpinning for future fair benchmarking and robust
+model design. All generated resources and tools are publicly available as a
+Python package.
+
+
+
+
+
+ + ☆ SSVEP-DAN: A Data Alignment Network for SSVEP-based Brain Computer + Interfaces + + +
+ Steady-state visual-evoked potential (SSVEP)-based brain-computer interfaces +(BCIs) offer a non-invasive means of communication through high-speed speller +systems. However, their efficiency heavily relies on individual training data +obtained during time-consuming calibration sessions. To address the challenge +of data insufficiency in SSVEP-based BCIs, we present SSVEP-DAN, the first +dedicated neural network model designed for aligning SSVEP data across +different domains, which can encompass various sessions, subjects, or devices. +Our experimental results across multiple cross-domain scenarios demonstrate +SSVEP-DAN's capability to transform existing source SSVEP data into +supplementary calibration data, significantly enhancing SSVEP decoding accuracy +in scenarios with limited calibration data. We envision SSVEP-DAN as a catalyst +for practical SSVEP-based BCI applications with minimal calibration. The source +codes in this work are available at: https://github.com/CECNL/SSVEP-DAN. + +
+
+
+
+
+ + ☆ Carbohydrate NMR chemical shift predictions using E(3) equivariant graph + neural networks + + +
+ Carbohydrates, vital components of biological systems, are well-known for +their structural diversity. Nuclear Magnetic Resonance (NMR) spectroscopy plays +a crucial role in understanding their intricate molecular arrangements and is +essential in assessing and verifying the molecular structure of organic +molecules. An important part of this process is to predict the NMR chemical +shift from the molecular structure. This work introduces a novel approach that +leverages E(3) equivariant graph neural networks to predict carbohydrate NMR +spectra. Notably, our model achieves a substantial reduction in mean absolute +error, up to threefold, compared to traditional models that rely solely on +two-dimensional molecular structure. Even with limited data, the model excels, +highlighting its robustness and generalization capabilities. The implications +are far-reaching and go beyond an advanced understanding of carbohydrate +structures and spectral interpretation. For example, it could accelerate +research in pharmaceutical applications, biochemistry, and structural biology, +offering a faster and more reliable analysis of molecular structures. +Furthermore, our approach is a key step towards a new data-driven era in +spectroscopy, potentially influencing spectroscopic techniques beyond NMR. + +
+
+ comment: 13 pages, 9 figures, 2 tables +
+
+
+
+
+ + ☆ FedDRO: Federated Compositional Optimization for Distributionally Robust + Learning + + +
+ Recently, compositional optimization (CO) has gained popularity because of +its applications in distributionally robust optimization (DRO) and many other +machine learning problems. Large-scale and distributed availability of data +demands the development of efficient federated learning (FL) algorithms for +solving CO problems. Developing FL algorithms for CO is particularly +challenging because of the compositional nature of the objective. Moreover, +current state-of-the-art methods to solve such problems rely on large batch +gradients (depending on the solution accuracy) not feasible for most practical +settings. To address these challenges, in this work, we propose efficient +FedAvg-type algorithms for solving non-convex CO in the FL setting. We first +establish that vanilla FedAvg is not suitable to solve distributed CO problems +because of the data heterogeneity in the compositional objective at each client +which leads to the amplification of bias in the local compositional gradient +estimates. To this end, we propose a novel FL framework FedDRO that utilizes +the DRO problem structure to design a communication strategy that allows FedAvg +to control the bias in the estimation of the compositional gradient. A key +novelty of our work is to develop solution accuracy-independent algorithms that +do not require large batch gradients (and function evaluations) for solving +federated CO problems. We establish $\mathcal{O}(\epsilon^{-2})$ sample and +$\mathcal{O}(\epsilon^{-3/2})$ communication complexity in the FL setting while +achieving linear speedup with the number of clients. We corroborate our +theoretical findings with empirical studies on large-scale DRO problems. + +
+
+ comment: 38 Pages, 6 Figures +
+
+
+
+
+ + ☆ Careful Selection and Thoughtful Discarding: Graph Explicit Pooling + Utilizing Discarded Nodes + + +
+ Graph pooling has been increasingly recognized as crucial for Graph Neural +Networks (GNNs) to facilitate hierarchical graph representation learning. +Existing graph pooling methods commonly consist of two stages: selecting +top-ranked nodes and discarding the remaining to construct coarsened graph +representations. However, this paper highlights two key issues with these +methods: 1) The process of selecting nodes to discard frequently employs +additional Graph Convolutional Networks or Multilayer Perceptrons, lacking a +thorough evaluation of each node's impact on the final graph representation and +subsequent prediction tasks. 2) Current graph pooling methods tend to directly +discard the noise segment (dropped) of the graph without accounting for the +latent information contained within these elements. To address the first issue, +we introduce a novel Graph Explicit Pooling (GrePool) method, which selects +nodes by explicitly leveraging the relationships between the nodes and final +representation vectors crucial for classification. The second issue is +addressed using an extended version of GrePool (i.e., GrePool+), which applies +a uniform loss on the discarded nodes. This addition is designed to augment the +training process and improve classification accuracy. Furthermore, we conduct +comprehensive experiments across 12 widely used datasets to validate our +proposed method's effectiveness, including the Open Graph Benchmark datasets. +Our experimental results uniformly demonstrate that GrePool outperforms 14 +baseline methods for most datasets. Likewise, implementing GrePool+ enhances +GrePool's performance without incurring additional computational costs. + +
+
+ comment: 14 pages, 7 figures, 4 tables. Submitting to Science China + Information Sciences +
+
+
+
+
+ + ☆ Hierarchical Joint Graph Learning and Multivariate Time Series + Forecasting NeurIPS 2023 + + +
+ Multivariate time series is prevalent in many scientific and industrial +domains. Modeling multivariate signals is challenging due to their long-range +temporal dependencies and intricate interactions--both direct and indirect. To +confront these complexities, we introduce a method of representing multivariate +signals as nodes in a graph with edges indicating interdependency between them. +Specifically, we leverage graph neural networks (GNN) and attention mechanisms +to efficiently learn the underlying relationships within the time series data. +Moreover, we suggest employing hierarchical signal decompositions running over +the graphs to capture multiple spatial dependencies. The effectiveness of our +proposed model is evaluated across various real-world benchmark datasets +designed for long-term forecasting tasks. The results consistently showcase the +superiority of our model, achieving an average 23\% reduction in mean squared +error (MSE) compared to existing models. + +
+
+ comment: Temporal Graph Learning Workshop @ NeurIPS 2023, New Orleans, United + States +
+
+
+
+
+ + ☆ Bridging Algorithmic Information Theory and Machine Learning: A New + Approach to Kernel Learning + + +
+ Machine Learning (ML) and Algorithmic Information Theory (AIT) look at +Complexity from different points of view. We explore the interface between AIT +and Kernel Methods (that are prevalent in ML) by adopting an AIT perspective on +the problem of learning kernels from data, in kernel ridge regression, through +the method of Sparse Kernel Flows. In particular, by looking at the differences +and commonalities between Minimal Description Length (MDL) and Regularization +in Machine Learning (RML), we prove that the method of Sparse Kernel Flows is +the natural approach to adopt to learn kernels from data. This paper shows that +it is not necessary to use the statistical route to derive Sparse Kernel Flows +and that one can directly work with code-lengths and complexities that are +concepts that show up in AIT. + +
+
+ comment: An earlier version of this paper appeared at + https://www.researchgate.net/publication/371875631_A_note_on_learning_kernels_from_data_from_an_Algorithmic_Information_Theoretic_point_of_view. + arXiv admin note: text overlap with arXiv:2111.13037, arXiv:2007.05074 +
+
+
+
+
+ + ☆ Koopman Learning with Episodic Memory + + +
+ Koopman operator theory, a data-driven dynamical systems framework, has found +significant success in learning models from complex, real-world data sets, +enabling state-of-the-art prediction and control. The greater interpretability +and lower computational costs of these models, compared to traditional machine +learning methodologies, make Koopman learning an especially appealing approach. +Despite this, little work has been performed on endowing Koopman learning with +the ability to learn from its own mistakes. To address this, we equip Koopman +methods - developed for predicting non-stationary time-series - with an +episodic memory mechanism, enabling global recall of (or attention to) periods +in time where similar dynamics previously occurred. We find that a basic +implementation of Koopman learning with episodic memory leads to significant +improvements in prediction on synthetic and real-world data. Our framework has +considerable potential for expansion, allowing for future advances, and opens +exciting new directions for Koopman learning. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Decentralised Q-Learning for Multi-Agent Markov Decision Processes with + a Satisfiability Criterion + + +
+ In this paper, we propose a reinforcement learning algorithm to solve a
+multi-agent Markov decision process (MMDP). The goal, inspired by Blackwell's
+Approachability Theorem, is to lower the time average cost of each agent to
+below a pre-specified agent-specific bound. For the MMDP, we assume the state
+dynamics to be controlled by the joint actions of agents, but the per-stage
+costs to only depend on the individual agent's actions. We combine Q-learning
+on a weighted combination of the agents' costs, obtained via a gossip
+algorithm, with the Metropolis-Hastings or Multiplicative Weights formalisms
+to modulate the averaging matrix of the gossip. We use multiple timescales in
+our algorithm and prove that, under mild conditions, it approximately achieves
+the desired bounds for each of the agents. We also demonstrate the empirical
+performance of this algorithm in the more general setting of MMDPs having
+jointly controlled per-stage costs.
+
+
+
+
+
+ + ☆ A New Type Of Upper And Lower Bounds On Right-Tail Probabilities Of + Continuous Random Variables + + +
+ In this paper, I present a completely new type of upper and lower bounds on +the right-tail probabilities of continuous random variables with unbounded +support and with semi-bounded support from the left. The presented upper and +lower right-tail bounds depend only on the probability density function (PDF), +its first derivative, and two parameters that are used for tightening the +bounds. These tail bounds hold under certain conditions that depend on the PDF, +its first and second derivatives, and the two parameters. The new tail bounds +are shown to be tight for a wide range of continuous random variables via +numerical examples. + +
+
+
+
+
+ + ☆ TouchSDF: A DeepSDF Approach for 3D Shape Reconstruction using + Vision-Based Tactile Sensing + + +
+ Humans rely on their visual and tactile senses to develop a comprehensive 3D +understanding of their physical environment. Recently, there has been a growing +interest in exploring and manipulating objects using data-driven approaches +that utilise high-resolution vision-based tactile sensors. However, 3D shape +reconstruction using tactile sensing has lagged behind visual shape +reconstruction because of limitations in existing techniques, including the +inability to generalise over unseen shapes, the absence of real-world testing, +and limited expressive capacity imposed by discrete representations. To address +these challenges, we propose TouchSDF, a Deep Learning approach for tactile 3D +shape reconstruction that leverages the rich information provided by a +vision-based tactile sensor and the expressivity of the implicit neural +representation DeepSDF. Our technique consists of two components: (1) a +Convolutional Neural Network that maps tactile images into local meshes +representing the surface at the touch location, and (2) an implicit neural +function that predicts a signed distance function to extract the desired 3D +shape. This combination allows TouchSDF to reconstruct smooth and continuous 3D +shapes from tactile inputs in simulation and real-world settings, opening up +research avenues for robust 3D-aware representations and improved multimodal +perception in robotics. Code and supplementary material are available at: +https://touchsdf.github.io/ + +
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ Deep learning-based detection of morphological features associated with + hypoxia in H&E breast cancer whole slide images + + +
+ Hypoxia occurs when tumour cells outgrow their blood supply, leading to +regions of low oxygen levels within the tumour. Calculating hypoxia levels can +be an important step in understanding the biology of tumours, their clinical +progression and response to treatment. This study demonstrates a novel +application of deep learning to evaluate hypoxia in the context of breast +cancer histomorphology. More precisely, we show that Weakly Supervised Deep +Learning (WSDL) models can accurately detect hypoxia associated features in +routine Hematoxylin and Eosin (H&E) whole slide images (WSI). We trained and +evaluated a deep Multiple Instance Learning model on tiles from WSI H&E tissue +from breast cancer primary sites (n=240) obtaining on average an AUC of 0.87 on +a left-out test set. We also showed significant differences between features of +hypoxic and normoxic tissue regions as distinguished by the WSDL models. Such +DL hypoxia H&E WSI detection models could potentially be extended to other +tumour types and easily integrated into the pathology workflow without +requiring additional costly assays. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ ChronoPscychosis: Temporal Segmentation and Its Impact on Schizophrenia + Classification Using Motor Activity Data + + +
+ Schizophrenia is a complicated mental illness characterized by a broad
+spectrum of symptoms affecting cognition, behavior, and emotion. The task of
+identifying reliable biomarkers to classify Schizophrenia accurately continues
+to be a challenge in the field of psychiatry. We investigate the temporal
+patterns within motor activity data as a potential key to enhancing the
+categorization of individuals with Schizophrenia, using a dataset containing
+motor activity recordings of 22 Schizophrenia patients and 32 control
+subjects. The dataset contains per-minute motor activity measurements
+collected for an average of 12.7 days in a row for each participant. We
+dissect each day into segments (twelve, eight, six, four, three, and two
+parts) and evaluate their impact on classification. We employ sixteen
+statistical features within these temporal segments and train seven machine
+learning models to gain deeper insights. The LightGBM model outperforms the
+other six models. Our results indicate that temporal segmentation
+significantly improves classification, with AUC-ROC = 0.93 and F1 score = 0.84
+(LightGBM without any segmentation) versus AUC-ROC = 0.98 and F1 score = 0.93
+(LightGBM with segmentation). Distinguishing between diurnal and nocturnal
+segments amplifies the differences between Schizophrenia patients and
+controls. However, further subdivision into smaller time segments does not
+affect the AUC-ROC significantly. Morning, afternoon, evening, and night
+partitioning gives classification performance similar to day-night
+partitioning. These findings are valuable as they indicate that extensive
+temporal segmentation beyond distinguishing between day and night does not
+yield substantial gains, offering an efficient approach for further
+classification, early diagnosis, and monitoring of Schizophrenia.
+
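+ As a rough illustration of the day-segmentation idea (not the paper's actual
+feature pipeline; the data below are random stand-ins), each day of per-minute
+actigraphy can be split into equal segments, summarized with per-segment
+statistics, and fed to a LightGBM classifier:
+
+```python
+import numpy as np
+from lightgbm import LGBMClassifier
+
+def segment_features(day_activity, n_segments=2):
+    """Split one day (1440 per-minute samples) into equal segments and
+    compute simple statistics per segment (a subset of the 16 features)."""
+    feats = []
+    for seg in np.array_split(np.asarray(day_activity), n_segments):
+        feats += [seg.mean(), seg.std(), np.median(seg), seg.max()]
+    return feats
+
+# hypothetical inputs: (per-minute activity for one day, patient/control label)
+days = [(np.random.rand(1440), np.random.randint(2)) for _ in range(100)]
+X = np.array([segment_features(a, n_segments=2) for a, _ in days])  # day/night
+y = np.array([label for _, label in days])
+clf = LGBMClassifier(n_estimators=200).fit(X, y)
+```
+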
+
+
+
+
+ + ☆ Improving Source-Free Target Adaptation with Vision Transformers + Leveraging Domain Representation Images + + +
+ Unsupervised Domain Adaptation (UDA) methods facilitate knowledge transfer +from a labeled source domain to an unlabeled target domain, navigating the +obstacle of domain shift. While Convolutional Neural Networks (CNNs) are a +staple in UDA, the rise of Vision Transformers (ViTs) provides new avenues for +domain generalization. This paper presents an innovative method to bolster ViT +performance in source-free target adaptation, beginning with an evaluation of +how key, query, and value elements affect ViT outcomes. Experiments indicate +that altering the key component has negligible effects on Transformer +performance. Leveraging this discovery, we introduce Domain Representation +Images (DRIs), feeding embeddings through the key element. DRIs act as +domain-specific markers, effortlessly merging with the training regimen. To +assess our method, we perform target adaptation tests on the Cross Instance DRI +source-only (SO) control. We measure the efficacy of target adaptation with and +without DRIs, against existing benchmarks like SHOT-B* and adaptations via +CDTrans. Findings demonstrate that excluding DRIs offers limited gains over +SHOT-B*, while their inclusion in the key segment boosts average precision +promoting superior domain generalization. This research underscores the vital +role of DRIs in enhancing ViT efficiency in UDA scenarios, setting a precedent +for further domain adaptation explorations. + +
+
+
+
+
+ + ☆ Machine-Guided Discovery of a Real-World Rogue Wave Model + + +
+ Big data and large-scale machine learning have had a profound impact on +science and engineering, particularly in fields focused on forecasting and +prediction. Yet, it is still not clear how we can use the superior pattern +matching abilities of machine learning models for scientific discovery. This is +because the goals of machine learning and science are generally not aligned. In +addition to being accurate, scientific theories must also be causally +consistent with the underlying physical process and allow for human analysis, +reasoning, and manipulation to advance the field. In this paper, we present a +case study on discovering a new symbolic model for oceanic rogue waves from +data using causal analysis, deep learning, parsimony-guided model selection, +and symbolic regression. We train an artificial neural network on causal +features from an extensive dataset of observations from wave buoys, while +selecting for predictive performance and causal invariance. We apply symbolic +regression to distill this black-box model into a mathematical equation that +retains the neural network's predictive capabilities, while allowing for +interpretation in the context of existing wave theory. The resulting model +reproduces known behavior, generates well-calibrated probabilities, and +achieves better predictive scores on unseen data than current theory. This +showcases how machine learning can facilitate inductive scientific discovery, +and paves the way for more accurate rogue wave forecasting. + +
+
+
+
+
+ + ☆ Moderating Model Marketplaces: Platform Governance Puzzles for AI + Intermediaries + + +
+ The AI development community is increasingly making use of hosting
+intermediaries such as Hugging Face that provide easy access to user-uploaded
+models and training data. These model marketplaces lower technical deployment
+barriers for hundreds of thousands of users, yet can be used in numerous
+potentially harmful and illegal ways. In this article, we explain ways in
+which AI systems, which can both `contain' content and be open-ended tools,
+present one of the trickiest platform governance challenges seen to date. We
+provide case studies of several incidents across three illustrative platforms
+-- Hugging Face, GitHub and Civitai -- to examine how model marketplaces
+moderate models. Building on this analysis, we outline important (yet
+nevertheless limited) practices that industry has been developing to respond
+to moderation demands: licensing, access and use restrictions, automated
+content moderation, and open policy development. While the policy challenge at
+hand is a considerable one, we conclude with some ideas as to how platforms
+could better mobilize resources to act as a careful, fair, and proportionate
+regulatory access point.
+
+
+
+
+
+ + ☆ BEND: Benchmarking DNA Language Models on biologically meaningful tasks + + +
+ The genome sequence contains the blueprint for governing cellular processes. +While the availability of genomes has vastly increased over the last decades, +experimental annotation of the various functional, non-coding and regulatory +elements encoded in the DNA sequence remains both expensive and challenging. +This has sparked interest in unsupervised language modeling of genomic DNA, a +paradigm that has seen great success for protein sequence data. Although +various DNA language models have been proposed, evaluation tasks often differ +between individual works, and might not fully recapitulate the fundamental +challenges of genome annotation, including the length, scale and sparsity of +the data. In this study, we introduce BEND, a Benchmark for DNA language +models, featuring a collection of realistic and biologically meaningful +downstream tasks defined on the human genome. We find that embeddings from +current DNA LMs can approach performance of expert methods on some tasks, but +only capture limited information about long-range features. BEND is available +at https://github.com/frederikkemarin/BEND. + +
+
+ comment: 10 pages, 1 figure, 3 tables, code available at + https://github.com/frederikkemarin/BEND +
+
+
+
+
+ + ☆ Differentiable Sampling of Categorical Distributions Using the + CatLog-Derivative Trick + + +
+ Categorical random variables can faithfully represent the discrete and +uncertain aspects of data as part of a discrete latent variable model. Learning +in such models necessitates taking gradients with respect to the parameters of +the categorical probability distributions, which is often intractable due to +their combinatorial nature. A popular technique to estimate these otherwise +intractable gradients is the Log-Derivative trick. This trick forms the basis +of the well-known REINFORCE gradient estimator and its many extensions. While +the Log-Derivative trick allows us to differentiate through samples drawn from +categorical distributions, it does not take into account the discrete nature of +the distribution itself. Our first contribution addresses this shortcoming by +introducing the CatLog-Derivative trick - a variation of the Log-Derivative +trick tailored towards categorical distributions. Secondly, we use the +CatLog-Derivative trick to introduce IndeCateR, a novel and unbiased gradient +estimator for the important case of products of independent categorical +distributions with provably lower variance than REINFORCE. Thirdly, we +empirically show that IndeCateR can be efficiently implemented and that its +gradient estimates have significantly lower bias and variance for the same +number of samples compared to the state of the art. + +
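+ For context, the snippet below sketches the standard Log-Derivative
+(REINFORCE) estimator that the paper builds on, applied to a product of
+independent categorical variables in PyTorch. The downstream function f is an
+arbitrary placeholder; the CatLog-Derivative trick and IndeCateR refine this
+baseline and are not reproduced here.
+
+```python
+import torch
+
+logits = torch.randn(4, 3, requires_grad=True)  # 4 independent categoricals, 3 classes
+dist = torch.distributions.Categorical(probs=torch.softmax(logits, dim=-1))
+samples = dist.sample((1024,))                  # (1024, 4) integer samples
+f = (samples.float() ** 2).sum(-1)              # arbitrary downstream function f(x)
+log_prob = dist.log_prob(samples).sum(-1)       # log p(x) = sum_i log p_i(x_i)
+surrogate = (f.detach() * log_prob).mean()      # score-function surrogate loss
+surrogate.backward()                            # logits.grad estimates d E[f] / d logits
+```
+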
+
+
+
+
+ + ☆ Variational Elliptical Processes + + +
+ We present elliptical processes, a family of non-parametric probabilistic +models that subsume Gaussian processes and Student's t processes. This +generalization includes a range of new heavy-tailed behaviors while retaining +computational tractability. Elliptical processes are based on a representation +of elliptical distributions as a continuous mixture of Gaussian distributions. +We parameterize this mixture distribution as a spline normalizing flow, which +we train using variational inference. The proposed form of the variational +posterior enables a sparse variational elliptical process applicable to +large-scale problems. We highlight advantages compared to Gaussian processes +through regression and classification experiments. Elliptical processes can +supersede Gaussian processes in several settings, including cases where the +likelihood is non-Gaussian or when accurate tail modeling is essential. + +
+
+ comment: 14 pages, 15 figures, appendix 9 pages +
+
+
+
+
+ + ☆ Summary of the DISPLACE Challenge 2023 -- DIarization of SPeaker and + LAnguage in Conversational Environments + + +
+ In multilingual societies, where multiple languages are spoken within a
+small geographic vicinity, informal conversations often involve a mix of
+languages. Existing speech technologies may be inefficient at extracting
+information from such conversations, where the speech data is rich in
+diversity, with multiple languages and speakers. The DISPLACE (DIarization of
+SPeaker and LAnguage in Conversational Environments) challenge constitutes an
+open call for evaluating and benchmarking speaker and language diarization
+technologies under this challenging condition. The challenge entailed two
+tracks: Track-1 focused on speaker diarization (SD) in multilingual
+situations, while Track-2 addressed language diarization (LD) in a
+multi-speaker scenario. Both tracks were evaluated using the same underlying
+audio data. To facilitate this evaluation, a real-world dataset featuring
+multilingual, multi-speaker conversational far-field speech was recorded and
+distributed. Furthermore, a baseline system was made available for both the SD
+and LD tasks, which mimicked the state of the art in these tasks. The
+challenge garnered a total of $42$ worldwide registrations and received a
+total of $19$ combined submissions for Track-1 and Track-2. This paper
+describes the challenge and the details of the datasets, tasks, and the
+baseline system. Additionally, the paper provides a concise overview of the
+submitted systems in both tracks, with an emphasis on the top performing
+systems. The paper also presents insights and future perspectives for the SD
+and LD tasks, focusing on the key challenges that systems need to overcome
+before widespread commercial deployment on such conversations.
+
+
+
+
+
+ + ☆ Convolutional Neural Networks for Neuroimaging in Parkinson's Disease: + Is Preprocessing Needed? + + +
+ Spatial and intensity normalization are nowadays a prerequisite for
+neuroimaging analysis. Influenced by voxel-wise and other univariate
+comparisons, where these corrections are key, they are commonly applied to any
+type of analysis and imaging modality. Nuclear imaging modalities such as
+PET-FDG or FP-CIT SPECT, a common modality used in Parkinson's Disease
+diagnosis, are especially dependent on intensity normalization. However, these
+steps are computationally expensive and, furthermore, they may introduce
+deformations in the images, altering the information contained in them.
+Convolutional Neural Networks (CNNs), for their part, introduce position
+invariance to pattern recognition, and have been proven to classify objects
+regardless of their orientation, size, angle, etc. Therefore, a question
+arises: how well can CNNs account for spatial and intensity differences when
+analysing nuclear brain imaging? Are spatial and intensity normalization still
+needed? To answer this question, we have trained four different CNN models
+based on well-established architectures, with and without different spatial
+and intensity normalization preprocessing steps. The results show that a
+sufficiently complex model, such as our three-dimensional version of ALEXNET,
+can effectively account for spatial differences, achieving a diagnosis
+accuracy of 94.1% with an area under the ROC curve of 0.984. The visualization
+of the differences via saliency maps shows that these models correctly find
+patterns that match those found in the literature, without the need to apply
+any complex spatial normalization procedure. However, the intensity
+normalization -- and its type -- is revealed to be very influential in the
+results and accuracy of the trained model, and therefore must be carefully
+accounted for.
+
+
+ comment: 19 pages, 7 figures +
+
+
+
+
+ + ☆ Explainable Anomaly Detection using Masked Latent Generative Modeling + + +
+ We present a novel time series anomaly detection method that achieves +excellent detection accuracy while offering a superior level of explainability. +Our proposed method, TimeVQVAE-AD, leverages masked generative modeling adapted +from the cutting-edge time series generation method known as TimeVQVAE. The +prior model is trained on the discrete latent space of a time-frequency domain. +Notably, the dimensional semantics of the time-frequency domain are preserved +in the latent space, enabling us to compute anomaly scores across different +frequency bands, which provides a better insight into the detected anomalies. +Additionally, the generative nature of the prior model allows for sampling +likely normal states for detected anomalies, enhancing the explainability of +the detected anomalies through counterfactuals. Our experimental evaluation on +the UCR Time Series Anomaly archive demonstrates that TimeVQVAE-AD +significantly surpasses the existing methods in terms of detection accuracy and +explainability. + +
+
+
+
+
+ + ☆ In-Context Learning Functions with Varying Number of Minima + + +
+ Large Language Models (LLMs) have proven effective at In-Context Learning
+(ICL), an ability that allows them to create predictors from labeled examples.
+Few studies have explored the interplay between ICL and the specific
+properties of the functions it attempts to approximate. In our study, we use a
+formal framework to explore ICL and propose a new task of approximating
+functions with a varying number of minima. We implement a method that allows
+for producing functions with given inputs as minima. We find that increasing
+the number of minima degrades ICL performance. At the same time, our
+evaluation shows that ICL outperforms a 2-layer Neural Network (2NN) model.
+Furthermore, ICL learns faster than 2NN in all settings. We validate these
+findings through a set of few-shot experiments across various hyperparameter
+configurations.
+
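+ One simple way to construct a test function with minima at prescribed inputs
+(an illustration of the task setup, not necessarily the paper's construction)
+is a sum of negative Gaussian wells, which has local minima approximately at
+the given points when they are well separated:
+
+```python
+import numpy as np
+
+def make_function_with_minima(minima, width=0.5):
+    """Return f(x) with local minima near the prescribed, well-separated points."""
+    minima = np.asarray(minima, dtype=float)
+    def f(x):
+        x = np.atleast_1d(x)[:, None]
+        return -np.exp(-((x - minima) ** 2) / (2 * width ** 2)).sum(axis=1)
+    return f
+
+f = make_function_with_minima([-3.0, 0.0, 4.0])
+xs = np.linspace(-6.0, 6.0, 2001)
+vals = f(xs)
+is_min = (vals[1:-1] < vals[:-2]) & (vals[1:-1] < vals[2:])
+print(xs[1:-1][is_min])   # approximately [-3, 0, 4]
+```
+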
+
+
+
+
+ + ☆ An efficient likelihood-free Bayesian inference method based on + sequential neural posterior estimation + + +
+ Sequential neural posterior estimation (SNPE) techniques have recently been
+proposed for dealing with simulation-based models with intractable
+likelihoods. Unlike approximate Bayesian computation, SNPE techniques learn
+the posterior from sequential simulations using neural network-based
+conditional density estimators. This paper revisits SNPE-B, proposed by
+Lueckmann et al. (2017), which suffers from inefficiency and slow inference
+due to poor utilization of simulated data and high variance of parameter
+updates. To address these issues, we first introduce a concentrated loss
+function based on an adaptive calibration kernel that reweights the simulated
+data appropriately to improve data efficiency. Moreover, we provide a
+theoretical analysis of the variance of the associated Monte Carlo estimators.
+Based on this analysis, we then propose several variance reduction techniques
+to further accelerate learning. Numerical experiments demonstrate that our
+method outperforms the original method as well as other existing competitors
+on certain tasks.
+
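+ A minimal sketch of the reweighting idea (my interpretation of the abstract,
+using a Gaussian kernel with a hypothetical bandwidth): simulations whose
+outputs fall close to the observation receive larger weight in the conditional
+density estimator's training loss.
+
+```python
+import torch
+
+def calibration_weights(x_sim, x_obs, bandwidth=1.0):
+    """Gaussian calibration kernel over simulated outputs (a sketch)."""
+    sq_dist = ((x_sim - x_obs) ** 2).sum(dim=1)
+    w = torch.exp(-sq_dist / (2 * bandwidth ** 2))
+    return w / w.sum()
+
+def weighted_nll(log_q, weights):
+    """Weighted negative log-likelihood; log_q = log q_phi(theta_i | x_i) from
+    e.g. a mixture density network or normalizing flow (assumed given)."""
+    return -(weights * log_q).sum()
+```
+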
+
+ comment: 29 pages, 7 figures +
+
+
+
+
+ + ☆ Inverse Problems with Learned Forward Operators + + +
+ Solving inverse problems requires knowledge of the forward operator, but +accurate models can be computationally expensive and hence cheaper variants are +desired that do not compromise reconstruction quality. This chapter reviews +reconstruction methods in inverse problems with learned forward operators that +follow two different paradigms. The first one is completely agnostic to the +forward operator and learns its restriction to the subspace spanned by the +training data. The framework of regularisation by projection is then used to +find a reconstruction. The second one uses a simplified model of the physics of +the measurement process and only relies on the training data to learn a model +correction. We present the theory of these two approaches and compare them +numerically. A common theme emerges: both methods require, or at least benefit +from, training data not only for the forward operator, but also for its +adjoint. + +
+
+
+
+
+ + ☆ Neural Network Pruning by Gradient Descent + + +
+ The rapid increase in the parameters of deep learning models has led to +significant costs, challenging computational efficiency and model +interpretability. In this paper, we introduce a novel and straightforward +neural network pruning framework that incorporates the Gumbel-Softmax +technique. This framework enables the simultaneous optimization of a network's +weights and topology in an end-to-end process using stochastic gradient +descent. Empirical results demonstrate its exceptional compression capability, +maintaining high accuracy on the MNIST dataset with only 0.15\% of the original +network parameters. Moreover, our framework enhances neural network +interpretability, not only by allowing easy extraction of feature importance +directly from the pruned network but also by enabling visualization of feature +symmetry and the pathways of information propagation from features to outcomes. +Although the pruning strategy is learned through deep learning, it is +surprisingly intuitive and understandable, focusing on selecting key +representative features and exploiting data patterns to achieve extreme sparse +pruning. We believe our method opens a promising new avenue for deep learning +pruning and the creation of interpretable machine learning systems. + +
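+ A minimal sketch of the idea (not the authors' implementation): a linear
+layer whose weights are gated by per-weight Gumbel-Softmax keep/drop
+variables, so the binary topology and the weights can be optimized jointly
+with stochastic gradient descent. A sparsity penalty on the keep probabilities
+would typically be added to the training loss.
+
+```python
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class PrunableLinear(nn.Module):
+    """Linear layer with a learnable keep/drop gate for every weight."""
+    def __init__(self, in_features, out_features):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(out_features, in_features) * 0.01)
+        # two logits per weight: index 0 = drop, index 1 = keep
+        self.gate_logits = nn.Parameter(torch.zeros(out_features, in_features, 2))
+
+    def forward(self, x, tau=1.0):
+        # straight-through Gumbel-Softmax sample; [..., 1] selects the keep gate
+        gate = F.gumbel_softmax(self.gate_logits, tau=tau, hard=True)[..., 1]
+        return x @ (self.weight * gate).t()
+
+layer = PrunableLinear(20, 10)
+out = layer(torch.randn(8, 20))   # (8, 10); dropped weights contribute nothing
+```
+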
+
+ comment: 21 pages, 5 figures +
+
+
+
+
+ + ☆ ALPHA: AnomaLous Physiological Health Assessment Using Large Language + Models + + +
+ This study concentrates on evaluating the efficacy of Large Language Models +(LLMs) in healthcare, with a specific focus on their application in personal +anomalous health monitoring. Our research primarily investigates the +capabilities of LLMs in interpreting and analyzing physiological data obtained +from FDA-approved devices. We conducted an extensive analysis using anomalous +physiological data gathered in a simulated low-air-pressure plateau +environment. This allowed us to assess the precision and reliability of LLMs in +understanding and evaluating users' health status with notable specificity. Our +findings reveal that LLMs exhibit exceptional performance in determining +medical indicators, including a Mean Absolute Error (MAE) of less than 1 beat +per minute for heart rate and less than 1% for oxygen saturation (SpO2). +Furthermore, the Mean Absolute Percentage Error (MAPE) for these evaluations +remained below 1%, with the overall accuracy of health assessments surpassing +85%. In image analysis tasks, such as interpreting photoplethysmography (PPG) +data, our specially adapted GPT models demonstrated remarkable proficiency, +achieving less than 1 bpm error in cycle count and 7.28 MAE for heart rate +estimation. This study highlights LLMs' dual role as health data analysis tools +and pivotal elements in advanced AI health assistants, offering personalized +health insights and recommendations within the future health assistant +framework. + +
+
+
+
+
+ + ☆ Fair Polylog-Approximate Low-Cost Hierarchical Clustering NeurIPS '23 + + +
+ Research in fair machine learning, and particularly clustering, has been +crucial in recent years given the many ethical controversies that modern +intelligent systems have posed. Ahmadian et al. [2020] established the study of +fairness in \textit{hierarchical} clustering, a stronger, more structured +variant of its well-known flat counterpart, though their proposed algorithm +that optimizes for Dasgupta's [2016] famous cost function was highly +theoretical. Knittel et al. [2023] then proposed the first practical fair +approximation for cost, however they were unable to break the +polynomial-approximate barrier they posed as a hurdle of interest. We break +this barrier, proposing the first truly polylogarithmic-approximate low-cost +fair hierarchical clustering, thus greatly bridging the gap between the best +fair and vanilla hierarchical clustering approximations. + +
+
+ comment: Accepted to NeurIPS '23 (16 pages, 5 figures) +
+
+
+
+
+ + ☆ Multi-Objective Reinforcement Learning based on Decomposition: A + taxonomy and framework + + +
+ Multi-objective reinforcement learning (MORL) extends traditional RL by +seeking policies making different compromises among conflicting objectives. The +recent surge of interest in MORL has led to diverse studies and solving +methods, often drawing from existing knowledge in multi-objective optimization +based on decomposition (MOO/D). Yet, a clear categorization based on both RL +and MOO/D is lacking in the existing literature. Consequently, MORL researchers +face difficulties when trying to classify contributions within a broader +context due to the absence of a standardized taxonomy. To tackle such an issue, +this paper introduces Multi-Objective Reinforcement Learning based on +Decomposition (MORL/D), a novel methodology bridging RL and MOO literature. A +comprehensive taxonomy for MORL/D is presented, providing a structured +foundation for categorizing existing and potential MORL works. The introduced +taxonomy is then used to scrutinize MORL research, enhancing clarity and +conciseness through well-defined categorization. Moreover, a flexible framework +derived from the taxonomy is introduced. This framework accommodates diverse +instantiations using tools from both RL and MOO/D. Implementation across +various configurations demonstrates its versatility, assessed against benchmark +problems. Results indicate MORL/D instantiations achieve comparable performance +with significantly greater versatility than current state-of-the-art +approaches. By presenting the taxonomy and framework, this paper offers a +comprehensive perspective and a unified vocabulary for MORL. This not only +facilitates the identification of algorithmic contributions but also lays the +groundwork for novel research avenues in MORL, contributing to the continued +advancement of this field. + +
+
+ comment: Under review at JAIR +
+
+
+
+
+ + ☆ Heuristics for Detecting CoinJoin Transactions on the Bitcoin Blockchain + + +
+ This research delves into the intricacies of Bitcoin, a decentralized +peer-to-peer network, and its associated blockchain, which records all +transactions since its inception. While this ensures integrity and +transparency, the transparent nature of Bitcoin potentially compromises users' +privacy rights. To address this concern, users have adopted CoinJoin, a method +that amalgamates multiple transaction intents into a single, larger transaction +to bolster transactional privacy. This process complicates individual +transaction tracing and disrupts many established blockchain analysis +heuristics. Despite its significance, limited research has been conducted on +identifying CoinJoin transactions. Particularly noteworthy are varied CoinJoin +implementations such as JoinMarket, Wasabi, and Whirlpool, each presenting +distinct challenges due to their unique transaction structures. This study +delves deeply into the open-source implementations of these protocols, aiming +to develop refined heuristics for identifying their transactions on the +blockchain. Our exhaustive analysis covers transactions up to block 760,000, +offering a comprehensive insight into CoinJoin transactions and their +implications for Bitcoin blockchain analysis. + +
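+ As a toy illustration of the kind of heuristic involved (field names and
+thresholds are hypothetical, and real JoinMarket/Wasabi/Whirlpool detectors
+rely on stricter, protocol-specific rules), a transaction with many inputs and
+a dominant group of equal-valued outputs can be flagged as CoinJoin-like:
+
+```python
+from collections import Counter
+
+def looks_like_coinjoin(tx, min_participants=3):
+    """Flag transactions whose outputs are dominated by one repeated value."""
+    output_values = Counter(out["value"] for out in tx["outputs"])
+    _, count = output_values.most_common(1)[0]
+    return (len(tx["inputs"]) >= min_participants
+            and count >= min_participants
+            and count > len(tx["outputs"]) / 2)
+
+tx = {"inputs": [{}] * 5,
+      "outputs": [{"value": 5_000_000}] * 5 + [{"value": 123_456}, {"value": 98_765}]}
+print(looks_like_coinjoin(tx))   # True
+```
+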
+
+
+
+
+ + ☆ Hyb-NeRF: A Multiresolution Hybrid Encoding for Neural Radiance Fields WACV2024 + + +
+ Recent advances in Neural Radiance Fields (NeRF) have enabled high-fidelity
+scene reconstruction for novel view synthesis. However, NeRF requires hundreds
+of network evaluations per pixel to approximate a volume rendering integral,
+making it slow to train. Caching NeRFs into explicit data structures can
+effectively enhance rendering speed but at the cost of higher memory usage. To
+address these issues, we present Hyb-NeRF, a novel neural radiance field with
+a multi-resolution hybrid encoding that achieves efficient neural modeling and
+fast rendering, which also allows for high-quality novel view synthesis. The
+key idea of Hyb-NeRF is to represent the scene using different encoding
+strategies from coarse-to-fine resolution levels. Hyb-NeRF exploits
+memory-efficient learnable positional features at coarse resolutions and the
+fast optimization speed and local details of hash-based feature grids at fine
+resolutions. In addition, to further boost performance, we embed cone
+tracing-based features in our learnable positional encoding, which eliminates
+encoding ambiguity and reduces aliasing artifacts. Extensive experiments on
+both synthetic and real-world datasets show that Hyb-NeRF achieves faster
+rendering speed with better rendering quality and an even lower memory
+footprint in comparison to previous state-of-the-art methods.
+
+
+ comment: WACV2024 +
+
+
+
+
+ + ☆ MaskFlow: Object-Aware Motion Estimation + + +
+ We introduce a novel motion estimation method, MaskFlow, that is capable of +estimating accurate motion fields, even in very challenging cases with small +objects, large displacements and drastic appearance changes. In addition to +lower-level features, that are used in other Deep Neural Network (DNN)-based +motion estimation methods, MaskFlow draws from object-level features and +segmentations. These features and segmentations are used to approximate the +objects' translation motion field. We propose a novel and effective way of +incorporating the incomplete translation motion field into a subsequent motion +estimation network for refinement and completion. We also produced a new +challenging synthetic dataset with motion field ground truth, and also provide +extra ground truth for the object-instance matchings and corresponding +segmentation masks. We demonstrate that MaskFlow outperforms state of the art +methods when evaluated on our new challenging dataset, whilst still producing +comparable results on the popular FlyingThings3D benchmark dataset. + +
+
+
+
+
+ + ☆ Harnessing FPGA Technology for Enhanced Biomedical Computation + + +
+ This research delves into sophisticated neural network frameworks like +Convolutional Neural Networks (CNN), Recurrent Neural Networks (RNN), Long +Short-Term Memory Networks (LSTMs), and Deep Belief Networks (DBNs) for +improved analysis of ECG signals via Field Programmable Gate Arrays (FPGAs). +The MIT-BIH Arrhythmia Database serves as the foundation for training and +evaluating our models, with added Gaussian noise to heighten the algorithms' +resilience. The developed architectures incorporate various layers for specific +processing and categorization functions, employing strategies such as the +EarlyStopping callback and Dropout layer to prevent overfitting. Additionally, +this paper details the creation of a tailored Tensor Compute Unit (TCU) +accelerator for the PYNQ Z1 platform. It provides a thorough methodology for +implementing FPGA-based machine learning, encompassing the configuration of the +Tensil toolchain in Docker, selection of architectures, PS-PL configuration, +and the compilation and deployment of models. By evaluating performance +indicators like latency and throughput, we showcase the efficacy of FPGAs in +advanced biomedical computing. This study ultimately serves as a comprehensive +guide to optimizing neural network operations on FPGAs across various fields. + +
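+ As a generic illustration of the training-side regularization mentioned above
+(a toy Keras model with Dropout and the EarlyStopping callback; layer sizes
+and data are placeholders rather than the paper's architecture):
+
+```python
+import numpy as np
+import tensorflow as tf
+
+# stand-in for windowed, noise-augmented ECG beats (the paper uses MIT-BIH)
+X = np.random.randn(1000, 180, 1).astype("float32")
+y = np.random.randint(0, 5, size=1000)
+
+model = tf.keras.Sequential([
+    tf.keras.layers.Conv1D(16, 7, activation="relu", input_shape=(180, 1)),
+    tf.keras.layers.MaxPooling1D(2),
+    tf.keras.layers.LSTM(32),
+    tf.keras.layers.Dropout(0.3),          # regularization against overfitting
+    tf.keras.layers.Dense(5, activation="softmax"),
+])
+model.compile(optimizer="adam", loss="sparse_categorical_crossentropy",
+              metrics=["accuracy"])
+
+early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5,
+                                              restore_best_weights=True)
+model.fit(X, y, validation_split=0.2, epochs=50, callbacks=[early_stop],
+          verbose=0)
+```
+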
+
+ comment: Submitted to IEEE Transactions on Biomedical Circuits and Systems. + arXiv admin note: substantial text overlap with arXiv:2307.07914 +
+
+
+
+
+ + ☆ Classifier Calibration with ROC-Regularized Isotonic Regression + + +
+ Calibration of machine learning classifiers is necessary to obtain reliable +and interpretable predictions, bridging the gap between model confidence and +actual probabilities. One prominent technique, isotonic regression (IR), aims +at calibrating binary classifiers by minimizing the cross entropy on a +calibration set via monotone transformations. IR acts as an adaptive binning +procedure, which allows achieving a calibration error of zero, but leaves open +the issue of the effect on performance. In this paper, we first prove that IR +preserves the convex hull of the ROC curve -- an essential performance metric +for binary classifiers. This ensures that a classifier is calibrated while +controlling for overfitting of the calibration set. We then present a novel +generalization of isotonic regression to accommodate classifiers with K +classes. Our method constructs a multidimensional adaptive binning scheme on +the probability simplex, again achieving a multi-class calibration error equal +to zero. We regularize this algorithm by imposing a form of monotony that +preserves the K-dimensional ROC surface of the classifier. We show empirically +that this general monotony criterion is effective in striking a balance between +reducing cross entropy loss and avoiding overfitting of the calibration set. + +
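+ For reference, plain binary isotonic-regression calibration looks like the
+scikit-learn sketch below; the paper's contributions (the ROC convex hull
+preservation result and the ROC-regularized multi-class extension) are not
+reproduced here.
+
+```python
+from sklearn.datasets import make_classification
+from sklearn.isotonic import IsotonicRegression
+from sklearn.linear_model import LogisticRegression
+from sklearn.model_selection import train_test_split
+
+X, y = make_classification(n_samples=4000, random_state=0)
+X_tr, X_cal, y_tr, y_cal = train_test_split(X, y, test_size=0.5, random_state=0)
+
+clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
+raw_scores = clf.predict_proba(X_cal)[:, 1]
+
+# monotone recalibration map fitted on a held-out calibration set
+iso = IsotonicRegression(out_of_bounds="clip").fit(raw_scores, y_cal)
+calibrated = iso.predict(raw_scores)
+```
+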
+
+
+
+
+ + ☆ Fair Enough? A map of the current limitations of the requirements to + have "fair'' algorithms + + +
+ In recent years, the rise in the usage and efficiency of Artificial
+Intelligence and, more generally, of Automated Decision-Making systems has
+brought with it an increasing and welcome awareness of the risks associated
+with such systems. One such risk is that of perpetuating or even amplifying
+bias and unjust disparities present in the data from which many of these
+systems learn to adjust and optimise their decisions. This awareness has, on
+one side, encouraged several scientific communities to come up with more and
+more appropriate ways and methods to assess, quantify, and possibly mitigate
+such biases and disparities. On the other hand, it has prompted more and more
+layers of society, including policy makers, to call for ``fair'' algorithms.
+We believe that while a lot of excellent and multidisciplinary research is
+currently being conducted, what is still fundamentally missing is the
+awareness that having ``fair'' algorithms is per se a nearly meaningless
+requirement that needs to be complemented with many additional societal
+choices to become actionable. Namely, there is a hiatus between what society
+is demanding from Automated Decision-Making systems and what this demand
+actually means in real-world scenarios. In this work, we outline the key
+features of such a hiatus and pinpoint a list of fundamental ambiguities and
+attention points that we as a society must address in order to give a
+concrete meaning to the increasing demand for fairness in Automated
+Decision-Making systems.
+
+
+ comment: 20 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ Looped Transformers are Better at Learning Learning Algorithms + + +
+ Transformers have demonstrated effectiveness in solving data-fitting problems
+\emph{in-context} from various (latent) models, as reported by Garg et al.
+However, the absence of an inherent iterative structure in the transformer
+architecture presents a challenge in emulating the iterative algorithms
+commonly employed in traditional machine learning methods. To address this, we
+propose the utilization of a \emph{looped} transformer architecture and its
+associated training methodology, with the aim of incorporating iterative
+characteristics into transformer architectures. Experimental results suggest
+that the looped transformer achieves performance comparable to the standard
+transformer in solving various data-fitting problems, while utilizing less
+than 10\% of the parameter count.
+
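+ A minimal sketch of the weight-tied looping idea in PyTorch; the dimensions,
+loop count, and use of nn.TransformerEncoderLayer are illustrative assumptions
+rather than the paper's exact architecture.
+
+```python
+import torch
+import torch.nn as nn
+
+class LoopedEncoder(nn.Module):
+    """Apply one shared transformer block repeatedly (weight-tied loop)."""
+    def __init__(self, d_model=64, nhead=4, n_loops=12):
+        super().__init__()
+        self.block = nn.TransformerEncoderLayer(d_model, nhead, batch_first=True)
+        self.n_loops = n_loops
+
+    def forward(self, x):
+        for _ in range(self.n_loops):   # iterative refinement with tied weights
+            x = self.block(x)
+        return x
+
+y = LoopedEncoder()(torch.randn(2, 16, 64))   # (batch, tokens, d_model)
+```
+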
+
+
+
+
+ + ☆ Board-to-Board: Evaluating Moonboard Grade Prediction Generalization + + +
+ Bouldering is a sport where athletes aim to climb up an obstacle using a set
+of defined holds called a route. Typically, routes are assigned a grade to
+inform climbers of their difficulty and allow them to more easily track their
+progression. However, the variation in individual climbers' technical and
+physical attributes and the many nuances of an individual route make grading
+a difficult and often biased task. In this work, we apply classical and
+deep-learning modelling techniques to the 2016, 2017 and 2019 Moonboard
+datasets, achieving state-of-the-art grade prediction performance with 0.87
+MAE and 1.12 RMSE. We achieve this performance with a feature set that does
+not require decomposing routes into individual moves, a method that is common
+in the literature and introduces bias. We also demonstrate the generalization
+capability of this model between editions and introduce a novel vision-based
+method of grade prediction. While the generalization performance of these
+techniques is currently below human-level performance, we propose these
+methods as a basis for future work. Such a tool could be implemented in
+pre-existing mobile applications and would allow climbers to better track
+their progress and assess new routes with reduced bias.
+
+
+
+
+
+ + ☆ nach0: Multimodal Natural and Chemical Languages Foundation Model + + +
+ Large Language Models (LLMs) have substantially driven scientific progress in +various domains, and many papers have demonstrated their ability to tackle +complex problems with creative solutions. Our paper introduces a new foundation +model, nach0, capable of solving various chemical and biological tasks: +biomedical question answering, named entity recognition, molecular generation, +molecular synthesis, attributes prediction, and others. nach0 is a multi-domain +and multi-task encoder-decoder LLM pre-trained on unlabeled text from +scientific literature, patents, and molecule strings to incorporate a range of +chemical and linguistic knowledge. We employed instruction tuning, where +specific task-related instructions are utilized to fine-tune nach0 for the +final set of tasks. To train nach0 effectively, we leverage the NeMo framework, +enabling efficient parallel optimization of both base and large model versions. +Extensive experiments demonstrate that our model outperforms state-of-the-art +baselines on single-domain and cross-domain tasks. Furthermore, it can generate +high-quality outputs in molecular and textual formats, showcasing its +effectiveness in multi-domain setups. + +
+
+ comment: Submitted to Nature Communications +
+
+
+
+
+ + ☆ A Survey of Graph Meets Large Language Model: Progress and Future + Directions + + +
+ Graph plays a significant role in representing and analyzing complex +relationships in real-world applications such as citation networks, social +networks, and biological data. Recently, Large Language Models (LLMs), which +have achieved tremendous success in various domains, have also been leveraged +in graph-related tasks to surpass traditional Graph Neural Networks (GNNs) +based methods and yield state-of-the-art performance. In this survey, we first +present a comprehensive review and analysis of existing methods that integrate +LLMs with graphs. First of all, we propose a new taxonomy, which organizes +existing methods into three categories based on the role (i.e., enhancer, +predictor, and alignment component) played by LLMs in graph-related tasks. Then +we systematically survey the representative methods along the three categories +of the taxonomy. Finally, we discuss the remaining limitations of existing +studies and highlight promising avenues for future research. The relevant +papers are summarized and will be consistently updated at: +https://github.com/yhLeeee/Awesome-LLMs-in-Graph-tasks. + +
+
+ comment: Work in progress; 13 pages, 5 figures +
+
+
+
+
+ + ☆ Infinite forecast combinations based on Dirichlet process + + +
+ Forecast combination integrates information from various sources by +consolidating multiple forecast results from the target time series. Instead of +the need to select a single optimal forecasting model, this paper introduces a +deep learning ensemble forecasting model based on the Dirichlet process. +Initially, the learning rate is sampled with three basis distributions as +hyperparameters to convert the infinite mixture into a finite one. All +checkpoints are collected to establish a deep learning sub-model pool, and +weight adjustment and diversity strategies are developed during the combination +process. The main advantage of this method is its ability to generate the +required base learners through a single training process, utilizing the +decaying strategy to tackle the challenge posed by the stochastic nature of +gradient descent in determining the optimal learning rate. To ensure the +method's generalizability and competitiveness, this paper conducts an empirical +analysis using the weekly dataset from the M4 competition and explores +sensitivity to the number of models to be combined. The results demonstrate +that the ensemble model proposed offers substantial improvements in prediction +accuracy and stability compared to a single benchmark model. + +
+
+
+
+
+ + ☆ Post-Training Quantization with Low-precision Minifloats and Integers on + FPGAs + + +
+ Post-Training Quantization (PTQ) is a powerful technique for model +compression, reducing the precision of neural networks without additional +training overhead. Recent works have investigated adopting 8-bit floating-point +quantization (FP8) in the context of PTQ for model inference. However, the +exploration of floating-point formats smaller than 8 bits and their comparison +with integer quantization remains relatively limited. In this work, we present +minifloats, which are reduced-precision floating-point formats capable of +further reducing the memory footprint, latency, and energy cost of a model +while approaching full-precision model accuracy. Our work presents a novel PTQ +design-space exploration, comparing minifloat and integer quantization schemes +across a range of 3 to 8 bits for both weights and activations. We examine the +applicability of various PTQ techniques to minifloats, including weight +equalization, bias correction, SmoothQuant, gradient-based learned rounding, +and the GPTQ method. Our experiments validate the effectiveness of +low-precision minifloats when compared to their integer counterparts across a +spectrum of accuracy-precision trade-offs on a set of reference deep learning +vision workloads. Finally, we evaluate our results against an FPGA-based +hardware cost model, showing that integer quantization often remains the +Pareto-optimal option, given its relatively smaller hardware resource +footprint. + +
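+ To illustrate what a reduced-precision minifloat grid looks like, the sketch
+below rounds a tensor to a generic signed format with configurable exponent
+and mantissa bits. It treats every exponent code as normal and ignores NaN/Inf
+handling, so it may differ from the exact formats evaluated in the paper.
+
+```python
+import torch
+
+def quantize_minifloat(x, exp_bits=3, man_bits=2):
+    """Round x to a simplified signed minifloat grid (no NaN/Inf support)."""
+    bias = 2 ** (exp_bits - 1) - 1
+    max_exp = 2 ** exp_bits - 1 - bias
+    sign = torch.sign(x)
+    mag = x.abs().clamp(min=1e-30)                       # avoid log2(0)
+    e = torch.floor(torch.log2(mag)).clamp(1 - bias, max_exp)
+    spacing = 2.0 ** (e - man_bits)                      # grid step at exponent e
+    q = torch.round(mag / spacing) * spacing
+    max_val = (2 - 2.0 ** (-man_bits)) * 2.0 ** max_exp  # largest finite value
+    return sign * q.clamp(max=max_val)
+
+print(quantize_minifloat(torch.tensor([0.1, 1.3, 7.9, 100.0])))
+```
+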
+
+
+
+
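The minifloat study above centres on reduced-precision floating-point formats with only a few exponent and mantissa bits. As a rough illustration of what such a format looks like numerically, here is a minimal NumPy sketch of round-to-nearest quantization onto a generic ExMy minifloat grid; the IEEE-style bias, the saturating overflow, and the absence of per-tensor scaling are simplifying assumptions, not the paper's PTQ pipeline.

```python
import numpy as np

def quantize_minifloat(x, exp_bits=3, man_bits=2):
    """Round a float array to a toy minifloat grid with the given field widths.

    Illustrative assumptions: IEEE-like exponent bias, the top exponent code is
    reserved (so values saturate instead of overflowing to Inf),
    round-to-nearest, and no per-tensor scale factor.
    """
    x = np.asarray(x, dtype=np.float64)
    bias = 2 ** (exp_bits - 1) - 1
    emax = (2 ** exp_bits - 2) - bias            # largest normal exponent
    emin = 1 - bias                              # smallest normal exponent
    max_val = (2.0 - 2.0 ** (-man_bits)) * 2.0 ** emax

    sign = np.sign(x)
    mag = np.clip(np.abs(x), 0.0, max_val)       # saturating overflow

    # exponent of each value; magnitudes below 2**emin fall into the subnormal range
    exp = np.clip(np.floor(np.log2(np.maximum(mag, 2.0 ** emin))), emin, emax)
    step = 2.0 ** (exp - man_bits)               # grid spacing at that exponent
    return sign * np.round(mag / step) * step

if __name__ == "__main__":
    w = np.random.default_rng(0).normal(size=8)
    print(np.round(w, 4))
    print(quantize_minifloat(w, exp_bits=3, man_bits=2))
```

Sweeping `exp_bits` and `man_bits` over 3-8 bit budgets and comparing against a uniform integer grid of the same width is one way to reproduce the kind of accuracy-precision trade-off the abstract describes.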
+ + ☆ Federated Learning via Consensus Mechanism on Heterogeneous Data: A New + Perspective on Convergence + + +
+ Federated learning (FL) on heterogeneous data (non-IID data) has recently +received great attention. Most existing methods focus on studying the +convergence guarantees for the global objective. While these methods can +guarantee the decrease of the global objective in each communication round, +they fail to ensure risk decrease for each client. In this paper, to address +this problem, we propose FedCOME, which introduces a consensus mechanism to +enforce decreased risk for each client after each training round. In +particular, we allow a slight adjustment to a client's gradient on the server +side, which generates an acute angle between the corrected gradient and the +original gradients of the other clients. We theoretically show that the consensus +mechanism can guarantee the convergence of the global objective. To generalize +the consensus mechanism to the partial participation FL scenario, we devise a +novel client sampling strategy to select the most representative clients for +the global data distribution. Training on these selected clients with the +consensus mechanism could empirically lead to risk decrease for clients that +are not selected. Finally, we conduct extensive experiments on four benchmark +datasets to show the superiority of FedCOME against other state-of-the-art +methods in terms of effectiveness, efficiency and fairness. For +reproducibility, we make our source code publicly available at: +\url{https://github.com/fedcome/fedcome}. + +
+
+
+
+
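The FedCOME abstract above hinges on a server-side correction that makes each client's gradient form an acute angle with the other clients' gradients. The exact correction rule is the paper's; the sketch below only illustrates the geometric idea with a single projection against the mean of the other clients' gradients, so the function name and details are assumptions.

```python
import numpy as np

def consensus_correct(grads):
    """Server-side gradient correction sketch (not the exact FedCOME rule).

    For each client gradient g_i, if it conflicts with the mean gradient of the
    other clients (negative inner product), remove the conflicting component so
    the corrected g_i is orthogonal to, or forms an acute angle with, that mean.
    """
    grads = [np.asarray(g, dtype=np.float64) for g in grads]
    corrected = []
    for i, g in enumerate(grads):
        others = np.mean([grads[j] for j in range(len(grads)) if j != i], axis=0)
        dot = float(g @ others)
        if dot < 0.0:
            g = g - (dot / float(others @ others)) * others
        corrected.append(g)
    return corrected

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    client_grads = [rng.normal(size=5) for _ in range(4)]
    fixed = consensus_correct(client_grads)
    # After correction, each client gradient has a non-negative inner product
    # with the average gradient of the remaining clients.
    for i, g in enumerate(fixed):
        others = np.mean([client_grads[j] for j in range(4) if j != i], axis=0)
        print(i, round(float(g @ others), 6))
```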
+ + ☆ Random Linear Projections Loss for Hyperplane-Based Optimization in + Regression Neural Networks + + +
+ Despite their popularity across a wide range of domains, regression neural +networks are prone to overfitting complex datasets. In this work, we propose a +loss function termed Random Linear Projections (RLP) loss, which is empirically +shown to mitigate overfitting. With RLP loss, the distance between sets of +hyperplanes connecting fixed-size subsets of the neural network's +feature-prediction pairs and feature-label pairs is minimized. The intuition +behind this loss derives from the notion that if two functions share the same +hyperplanes connecting all subsets of feature-label pairs, then these functions +must necessarily be equivalent. Our empirical studies, conducted across +benchmark datasets and representative synthetic examples, demonstrate the +improvements of the proposed RLP loss over mean squared error (MSE). +Specifically, neural networks trained with the RLP loss achieve better +performance while requiring fewer data samples and are more robust to additive +noise. We provide theoretical analysis supporting our empirical findings. + +
+
+
+
+
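The RLP abstract above defines the loss through hyperplanes fitted on fixed-size subsets of feature-prediction and feature-label pairs. The differentiable sketch below is one plausible reading, not the authors' implementation: for each random subset it solves two ridge-regularized least-squares fits and penalizes the squared distance between the resulting hyperplane coefficients; `subset_size`, `num_subsets`, and the ridge term are assumed hyperparameters.

```python
import torch

def rlp_loss(features, preds, labels, subset_size=8, num_subsets=16, ridge=1e-6):
    """Sketch of a Random-Linear-Projections-style loss: for random fixed-size
    subsets of the batch, fit an affine hyperplane from features to predictions
    and another from features to labels, and penalize the squared distance
    between the two sets of coefficients."""
    n, d = features.shape
    ones = torch.ones(n, 1, device=features.device)
    X = torch.cat([features, ones], dim=1)                 # affine hyperplanes
    eye = torch.eye(d + 1, device=features.device)
    loss = features.new_zeros(())
    for _ in range(num_subsets):
        idx = torch.randperm(n, device=features.device)[:subset_size]
        A = X[idx]                                          # (s, d+1)
        gram = A.T @ A + ridge * eye
        w_pred = torch.linalg.solve(gram, A.T @ preds[idx])
        w_true = torch.linalg.solve(gram, A.T @ labels[idx])
        loss = loss + ((w_pred - w_true) ** 2).sum()
    return loss / num_subsets

if __name__ == "__main__":
    torch.manual_seed(0)
    net = torch.nn.Sequential(torch.nn.Linear(5, 32), torch.nn.ReLU(), torch.nn.Linear(32, 1))
    x = torch.randn(64, 5)
    y = x.sum(dim=1, keepdim=True) + 0.1 * torch.randn(64, 1)
    opt = torch.optim.Adam(net.parameters(), lr=1e-2)
    for _ in range(200):
        opt.zero_grad()
        loss = rlp_loss(x, net(x), y)
        loss.backward()
        opt.step()
    print(float(loss))
```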
+ + ☆ Utilizing Language Models for Tour Itinerary Recommendation IJCAI 2023 + + +
+ Tour itinerary recommendation involves planning a sequence of relevant +Points-of-Interest (POIs), which combines challenges from the fields of both +Operations Research (OR) and Recommendation Systems (RS). As an OR problem, +there is the need to maximize a certain utility (e.g., popularity of POIs in +the tour) while adhering to some constraints (e.g., maximum time for the tour). +As an RS problem, it is heavily related to the problem of filtering or ranking a +subset of POIs that are relevant to a user and recommending them as part of an +itinerary. In this paper, we explore the use of language models for the task of +tour itinerary recommendation and planning. This task has the unique +requirement of recommending personalized POIs relevant to users and planning +these POIs as an itinerary that satisfies various constraints. We discuss some +approaches in this area, such as using word embedding techniques like Word2Vec +and GloVe for learning POI embeddings and transformer-based techniques like +BERT for generating +itineraries. + +
+
+ comment: PMAI23 @IJCAI 2023 2nd International Workshop on Process Management + in the AI era +
+
+
+
+
+ + ☆ Advancing Transformer Architecture in Long-Context Large Language + Models: A Comprehensive Survey + + +
+ With the explosion of interest ignited by ChatGPT, Transformer-based Large +Language Models (LLMs) have paved a revolutionary path toward Artificial +General Intelligence (AGI) and have been applied in diverse areas such as +knowledge bases, human interfaces, and dynamic agents. However, a prevailing +limitation exists: many current LLMs, constrained by resources, are primarily +pre-trained on shorter texts, rendering them less effective for longer-context +prompts, commonly encountered in real-world settings. In this paper, we present +a comprehensive survey focusing on the advancement of model architecture in +Transformer-based LLMs to optimize long-context capabilities across all stages +from pre-training to inference. We first delineate and analyze the problems of +handling long-context input and output with the current Transformer-based +models. Then, we offer a holistic taxonomy to navigate the landscape of +Transformer upgrades on architecture to solve these problems. Afterward, we +provide an investigation of widely used evaluation necessities tailored for +long-context LLMs, including datasets, metrics, and baseline models, as well as +optimization toolkits such as libraries, systems, and compilers to augment +LLMs' efficiency and efficacy across different stages. Finally, we further +discuss the predominant challenges and potential avenues for future research in +this domain. Additionally, we have established a repository where we curate +relevant literature with real-time updates at +https://github.com/Strivin0311/long-llms-learning. + +
+
+ comment: 35 pages, 3 figures, 4 tables +
+
+
+
+
+ + ☆ Stable Diffusion For Aerial Object Detection NeurIPS 2023 + + +
+ Aerial object detection is a challenging task, in which one major obstacle +lies in the limitations of large-scale data collection and the long-tail +distribution of certain classes. Synthetic data offers a promising solution, +especially with recent advances in diffusion-based methods like stable +diffusion (SD). However, the direct application of diffusion methods to aerial +domains poses unique challenges: stable diffusion's optimization for rich +ground-level semantics doesn't align with the sparse nature of aerial objects, +and the extraction of post-synthesis object coordinates remains problematic. To +address these challenges, we introduce a synthetic data augmentation framework +tailored for aerial images. It encompasses sparse-to-dense region of interest +(ROI) extraction to bridge the semantic gap, fine-tuning the diffusion model +with low-rank adaptation (LORA) to circumvent exhaustive retraining, and +finally, a Copy-Paste method to compose synthesized objects with backgrounds, +providing a nuanced approach to aerial object detection through synthetic data. + +
+
+ comment: Accepted at NeurIPS 2023 Synthetic Data Generation with Generative AI + workshop +
+
+
+
+
+ + ☆ Graph Neural Ordinary Differential Equations-based method for + Collaborative Filtering ICDM 2023 + + +
+ Graph Convolution Networks (GCNs) are widely considered state-of-the-art for +collaborative filtering. Although several GCN-based methods have been proposed +and achieved state-of-the-art performance in various tasks, they can be +computationally expensive and time-consuming to train if too many layers are +created. However, since the linear GCN model can be interpreted as a +differential equation, it is possible to transfer it to an ODE problem. This +inspired us to address the computational limitations of GCN-based models by +designing a simple and efficient NODE-based model that can skip some GCN layers +to reach the final state, thus avoiding the need to create many layers. In this +work, we propose a Graph Neural Ordinary Differential Equation-based method for +Collaborative Filtering (GODE-CF). This method estimates the final embedding by +utilizing the information captured by one or two GCN layers. To validate our +approach, we conducted experiments on multiple datasets. The results +demonstrate that our model outperforms competitive baselines, including +GCN-based models and other state-of-the-art CF methods. Notably, our proposed +GODE-CF model has several advantages over traditional GCN-based models. It is +simple, efficient, and has a fast training time, making it a practical choice +for real-world situations. + +
+
+ comment: Accepted by ICDM 2023 +
+
+
+
+
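The GODE-CF abstract above rests on viewing linear GCN propagation as an ODE whose solution can be reached without stacking many layers. The snippet below is a minimal sketch of that viewpoint, not the paper's model: treat E'(t) = (Â − I)E(t) and approximate E(t) = exp(t(Â − I))E(0) with a low-order Taylor series, so only one or two multiplications by Â are needed; the truncation order and toy graph are assumptions.

```python
import numpy as np

def ode_embedding(A_hat, E0, t=1.0, order=2):
    """Approximate E(t) = expm(t (A_hat - I)) @ E0 with a truncated Taylor
    series, mimicking 'skipping' stacked linear GCN layers."""
    M = t * (A_hat - np.eye(A_hat.shape[0]))
    term, out = E0.copy(), E0.copy()
    for k in range(1, order + 1):
        term = M @ term / k          # k-th Taylor term of the matrix exponential
        out = out + term
    return out

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    # toy symmetric-normalized adjacency for a 6-node interaction graph
    A = (rng.random((6, 6)) < 0.4).astype(float)
    A = np.triu(A, 1); A = A + A.T
    deg = np.clip(A.sum(1), 1, None)
    A_hat = A / np.sqrt(np.outer(deg, deg))
    E0 = rng.normal(size=(6, 4))     # initial node embeddings
    print(ode_embedding(A_hat, E0, t=1.0, order=2).round(3))
```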
+ + ☆ Modeling Political Orientation of Social Media Posts: An Extended + Analysis + + +
+ Developing machine learning models to characterize political polarization on +online social media presents significant challenges. These challenges mainly +stem from various factors such as the lack of annotated data, presence of noise +in social media datasets, and the sheer volume of data. The common research +practice typically examines the biased structure of online user communities for +a given topic or qualitatively measures the impacts of polarized topics on +social media. However, there is limited work focusing on analyzing polarization +at the ground level, specifically in the social media posts themselves. Such +existing analysis heavily relies on annotated data, which often requires +laborious human labeling, offers labels only to specific problems, and lacks +the ability to determine the near-future bias state of social media +conversations. Understanding the degree of political orientation conveyed in +social media posts is crucial for quantifying the bias of online user +communities and investigating the spread of polarized content. In this work, we +first introduce two heuristic methods that leverage news media bias and post +content to label social media posts. Next, we compare the efficacy and quality +of the heuristically labeled dataset with a randomly sampled human-annotated +dataset. Additionally, we demonstrate that current machine learning models can +exhibit improved performance in predicting the political orientation of social +media posts, employing both traditional supervised learning and few-shot +learning setups. We conduct experiments using the proposed heuristic methods +and machine learning approaches to predict the political orientation of posts +collected from two social media forums with diverse political ideologies: Gab +and Twitter. + +
+
+
+
+
+ + ☆ IEKM: A Model Incorporating External Keyword Matrices + + +
+ A customer service platform system with a core text semantic similarity (STS) +task faces two urgent challenges: First, a single platform system needs to +adapt to customers from different domains, i.e., different domain adaptation +(DDA). Second, it is difficult for the model of the platform system to +distinguish sentence pairs that are literally close but semantically different, +i.e., hard negative samples. In this paper, we propose a model incorporating +external keyword matrices (IEKM) to address these challenges. The model uses +external tools or dictionaries to construct external matrices and fuses them +into the self-attention layers of the Transformer structure through gating +units, thus enabling flexible corrections to the model results. We evaluate the +method on multiple datasets and the results show that our method improves +performance on all datasets. To demonstrate that our method can effectively +solve all the above challenges, we conduct a flexible correction experiment, +which results in an increase in the F1 value from 56.61 to 73.53. Our code will +be publicly available. + +
+
+
+
+
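The IEKM abstract above fuses externally constructed keyword matrices into Transformer self-attention through gating units. The sketch below shows one natural way to do that, adding the external matrix as a gated pre-softmax bias in a single attention head; the fusion point and the scalar gate are assumptions rather than the paper's exact design.

```python
import torch
import torch.nn as nn

class GatedExternalAttention(nn.Module):
    """Single-head self-attention with an external keyword matrix fused in
    through a learned gate (illustrative sketch, not the IEKM code)."""
    def __init__(self, dim):
        super().__init__()
        self.q = nn.Linear(dim, dim)
        self.k = nn.Linear(dim, dim)
        self.v = nn.Linear(dim, dim)
        self.gate = nn.Parameter(torch.zeros(1))   # starts with no external influence
        self.scale = dim ** -0.5

    def forward(self, x, ext_matrix):
        # x: (batch, seq, dim); ext_matrix: (batch, seq, seq) keyword-match scores
        q, k, v = self.q(x), self.k(x), self.v(x)
        scores = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        scores = scores + torch.sigmoid(self.gate) * ext_matrix
        return torch.matmul(torch.softmax(scores, dim=-1), v)

if __name__ == "__main__":
    torch.manual_seed(0)
    attn = GatedExternalAttention(16)
    x = torch.randn(2, 5, 16)
    # e.g. 1.0 where an external dictionary says two tokens share a keyword
    ext = (torch.rand(2, 5, 5) > 0.8).float()
    print(attn(x, ext).shape)   # torch.Size([2, 5, 16])
```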
+ + ☆ Power grid operational risk assessment using graph neural network + surrogates + + +
+ We investigate the utility of graph neural networks (GNNs) as proxies of +power grid operational decision-making algorithms (optimal power flow (OPF) and +security-constrained unit commitment (SCUC)) to enable rigorous quantification +of the operational risk. To conduct principled risk analysis, numerous Monte +Carlo (MC) samples are drawn from the (forecasted) probability distributions of +spatio-temporally correlated stochastic grid variables. The corresponding OPF +and SCUC solutions, which are needed to quantify the risk, are computed using +traditional OPF and SCUC solvers to generate data for training GNN model(s). +The GNN model performance is evaluated in terms of the accuracy of predicting +quantities of interest (QoIs) derived from the decision variables in OPF and +SCUC. Specifically, we focus on thermal power generation and load shedding at +the system and individual zone levels. We also perform reliability and risk +quantification based on GNN predictions and compare it with that obtained from +OPF/SCUC solutions. Our results demonstrate that GNNs are capable of providing +fast and accurate prediction of QoIs and thus can be good surrogate models for +OPF and SCUC. The excellent accuracy of GNN-based reliability and risk +assessment further suggests that GNN surrogates have the potential to be applied +in real-time and hours-ahead risk quantification. + +
+
+ comment: Manuscript submitted to IEEE PES GM 2024 +
+
+
+
+
+ + ☆ Discovering Effective Policies for Land-Use Planning + + +
+ How areas of land are allocated for different uses, such as forests, urban, +and agriculture, has a large effect on carbon balance, and therefore climate +change. Based on available historical data on changes in land use and a +simulation of carbon emissions/absorption, a surrogate model can be learned +that makes it possible to evaluate the different options available to +decision-makers efficiently. An evolutionary search process can then be used to +discover effective land-use policies for specific locations. Such a system was +built on the Project Resilience platform and evaluated with the Land-Use +Harmonization dataset and the BLUE simulator. It generates Pareto fronts that +trade off carbon impact and amount of change customized to different locations, +thus providing a potentially useful tool for land-use planning. + +
+
+
+
+
+ + ☆ Detecting subtle macroscopic changes in a finite temperature classical + scalar field with machine learning + + +
+ The ability to detect macroscopic changes is important for probing the +behaviors of experimental many-body systems from the classical to the quantum +realm. Although abrupt changes near phase boundaries can easily be detected, +subtle macroscopic changes are much more difficult to detect as the changes can +be obscured by noise. In this study, as a toy model for detecting subtle +macroscopic changes in many-body systems, we try to differentiate scalar field +samples at varying temperatures. We compare different methods for making such +differentiations, ranging from a physics-based method and a statistical method +to an AI method. Our findings suggest that the AI method outperforms both the +statistical method and the physics method in its sensitivity. Our result +provides a proof-of-concept that AI can potentially detect macroscopic changes +in many-body systems that elude physical measures. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ☆ Mapping "Brain Coral" Regions on Mars using Deep Learning + + +
+ One of the main objectives of the Mars Exploration Program is to search for +evidence of past or current life on the planet. To achieve this, Mars +exploration has been focusing on regions that may have liquid or frozen water. +A set of critical areas may have seen cycles of ice thawing in the relatively +recent past in response to periodic changes in the obliquity of Mars. In this +work, we use convolutional neural networks to detect surface regions containing +"Brain Coral" terrain, a landform on Mars whose similarity in morphology and +scale to sorted stone circles on Earth suggests that it may have formed as a +consequence of freeze/thaw cycles. We use large images (~100-1000 megapixels) +from the Mars Reconnaissance Orbiter to search for these landforms at +resolutions close to a few tens of centimeters per pixel (~25--50 cm). Over +52,000 images (~28 TB) were searched (~5% of the Martian surface) where we +found detections in over 200 images. To expedite the processing we leverage a +classifier network (prior to segmentation) in the Fourier domain that can take +advantage of JPEG compression by leveraging blocks of coefficients from a +discrete cosine transform in lieu of decoding the entire image at the full +spatial resolution. The hybrid pipeline approach maintains ~93% accuracy while +cutting down on ~95% of the total processing time compared to running the +segmentation network at the full resolution on every image. The timely +processing of big data sets helps inform mission operations, geologic surveys +to prioritize candidate landing sites, avoid hazardous areas, or map the +spatial extent of certain terrain. The segmentation masks and source code are +available on Github for the community to explore and build upon. + +
+
+ comment: Submitted for publication, seeking comments from the community. Code + available: https://github.com/pearsonkyle/Mars-Brain-Coral-Network +
+
+
+
+
+ + ☆ A note on estimating the dimension from a random geometric graph + + +
+ Let $G_n$ be a random geometric graph with vertex set $[n]$ based on $n$ +i.i.d.\ random vectors $X_1,\ldots,X_n$ drawn from an unknown density $f$ on +$\mathbb{R}^d$. An edge $(i,j)$ is present when $\|X_i - X_j\| \le r_n$, for a given +threshold $r_n$ possibly depending upon $n$, where $\| \cdot \|$ denotes +Euclidean distance. We study the problem of estimating the dimension $d$ of the +underlying space when we have access to the adjacency matrix of the graph but +do not know $r_n$ or the vectors $X_i$. The main result of the paper is that +there exists an estimator of $d$ that converges to $d$ in probability as $n \to +\infty$ for all densities with $\int f^5 < \infty$ whenever $n^{3/2} r_n^d \to +\infty$ and $r_n = o(1)$. The conditions allow very sparse graphs since when +$n^{3/2} r_n^d \to 0$, the graph contains isolated edges only, with high +probability. We also show that, without any condition on the density, a +consistent estimator of $d$ exists when $n r_n^d \to \infty$ and $r_n = o(1)$. + +
+
+
+
+
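The note above proves consistency of a dimension estimator given only the adjacency matrix. The sketch below is not that estimator; it illustrates a simpler folklore heuristic under the same access model: for small thresholds and a smooth density, the transitivity of a random geometric graph converges to a constant depending only on d, so one can match the observed clustering against Monte Carlo reference values. The toy demo uses a torus metric purely to avoid boundary effects; all constants are illustrative.

```python
import numpy as np

def transitivity(A):
    """Global clustering coefficient of an undirected 0/1 adjacency matrix."""
    deg = A.sum(1)
    paths = float((deg * (deg - 1)).sum())
    return float(np.trace(A @ A @ A)) / paths if paths > 0 else 0.0

def reference_clustering(d, samples=200_000, seed=0):
    """Monte Carlo estimate of the limiting RGG clustering in dimension d: the
    probability that two points drawn uniformly from a unit ball lie within
    distance 1 of each other."""
    rng = np.random.default_rng(seed)
    def unit_ball(n):
        g = rng.normal(size=(n, d))
        g /= np.linalg.norm(g, axis=1, keepdims=True)
        return g * rng.random(size=(n, 1)) ** (1.0 / d)
    p, q = unit_ball(samples), unit_ball(samples)
    return float((np.linalg.norm(p - q, axis=1) <= 1.0).mean())

def estimate_dimension(A, max_dim=8):
    """Pick the dimension whose limiting clustering best matches the graph."""
    obs = transitivity(A)
    refs = {d: reference_clustering(d) for d in range(1, max_dim + 1)}
    return min(refs, key=lambda d: abs(refs[d] - obs))

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    d_true, n, r = 3, 1500, 0.12
    X = rng.random((n, d_true))
    diff = np.abs(X[:, None, :] - X[None, :, :])
    diff = np.minimum(diff, 1.0 - diff)        # torus metric: no boundary effects
    D = np.linalg.norm(diff, axis=-1)
    A = ((D <= r) & (D > 0)).astype(float)
    print("estimated dimension:", estimate_dimension(A))
```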
+ + ☆ Novel OCT mosaicking pipeline with Feature- and Pixel-based registration + + +
+ High-resolution Optical Coherence Tomography (OCT) images are crucial for +ophthalmology studies but are limited by their relatively narrow field of view +(FoV). Image mosaicking is a technique for aligning multiple overlapping images +to obtain a larger FoV. Current mosaicking pipelines often struggle with +substantial noise and considerable displacement between the input sub-fields. +In this paper, we propose a versatile pipeline for stitching multi-view +OCT/OCTA \textit{en face} projection images. Our method combines the strengths +of learning-based feature matching and robust pixel-based registration to align +multiple images effectively. Furthermore, we advance the application of a +trained foundational model, Segment Anything Model (SAM), to validate +mosaicking results in an unsupervised manner. The efficacy of our pipeline is +validated using an in-house dataset and a large public dataset, where our +method shows superior performance in terms of both accuracy and computational +efficiency. We also made our evaluation tool for image mosaicking and the +corresponding pipeline publicly available at +\url{https://github.com/MedICL-VU/OCT-mosaicking}. + +
+
+
+
+
+ + ☆ Multi-fidelity Bayesian Optimization in Engineering Design + + +
+ Residing at the intersection of multi-fidelity optimization (MFO) and Bayesian +optimization (BO), MF BO has found a niche in solving expensive engineering +design optimization problems, thanks to its advantages in incorporating +physical and mathematical understandings of the problems, saving resources, +addressing the exploitation-exploration trade-off, considering uncertainty, and +supporting parallel computing. The increasing number of works dedicated to MF +BO suggests the need for a comprehensive review of this advanced optimization +technique. In this paper, we survey recent developments of two essential +ingredients of MF BO: Gaussian process (GP) based MF surrogates and acquisition +functions. We first categorize the existing MF modeling methods and MFO +strategies to locate MF BO in a large family of surrogate-based optimization +and MFO algorithms. We then exploit the common properties shared between the +methods from each ingredient of MF BO to describe important GP-based MF +surrogate models and review various acquisition functions. By doing so, we +expect to provide a structured understanding of MF BO. Finally, we attempt to +reveal important aspects that require further research for applications of MF +BO in solving intricate yet important design optimization problems, including +constrained optimization, high-dimensional optimization, optimization under +uncertainty, and multi-objective optimization. + +
+
+
+
+
+ + ☆ Do we listen to what we are told? An empirical study on human behaviour + during the COVID-19 pandemic: neural networks vs. regression analysis + + +
+ In this work, we contribute the first visual open-source empirical study on +human behaviour during the COVID-19 pandemic, in order to investigate how +compliant a general population is to mask-wearing-related public-health policy. +Object-detection-based convolutional neural networks, regression analysis and +multilayer perceptrons are combined to analyse visual data of the Viennese +public during 2020. We find that mask-wearing-related government regulations +and public-transport announcements encouraged correct mask-wearing-behaviours +during the COVID-19 pandemic. Importantly, changes in announcement and +regulation contents led to heterogeneous effects on people's behaviour. +Comparing the predictive power of regression analysis and neural networks, we +demonstrate that the latter produces more accurate predictions of population +reactions during the COVID-19 pandemic. Our use of regression modelling also +allows us to unearth possible causal pathways underlying societal behaviour. +Since our findings highlight the importance of appropriate communication +contents, our results will facilitate more effective non-pharmaceutical +interventions to be developed in future. Adding to the literature, we +demonstrate that regression modelling and neural networks are not mutually +exclusive but instead complement each other. + +
+
+
+
+
+ + ☆ Synaptic Sampling of Neural Networks + + +
+ Probabilistic artificial neural networks offer intriguing prospects for +enabling the uncertainty of artificial intelligence methods to be described +explicitly in their function; however, the development of techniques that +quantify uncertainty by well-understood methods such as Monte Carlo sampling +has been limited by the high costs of stochastic sampling on deterministic +computing hardware. Emerging computing systems that are amenable to +hardware-level probabilistic computing, such as those that leverage stochastic +devices, may make probabilistic neural networks more feasible in the +not-too-distant future. This paper describes the scANN technique -- +\textit{sampling (by coinflips) artificial neural networks} -- which enables +neural networks to be sampled directly by treating the weights as Bernoulli +coin flips. This method is natively well suited for probabilistic computing +techniques that focus on tunable stochastic devices, and it nearly matches fully +deterministic performance while also describing the uncertainty of correct and +incorrect neural network outputs. + +
+
+ comment: 9 pages, accepted to 2023 IEEE International Conference on Rebooting + Computing +
+
+
+
+
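The scANN abstract above samples a network by treating weights as Bernoulli coin flips. A minimal sketch of that idea, with the scaling scheme being an assumption rather than the paper's: each weight is kept at plus or minus the layer's maximum magnitude with probability |w|/w_max and zeroed otherwise, which leaves the sampled weight matrix unbiased, and repeated forward passes give an output distribution whose spread can serve as an uncertainty signal.

```python
import numpy as np

def sample_weights(W, rng):
    """One 'coin-flip' sample of a weight matrix: each weight is kept at
    +/- w_max with probability |w| / w_max and zeroed otherwise, so the
    sampled matrix is an unbiased estimate of W."""
    w_max = np.abs(W).max()
    keep = rng.random(W.shape) < np.abs(W) / w_max
    return np.sign(W) * w_max * keep

def stochastic_forward(x, layers, rng):
    for W, b in layers:
        x = np.maximum(x @ sample_weights(W, rng).T + b, 0.0)  # ReLU MLP
    return x

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    layers = [(0.3 * rng.normal(size=(16, 8)), np.zeros(16)),
              (0.3 * rng.normal(size=(1, 16)), np.zeros(1))]
    x = rng.normal(size=(1, 8))
    samples = np.array([stochastic_forward(x, layers, rng) for _ in range(500)])
    det = x.copy()
    for W, b in layers:                       # deterministic reference pass
        det = np.maximum(det @ W.T + b, 0.0)
    print("deterministic:", det.ravel())
    print("sampled mean :", samples.mean(axis=0).ravel(),
          "+/-", samples.std(axis=0).ravel())
```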
+ + ☆ Favour: FAst Variance Operator for Uncertainty Rating + + +
+ Bayesian Neural Networks (BNNs) have emerged as a crucial approach for +interpreting ML predictions. By sampling from the posterior distribution, data +scientists may estimate the uncertainty of an inference. Unfortunately, many +inference samples are often needed, and the resulting overhead greatly hinders +BNNs' wide adoption. To mitigate this, previous work proposed propagating the first +and second moments of the posterior directly through the network. However, on +its own this method is even slower than sampling, so the propagated variance +needs to be approximated, for example by assuming independence between neural nodes. +The resulting trade-off between quality and inference time did not match even +plain Monte Carlo sampling. + Our contribution is a more principled variance propagation framework based on +"spiked covariance matrices", which smoothly interpolates between quality and +inference time. This is made possible by a new fast algorithm for updating a +diagonal-plus-low-rank matrix approximation under various operations. We tested +our algorithm against sampling-based MC Dropout and Variational Inference on a +number of downstream uncertainty-themed tasks, such as calibration and +out-of-distribution testing. We find that Favour is as fast as performing 2-3 +inference samples, while matching the performance of 10-100 samples. + In summary, this work enables the use of BNNs in performance-critical tasks +where they have previously been out of reach. + +
+
+
+
+
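The Favour abstract above propagates posterior moments through the network while maintaining a diagonal-plus-low-rank ("spiked") covariance. The sketch below shows the shape of such an update for a single linear layer; the specific choice of pushing the dense term back onto the diagonal is an illustrative assumption, not the paper's algorithm.

```python
import numpy as np

def propagate_linear(W, b, mean, diag, U):
    """Propagate a Gaussian with spiked covariance diag(diag) + U @ U.T through
    y = W x + b. The low-rank part maps exactly to (W U)(W U).T, while
    W diag(diag) W.T is re-approximated by its diagonal (the approximation)."""
    new_mean = W @ mean + b
    new_U = W @ U                     # exact low-rank update
    new_diag = (W ** 2) @ diag        # diagonal of W D W.T; off-diagonals dropped
    return new_mean, new_diag, new_U

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    d_in, d_out, rank = 6, 4, 2
    W, b = rng.normal(size=(d_out, d_in)), rng.normal(size=d_out)
    mean, diag = rng.normal(size=d_in), rng.random(d_in)
    U = 0.3 * rng.normal(size=(d_in, rank))

    m, dg, U2 = propagate_linear(W, b, mean, diag, U)
    approx_cov = np.diag(dg) + U2 @ U2.T
    exact_cov = W @ (np.diag(diag) + U @ U.T) @ W.T
    # per-unit variances (the covariance diagonal) are preserved exactly
    print(np.allclose(np.diag(approx_cov), np.diag(exact_cov)))  # True
```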
+ + ☆ DMLR: Data-centric Machine Learning Research -- Past, Present and Future ICML 2023 + + +
+ Drawing from discussions at the inaugural DMLR workshop at ICML 2023 and +meetings prior, in this report we outline the relevance of community engagement +and infrastructure development for the creation of next-generation public +datasets that will advance machine learning science. We chart a path forward as +a collective effort to sustain the creation and maintenance of these datasets +and methods towards positive scientific, societal and business impact. + +
+
+ comment: This editorial report accompanies the inaugural Data-centric Machine + Learning Research (DMLR) Workshop that took place at ICML 2023 + https://dmlr.ai/ +
+
+
+
+
+ + ☆ Unsupervised Multimodal Surface Registration with Geometric Deep + Learning + + +
+ This paper introduces GeoMorph, a novel geometric deep-learning framework +designed for image registration of cortical surfaces. The registration process +consists of two main steps. First, independent feature extraction is performed +on each input surface using graph convolutions, generating low-dimensional +feature representations that capture important cortical surface +characteristics. Subsequently, features are registered in a deep-discrete +manner to optimize the overlap of common structures across surfaces by learning +displacements of a set of control points. To ensure smooth and biologically +plausible deformations, we implement regularization through a deep conditional +random field implemented with a recurrent neural network. Experimental results +demonstrate that GeoMorph surpasses existing deep-learning methods by achieving +improved alignment with smoother deformations. Furthermore, GeoMorph exhibits +competitive performance compared to classical frameworks. Such versatility and +robustness suggest strong potential for various neuroscience applications. + +
+
+
+
+
+ + ☆ Fast and Interpretable Mortality Risk Scores for Critical Care Patients + + +
+ Prediction of mortality in intensive care unit (ICU) patients is an important +task in critical care medicine. Prior work in creating mortality risk models +falls into two major categories: domain-expert-created scoring systems, and +black box machine learning (ML) models. Both of these have disadvantages: black +box models are unacceptable for use in hospitals, whereas manual creation of +models (including hand-tuning of logistic regression parameters) relies on +humans to perform high-dimensional constrained optimization, which leads to a +loss in performance. In this work, we bridge the gap between accurate black box +models and hand-tuned interpretable models. We build on modern interpretable ML +techniques to design accurate and interpretable mortality risk scores. We +leverage the largest existing public ICU monitoring datasets, namely the MIMIC +III and eICU datasets. By evaluating risk across medical centers, we are able +to study generalization across domains. In order to customize our risk score +models, we develop a new algorithm, GroupFasterRisk, which has several +important benefits: (1) it uses a hard sparsity constraint, allowing users to +directly control the number of features; (2) it incorporates group sparsity to +allow more cohesive models; (3) it allows for monotonicity correction on models +to incorporate domain knowledge; (4) it produces many equally-good models at +once, which allows domain experts to choose among them. GroupFasterRisk creates +its risk scores within hours, even on the large datasets we study here. +GroupFasterRisk's risk scores perform better than risk scores currently used in +hospitals, and have similar prediction performance to black box ML models +(despite being much sparser). Because GroupFasterRisk produces a variety of +risk scores and handles constraints, it allows design flexibility, which is the +key enabler of practical and trustworthy model creation. + +
+
+
+
+
+ + ♻ ☆ PyTorch Geometric Signed Directed: A Software Package on Graph Neural + Networks for Signed and Directed Graphs + + +
+ Networks are ubiquitous in many real-world applications (e.g., social +networks encoding trust/distrust relationships, correlation networks arising +from time series data). While many networks are signed or directed, or both, +there is a lack of unified software packages on graph neural networks (GNNs) +specially designed for signed and directed networks. In this paper, we present +PyTorch Geometric Signed Directed (PyGSD), a software package which fills this +gap. Along the way, we evaluate the implemented methods with experiments with a +view to providing insights into which method to choose for a given task. The +deep learning framework consists of easy-to-use GNN models, synthetic and +real-world data, as well as task-specific evaluation metrics and loss functions +for signed and directed networks. As an extension library for PyG, our proposed +software is maintained with open-source releases, detailed documentation, +continuous integration, unit tests and code coverage checks. The GitHub +repository of the library is +https://github.com/SherylHYX/pytorch_geometric_signed_directed. + +
+
+ comment: Accepted by LoG 2023. 27 pages in total +
+
+
+
+
+ + ♻ ☆ Offline Imitation from Observation via Primal Wasserstein State + Occupancy Matching NeurIPS 2023 + + +
+ In real-world scenarios, arbitrary interactions with the environment can +often be costly, and actions of expert demonstrations are not always available. +To reduce the need for both, Offline Learning from Observations (LfO) is +extensively studied, where the agent learns to solve a task with only expert +states and \textit{task-agnostic} non-expert state-action pairs. The +state-of-the-art DIstribution Correction Estimation (DICE) methods minimize the +state occupancy divergence between the learner and expert policies. However, +they are limited to either $f$-divergences (KL and $\chi^2$) or Wasserstein +distance with Rubinstein duality, the latter of which constrains the underlying +distance metric crucial to the performance of Wasserstein-based solutions. To +address this problem, we propose Primal Wasserstein DICE (PW-DICE), which +minimizes the primal Wasserstein distance between the expert and learner state +occupancies with a pessimistic regularizer and leverages a contrastively +learned distance as the underlying metric for the Wasserstein distance. +Theoretically, we prove that our framework is a generalization of the +state-of-the-art, SMODICE, and unifies $f$-divergence and Wasserstein +minimization. Empirically, we find that PW-DICE improves upon several +state-of-the-art methods on multiple testbeds. + +
+
+ comment: 23 pages. Accepted to the Optimal Transport and Machine Learning + Workshop at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Topological properties of basins of attraction and expressiveness of + width bounded neural networks + + +
+ In Radhakrishnan et al. [2020], the authors empirically show that +autoencoders trained with usual SGD methods shape out basins of attraction +around their training data. We consider network functions of width not +exceeding the input dimension and prove that in this situation basins of +attraction are bounded and their complement cannot have bounded components. Our +conditions in these results are met in several experiments of the latter work +and we thus address a question posed therein. We also show that under some more +restrictive conditions the basins of attraction are path-connected. The +tightness of the conditions in our results is demonstrated by means of several +examples. Finally, the arguments used to prove the above results allow us to +derive a root cause why scalar-valued neural network functions that fulfill our +bounded width condition are not dense in spaces of continuous functions. + +
+
+
+
+
+ + ♻ ☆ Task Arithmetic in the Tangent Space: Improved Editing of Pre-Trained + Models + + +
+ Task arithmetic has recently emerged as a cost-effective and scalable +approach to edit pre-trained models directly in weight space: By adding the +fine-tuned weights of different tasks, the model's performance can be improved +on these tasks, while negating them leads to task forgetting. Yet, our +understanding of the effectiveness of task arithmetic and its underlying +principles remains limited. We present a comprehensive study of task arithmetic +in vision-language models and show that weight disentanglement is the crucial +factor that makes it effective. This property arises during pre-training and +manifests when distinct directions in weight space govern separate, localized +regions in function space associated with the tasks. Notably, we show that +fine-tuning models in their tangent space by linearizing them amplifies weight +disentanglement. This leads to substantial performance improvements across +multiple task arithmetic benchmarks and diverse models. Building on these +findings, we provide theoretical and empirical analyses of the neural tangent +kernel (NTK) of these models and establish a compelling link between task +arithmetic and the spatial localization of the NTK eigenfunctions. Overall, our +work uncovers novel insights into the fundamental mechanisms of task arithmetic +and offers a more reliable and effective approach to edit pre-trained models +through the NTK linearization. + +
+
+
+
+
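The abstract above builds on task arithmetic, i.e. editing a model by adding or subtracting fine-tuned weight differences. The weight-space arithmetic itself is standard and sketched below; the paper's contribution (fine-tuning in the tangent space via linearization) changes how the fine-tuned weights are produced, not this bookkeeping. The toy fine-tuned models are stand-ins for real training runs.

```python
import torch

def task_vector(pretrained, finetuned):
    """Task vector = fine-tuned weights minus pre-trained weights (per tensor)."""
    return {k: finetuned[k] - pretrained[k] for k in pretrained}

def apply_task_vectors(pretrained, task_vectors, coeffs):
    """Edit the pre-trained model in weight space: add (to learn) or subtract
    (to forget) scaled task vectors."""
    edited = {k: v.clone() for k, v in pretrained.items()}
    for tv, a in zip(task_vectors, coeffs):
        for k in edited:
            edited[k] += a * tv[k]
    return edited

if __name__ == "__main__":
    torch.manual_seed(0)
    base = torch.nn.Linear(4, 2)
    ft_a = torch.nn.Linear(4, 2); ft_a.load_state_dict(base.state_dict())
    ft_b = torch.nn.Linear(4, 2); ft_b.load_state_dict(base.state_dict())
    with torch.no_grad():                      # stand-ins for two fine-tuning runs
        ft_a.weight += 0.1
        ft_b.bias += 0.5
    tvs = [task_vector(base.state_dict(), ft_a.state_dict()),
           task_vector(base.state_dict(), ft_b.state_dict())]
    merged = apply_task_vectors(base.state_dict(), tvs, coeffs=[1.0, -1.0])
    base.load_state_dict(merged)               # "learn" task A, "negate" task B
    print(base.weight.data, base.bias.data)
```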
+ + ♻ ☆ Banach-Tarski Embeddings and Transformers + + +
+ We introduce a new construction of embeddings of arbitrary recursive data +structures into high dimensional vectors. These embeddings provide an +interpretable model for the latent state vectors of transformers. We +demonstrate that these embeddings can be decoded to the original data structure +when the embedding dimension is sufficiently large. This decoding algorithm has +a natural implementation as a transformer. We also show that these embedding +vectors can be manipulated directly to perform computations on the underlying +data without decoding. As an example we present an algorithm that constructs +the embedded parse tree of an embedded token sequence using only vector +operations in embedding space. + +
+
+ comment: 22 pages, 7 figures. v2: Fixed order of matrix multiplication in + section 2.4 +
+
+
+
+
+ + ♻ ☆ Compressive Fourier collocation methods for high-dimensional diffusion + equations with periodic boundary conditions + + +
+ High-dimensional Partial Differential Equations (PDEs) are a popular +mathematical modelling tool, with applications ranging from finance to +computational chemistry. However, standard numerical techniques for solving +these PDEs are typically affected by the curse of dimensionality. In this work, +we tackle this challenge while focusing on stationary diffusion equations +defined over a high-dimensional domain with periodic boundary conditions. +Inspired by recent progress in sparse function approximation in high +dimensions, we propose a new method called compressive Fourier collocation. +Combining ideas from compressive sensing and spectral collocation, our method +replaces the use of structured collocation grids with Monte Carlo sampling and +employs sparse recovery techniques, such as orthogonal matching pursuit and +$\ell^1$ minimization, to approximate the Fourier coefficients of the PDE +solution. We conduct a rigorous theoretical analysis showing that the +approximation error of the proposed method is comparable with the best $s$-term +approximation (with respect to the Fourier basis) to the solution. Using the +recently introduced framework of random sampling in bounded Riesz systems, our +analysis shows that the compressive Fourier collocation method mitigates the +curse of dimensionality with respect to the number of collocation points under +sufficient conditions on the regularity of the diffusion coefficient. We also +present numerical experiments that illustrate the accuracy and stability of the +method for the approximation of sparse and compressible solutions. + +
+
+ comment: 34 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Editing Personality for LLMs + + +
+ This paper introduces an innovative task focused on editing the personality +traits of Large Language Models (LLMs). This task seeks to adjust the models' +responses to opinion-related questions on specified topics since an +individual's personality often manifests in the form of their expressed +opinions, thereby showcasing different personality traits. Specifically, we +construct a new benchmark dataset PersonalityEdit to address this task. Drawing +on the theory in Social Psychology, we isolate three representative traits, +namely Neuroticism, Extraversion, and Agreeableness, as the foundation for our +benchmark. We then gather data using GPT-4, generating responses that not only +align with a specified topic but also embody the targeted personality trait. We +conduct comprehensive experiments involving various baselines and discuss the +representation of personality behavior in LLMs. Our intriguing findings uncover +potential challenges of the proposed task, illustrating several remaining +issues. We anticipate that our work can provide the NLP community with +insights. Code and datasets will be released at +https://github.com/zjunlp/EasyEdit. + +
+
+ comment: Work in progress, add more experiments +
+
+
+
+
+ + ♻ ☆ Unveiling the Pitfalls of Knowledge Editing for Large Language Models + + +
+ As the cost associated with fine-tuning Large Language Models (LLMs) +continues to rise, recent research efforts have pivoted towards developing +methodologies to edit implicit knowledge embedded within LLMs. Yet, a dark +cloud lingers overhead -- will knowledge editing trigger a butterfly effect? It +is still unclear whether knowledge editing might introduce side effects that +pose potential risks. This paper pioneers the investigation into the potential +pitfalls associated with knowledge editing for LLMs. To achieve this, we +introduce new benchmark datasets and propose innovative evaluation metrics. Our +results underline two pivotal concerns: (1) Knowledge Conflict: Editing groups +of facts that logically clash can magnify the inherent inconsistencies in LLMs +-- a facet neglected by previous methods. (2) Knowledge Distortion: Altering +parameters with the aim of editing factual knowledge can irrevocably warp the +innate knowledge structure of LLMs. Experimental results vividly demonstrate +that knowledge editing might inadvertently cast a shadow of unintended +consequences on LLMs, which warrants attention and effort in future work. Code +is available at +https://github.com/zjunlp/PitfallsKnowledgeEditing. + +
+
+ comment: Work in progress, add more experiments +
+
+
+
+
+ + ♻ ☆ An Automated Pipeline for Tumour-Infiltrating Lymphocyte Scoring in + Breast Cancer + + +
+ Tumour-infiltrating lymphocytes (TILs) are considered valuable prognostic +markers in both triple-negative and human epidermal growth factor receptor 2 +(HER2) positive breast cancer. In this study, we introduce an innovative deep +learning pipeline based on the Efficient-UNet architecture to predict the TILs +score for breast cancer whole-slide images (WSIs). We first segment tumour and +stromal regions in order to compute a tumour bulk mask. We then detect TILs +within the tumour-associated stroma, generating a TILs score by closely +mirroring the pathologist's workflow. Our method exhibits state-of-the-art +performance in tumour/stroma segmentation and TILs detection, as demonstrated +by internal cross-validation on the TiGER Challenge training dataset and +evaluation on the final leaderboards. Additionally, our TILs score proves +competitive in predicting survival outcomes within the same challenge, +underscoring the clinical relevance and potential of our automated TILs scoring +pipeline as a breast cancer prognostic tool. + +
+
+ comment: 5 pages, 1 figure, 2 tables +
+
+
+
+
+ + ♻ ☆ Stable Adam Optimization for 16-bit Neural Networks Training + + +
+ In this research, we address critical concerns related to the numerical +instability observed in 16-bit computations of machine learning models. Such +instability, particularly when employing popular optimization algorithms like +Adam, often leads to unstable training of deep neural networks. This not only +disrupts the learning process but also poses significant challenges in +deploying dependable models in real-world applications. Our investigation +identifies the epsilon hyperparameter as the primary source of this +instability. A nuanced exploration reveals that subtle adjustments to epsilon +within 16-bit computations can enhance the numerical stability of Adam, +enabling more stable training of 16-bit neural networks. We propose a novel, +dependable approach that leverages updates from the Adam optimizer to bolster +the stability of the learning process. Our contributions provide deeper +insights into optimization challenges in low-precision computations and offer +solutions to ensure the stability of deep neural network training, paving the +way for their dependable use in various applications. + +
+
+
+
+
+ + ♻ ☆ BrainWash: A Poisoning Attack to Forget in Continual Learning + + +
+ Continual learning has gained substantial attention within the deep learning +community, offering promising solutions to the challenging problem of +sequential learning. Yet, a largely unexplored facet of this paradigm is its +susceptibility to adversarial attacks, especially with the aim of inducing +forgetting. In this paper, we introduce "BrainWash," a novel data poisoning +method tailored to impose forgetting on a continual learner. By adding the +BrainWash noise to a variety of baselines, we demonstrate how a trained +continual learner can be induced to forget its previously learned tasks +catastrophically, even when using these continual learning baselines. An +important feature of our approach is that the attacker requires no access to +previous tasks' data and is armed merely with the model's current parameters +and the data belonging to the most recent task. Our extensive experiments +highlight the efficacy of BrainWash, showcasing degradation in performance +across various regularization-based continual learning methods. + +
+
+
+
+
+ + ♻ ☆ Leveraging High-Level Synthesis and Large Language Models to Generate, + Simulate, and Deploy a Uniform Random Number Generator Hardware Design + + +
+ We present a new high-level synthesis methodology for using large language +model tools to generate hardware designs. The methodology uses exclusively +open-source tools excluding the large language model. As a case study, we use +our methodology to generate a permuted congruential random number generator +design with a wishbone interface. We verify the functionality and quality of +the random number generator design using large language model-generated +simulations and the Dieharder randomness test suite. We document all the large +language model chat logs, Python scripts, Verilog scripts, and simulation +results used in the case study. We believe that our method of hardware design +generation coupled with the open source silicon 130 nm design tools will +revolutionize application-specific integrated circuit design. Our methodology +significantly lowers the bar to entry when building domain-specific computing +accelerators for the Internet of Things and proof of concept prototypes for +later fabrication in more modern process nodes. + +
+
+
+
+
+ + ♻ ☆ Multi-Objective Optimization Using the R2 Utility + + +
+ The goal of multi-objective optimization is to identify a collection of +points which describe the best possible trade-offs between the multiple +objectives. In order to solve this vector-valued optimization problem, +practitioners often appeal to the use of scalarization functions in order to +transform the multi-objective problem into a collection of single-objective +problems. This set of scalarized problems can then be solved using traditional +single-objective optimization techniques. In this work, we formalise this +convention into a general mathematical framework. We show how this strategy +effectively recasts the original multi-objective optimization problem into a +single-objective optimization problem defined over sets. An appropriate class +of objective functions for this new problem is the R2 utility function, which +is defined as a weighted integral over the scalarized optimization problems. We +show that this utility function is a monotone and submodular set function, +which can be optimised effectively using greedy optimization algorithms. We +analyse the performance of these greedy algorithms both theoretically and +empirically. Our analysis largely focusses on Bayesian optimization, which is a +popular probabilistic framework for black-box optimization. + +
+
+ comment: The code is available at: https://github.com/benmltu/scalarize +
+
+
+
+
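The abstract above defines the R2 utility as a weighted integral of scalarized objectives and optimizes it greedily as a monotone submodular set function. The sketch below approximates that recipe with a finite set of Dirichlet-sampled weights and a Chebyshev-style scalarization; those concrete choices, and the reference-point handling, are assumptions for illustration.

```python
import numpy as np

def r2_utility(Y_set, weights, ref):
    """Monte Carlo R2-style utility of a set of objective vectors (maximization):
    average over weight vectors of the best scalarized value in the set."""
    if len(Y_set) == 0:
        return -np.inf
    Y = np.asarray(Y_set)                                              # (m, k)
    scal = np.min(weights[:, None, :] * (Y[None, :, :] - ref), axis=2)  # (n_w, m)
    return float(scal.max(axis=1).mean())

def greedy_subset(candidates, n_select, n_weights=128, seed=0):
    """Greedily pick points maximizing the utility; greedy works well because
    the utility is a monotone submodular set function."""
    rng = np.random.default_rng(seed)
    k = candidates.shape[1]
    w = rng.dirichlet(np.ones(k), size=n_weights)    # weights on the simplex
    ref = candidates.min(axis=0) - 1e-6              # rough reference point
    chosen = []
    for _ in range(n_select):
        gains = [r2_utility(chosen + [c], w, ref) for c in candidates]
        chosen.append(candidates[int(np.argmax(gains))])
    return np.array(chosen)

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    Y = rng.random((200, 2))          # 200 candidate designs, two objectives to maximize
    print(greedy_subset(Y, n_select=5).round(3))
```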
+ + ♻ ☆ Reinforcement Learning with Maskable Stock Representation for Portfolio + Management in Customizable Stock Pools + + +
+ Portfolio management (PM) is a fundamental financial trading task, which +explores the optimal periodic reallocation of capital into different stocks +to pursue long-term profits. Reinforcement learning (RL) has recently shown its +potential to train profitable agents for PM through interacting with financial +markets. However, existing work mostly focuses on fixed stock pools, which is +inconsistent with investors' practical demand. Specifically, the target stock +pool of different investors varies dramatically due to their discrepancy on +market states, and individual investors may temporarily adjust the stocks they +desire to trade (e.g., adding a popular stock), which leads to customizable +stock pools (CSPs). Existing RL methods require retraining RL agents even for a +tiny change of the stock pool, which leads to high computational cost and +unstable performance. To tackle this challenge, we propose EarnMore, a +rEinforcement leARNing framework with Maskable stOck REpresentation to handle +PM with CSPs through one-shot training in a global stock pool (GSP). +Specifically, we first introduce a mechanism to mask out the representation of +the stocks outside the target pool. Second, we learn meaningful stock +representations through a self-supervised masking and reconstruction process. +Third, a re-weighting mechanism is designed to make the portfolio concentrate +on favorable stocks and neglect the stocks outside the target pool. Through +extensive experiments on 8 subset stock pools of the US stock market, we +demonstrate that EarnMore significantly outperforms 14 state-of-the-art +baselines in terms of 6 popular financial metrics with over 40% improvement on +profit. + +
+
+
+
+
+ + ♻ ☆ The Clock and the Pizza: Two Stories in Mechanistic Explanation of + Neural Networks NeurIPS 2023 + + +
+ Do neural networks, trained on well-understood algorithmic tasks, reliably +rediscover known algorithms for solving those tasks? Several recent studies, on +tasks ranging from group arithmetic to in-context linear regression, have +suggested that the answer is yes. Using modular addition as a prototypical +problem, we show that algorithm discovery in neural networks is sometimes more +complex. Small changes to model hyperparameters and initializations can induce +the discovery of qualitatively different algorithms from a fixed training set, +and even parallel implementations of multiple such algorithms. Some networks +trained to perform modular addition implement a familiar Clock algorithm; +others implement a previously undescribed, less intuitive, but comprehensible +procedure which we term the Pizza algorithm, or a variety of even more complex +procedures. Our results show that even simple learning problems can admit a +surprising diversity of solutions, motivating the development of new tools for +characterizing the behavior of neural networks across their algorithmic phase +space. + +
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Relphormer: Relational Graph Transformer for Knowledge Graph + Representations + + +
+ Transformers have achieved remarkable performance in widespread fields, +including natural language processing, computer vision and graph mining. +However, vanilla Transformer architectures have not yielded promising +improvements in the Knowledge Graph (KG) representations, where the +translational distance paradigm dominates this area. Note that vanilla +Transformer architectures struggle to capture the intrinsically heterogeneous +structural and semantic information of knowledge graphs. To this end, we +propose a new variant of Transformer for knowledge graph representations dubbed +Relphormer. Specifically, we introduce Triple2Seq which can dynamically sample +contextualized sub-graph sequences as the input to alleviate the heterogeneity +issue. We propose a novel structure-enhanced self-attention mechanism to encode +the relational information and keep the semantic information within entities +and relations. Moreover, we utilize masked knowledge modeling for general +knowledge graph representation learning, which can be applied to various +KG-based tasks including knowledge graph completion, question answering, and +recommendation. Experimental results on six datasets show that Relphormer can +obtain better performance compared with baselines. Code is available in +https://github.com/zjunlp/Relphormer. + +
+
+ comment: Neurocomputing 2023 +
+
+
+
+
+ + ♻ ☆ Machine-learning-accelerated simulations to enable automatic surface + reconstruction + + +
+ Understanding material surfaces and interfaces is vital in applications like +catalysis or electronics. By combining energies from electronic structure with +statistical mechanics, ab initio simulations can in principle predict the +structure of material surfaces as a function of thermodynamic variables. +However, accurate energy simulations are prohibitive when coupled to the vast +phase space that must be statistically sampled. Here, we present a bi-faceted +computational loop to predict surface phase diagrams of multi-component +materials that accelerates both the energy scoring and statistical sampling +methods. Fast, scalable, and data-efficient machine learning interatomic +potentials are trained on high-throughput density-functional theory +calculations through closed-loop active learning. Markov-chain Monte Carlo +sampling in the semi-grand canonical ensemble is enabled by using virtual +surface sites. The predicted surfaces for GaN(0001), Si(111), and SrTiO3(001) +are in agreement with past work and suggest that the proposed strategy can +model complex material surfaces and discover previously unreported surface +terminations. + +
+
+ comment: 30 pages main, 15 figures/tables, 5 pages supplementary +
+
+
+
+
+ + ♻ ☆ Differentially Private Optimizers Can Learn Adversarially Robust Models + + +
+ Machine learning models have shone in a variety of domains and attracted +increasing attention from both the security and the privacy communities. One +important yet worrying question is: Will training models under the differential +privacy (DP) constraint have an unfavorable impact on their adversarial +robustness? While previous works have postulated that privacy comes at the cost +of worse robustness, we give the first theoretical analysis to show that DP +models can indeed be robust and accurate, even sometimes more robust than their +naturally-trained non-private counterparts. We observe three key factors that +influence the privacy-robustness-accuracy tradeoff: (1) hyper-parameters for DP +optimizers are critical; (2) pre-training on public data significantly +mitigates the accuracy and robustness drop; (3) choice of DP optimizers makes a +difference. With these factors set properly, we achieve 90\% natural accuracy, +72\% robust accuracy ($+9\%$ than the non-private model) under $l_2(0.5)$ +attack, and 69\% robust accuracy ($+16\%$ than the non-private model) with +pre-trained SimCLRv2 model under $l_\infty(4/255)$ attack on CIFAR10 with +$\epsilon=2$. In fact, we show both theoretically and empirically that DP +models are Pareto optimal on the accuracy-robustness tradeoff. Empirically, the +robustness of DP models is consistently observed across various datasets and +models. We believe our encouraging results are a significant step towards +training models that are private as well as robust. + +
+
+
+
+
+ + ♻ ☆ Low Dimensional Invariant Embeddings for Universal Geometric Learning + + +
+ This paper studies separating invariants: mappings on $D$-dimensional domains +which are invariant to an appropriate group action, and which separate orbits. +The motivation for this study comes from the usefulness of separating +invariants in proving universality of equivariant neural network architectures. + We observe that in several cases the cardinality of separating invariants +proposed in the machine learning literature is much larger than the dimension +$D$. As a result, the theoretical universal constructions based on these +separating invariants are unrealistically large. Our goal in this paper is to +resolve this issue. + We show that when a continuous family of semi-algebraic separating invariants +is available, separation can be obtained by randomly selecting $2D+1$ of these +invariants. We apply this methodology to obtain an efficient scheme for +computing separating invariants for several classical group actions which have +been studied in the invariant learning literature. Examples include matrix +multiplication actions on point clouds by permutations, rotations, and various +other linear groups. + Often the requirement of invariant separation is relaxed and only generic +separation is required. In this case, we show that only $D+1$ invariants are +required. More importantly, generic invariants are often significantly easier +to compute, as we illustrate by discussing generic and full separation for +weighted graphs. Finally, we outline an approach for proving that separating +invariants can also be constructed when the random parameters have finite +precision. + +
+
+
+
+
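The paper above shows that 2D+1 randomly chosen members of a continuous family of separating invariants suffice. As a concrete illustration for the permutation action on point clouds, the sketch below draws 2D+1 random sort-projection invariants (project onto a random direction, sort, then take a random linear functional); this family is one of the classical constructions in this literature, and its use here is only an illustrative stand-in for the paper's general result.

```python
import numpy as np

def sort_projection_features(X, A, B):
    """Permutation-invariant features of a point cloud X (n points in R^k):
    project onto random directions A[:, i], sort the n projected values, and
    take the random linear functional B[:, i] of the sorted vector."""
    proj_sorted = np.sort(X @ A, axis=0)        # sorting removes the row order
    return np.einsum("nm,nm->m", proj_sorted, B)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n, k = 7, 3                        # 7 points in R^3, so D = n * k = 21
    m = 2 * n * k + 1                  # the 2D + 1 budget from the paper
    A, B = rng.normal(size=(k, m)), rng.normal(size=(n, m))

    X = rng.normal(size=(n, k))
    X_perm = X[rng.permutation(n)]     # same orbit under row permutations
    Y = rng.normal(size=(n, k))        # an unrelated point cloud

    f = lambda Z: sort_projection_features(Z, A, B)
    print(np.allclose(f(X), f(X_perm)))   # True: invariant to permutations
    print(np.allclose(f(X), f(Y)))        # False: different orbits are separated
```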
+ + ♻ ☆ Pairing-based graph neural network for simulating quantum materials + + +
+ We develop a pairing-based graph neural network for simulating quantum +many-body systems. Our architecture augments a BCS-type geminal wavefunction +with a generalized pair amplitude parameterized by a graph neural network. +Variational Monte Carlo with our neural network simultaneously provides an +accurate, flexible, and scalable method for simulating many-electron systems. +We apply this method to two-dimensional semiconductor electron-hole bilayers +and obtain accurate results on a variety of interaction-induced phases, +including the exciton Bose-Einstein condensate, electron-hole superconductor, +and bilayer Wigner crystal. Our study demonstrates the potential of +physically-motivated neural network wavefunctions for quantum materials +simulations. + +
+
+
+
+
+ + ♻ ☆ survex: an R package for explaining machine learning survival models + + +
+ Due to their flexibility and superior performance, machine learning models +frequently complement and outperform traditional statistical survival models. +However, their widespread adoption is hindered by a lack of user-friendly tools +to explain their internal operations and prediction rationales. To tackle this +issue, we introduce the survex R package, which provides a cohesive framework +for explaining any survival model by applying explainable artificial +intelligence techniques. The capabilities of the proposed software encompass +understanding and diagnosing survival models, which can lead to their +improvement. By revealing insights into the decision-making process, such as +variable effects and importances, survex enables the assessment of model +reliability and the detection of biases. Thus, transparency and responsibility +may be promoted in sensitive areas, such as biomedical research and healthcare +applications. + +
+
+
+
+
+ + ♻ ☆ Influencer Videos: Unboxing the Mystique + + +
+ Influencer marketing has become a very popular tool to reach customers. +Despite the rapid growth in influencer videos, there has been little research +on the effectiveness of their constituent features in explaining video +engagement. We study YouTube influencers and analyze their unstructured video +data across text, audio and images using an "interpretable deep learning" +framework that accomplishes both goals of prediction and interpretation. Our +prediction-based approach analyzes unstructured data and finds that "what is +said" in words (text) is more influential than "how it is said" in imagery +(images) or acoustics (audio). Our novel interpretation-based approach is +implemented after completion of model prediction by analyzing the same source +of unstructured data to measure importance attributed to the video features. We +eliminate several spurious relationships in two steps, identifying a subset of +relationships which are confirmed using theory. We uncover novel findings that +establish distinct associations for measures of shallow and deep engagement +based on the dual-system framework of human thinking. Our approach is validated +using simulated data, and we discuss the learnings from our findings for +influencers and brands. + +
+
+ comment: 45 pages, Online Appendix +
+
+
+
+
+ + ♻ ☆ Shortcut Learning in Deep Neural Networks + + +
+ Deep learning has triggered the current rise of artificial intelligence and +is the workhorse of today's machine intelligence. Numerous success stories have +rapidly spread all over science, industry and society, but its limitations have +only recently come into focus. In this perspective we seek to distill how many +of deep learning's problems can be seen as different symptoms of the same +underlying problem: shortcut learning. Shortcuts are decision rules that +perform well on standard benchmarks but fail to transfer to more challenging +testing conditions, such as real-world scenarios. Related issues are known in +Comparative Psychology, Education and Linguistics, suggesting that shortcut +learning may be a common characteristic of learning systems, biological and +artificial alike. Based on these observations, we develop a set of +recommendations for model interpretation and benchmarking, highlighting recent +advances in machine learning to improve robustness and transferability from the +lab to real-world applications. + +
+
+ comment: perspective article published at Nature Machine Intelligence + (https://doi.org/10.1038/s42256-020-00257-z) +
+
+
+
+
+ + ♻ ☆ Continual Learning: Applications and the Road Forward + + +
+ Continual learning is a sub-field of machine learning, which aims to allow +machine learning models to continuously learn on new data, by accumulating +knowledge without forgetting what was learned in the past. In this work, we +take a step back, and ask: "Why should one care about continual learning in the +first place?". We set the stage by surveying recent continual learning papers +published at three major machine learning conferences, and show that +memory-constrained settings dominate the field. Then, we discuss five open +problems in machine learning, and even though they seem unrelated to continual +learning at first sight, we show that continual learning will inevitably be +part of their solution. These problems are model-editing, personalization, +on-device learning, faster (re-)training and reinforcement learning. Finally, +by comparing the desiderata from these unsolved problems and the current +assumptions in continual learning, we highlight and discuss four future +directions for continual learning research. We hope that this work offers an +interesting perspective on the future of continual learning, while displaying +its potential value and the paths we have to pursue in order to make it +successful. This work is the result of the many discussions the authors had at +the Dagstuhl seminar on Deep Continual Learning, in March 2023. + +
+
+
+
+
+ + ♻ ☆ Multi-channel Speech Separation Using Spatially Selective Deep + Non-linear Filters + + +
+ In a multi-channel separation task with multiple speakers, we aim to recover +all individual speech signals from the mixture. In contrast to single-channel +approaches, which rely on the different spectro-temporal characteristics of the +speech signals, multi-channel approaches should additionally utilize the +different spatial locations of the sources for a more powerful separation +especially when the number of sources increases. To enhance the spatial +processing in a multi-channel source separation scenario, in this work, we +propose a deep neural network (DNN) based spatially selective filter (SSF) that +can be spatially steered to extract the speaker of interest by initializing a +recurrent neural network layer with the target direction. We compare the +proposed SSF with a common end-to-end direct separation (DS) approach trained +using utterance-wise permutation invariant training (PIT), which only +implicitly learns to perform spatial filtering. We show that the SSF has a +clear advantage over a DS approach with the same underlying network +architecture when there are more than two speakers in the mixture, which can be +attributed to a better use of the spatial information. Furthermore, we find +that the SSF generalizes much better to additional noise sources that were not +seen during training and to scenarios with speakers positioned at a similar +angle. + +
+
+ comment: Accepted version +
+
+
+
+
+ + ♻ ☆ Attending to Graph Transformers + + +
+ Recently, transformer architectures for graphs have emerged as an alternative to established techniques for machine learning with graphs, such as (message-passing) graph neural networks. So far, they have shown promising empirical results, e.g., on molecular prediction datasets, often attributed to their ability to circumvent graph neural networks' shortcomings, such as over-smoothing and over-squashing. Here, we derive a taxonomy of graph transformer architectures, bringing some order to this emerging field. We overview their theoretical properties, survey structural and positional encodings, and discuss extensions for important graph classes, e.g., 3D molecular graphs. Empirically, we probe how well graph transformers can recover various graph properties, how well they can deal with heterophilic graphs, and to what extent they prevent over-squashing. Further, we outline open challenges and research directions to stimulate future work. Our code is available at https://github.com/luis-mueller/probing-graph-transformers.
+
+
+
+
+ + ♻ ☆ Computing Approximate $\ell_p$ Sensitivities + + +
+ Recent works in dimensionality reduction for regression tasks have introduced the notion of sensitivity, an estimate of the importance of a specific datapoint in a dataset, offering provable guarantees on the quality of the approximation after removing low-sensitivity datapoints via subsampling. However, fast algorithms for approximating $\ell_p$ sensitivities, which we show is equivalent to approximate $\ell_p$ regression, are known only for the $\ell_2$ setting, in which they are termed leverage scores. In this work, we provide efficient algorithms for approximating $\ell_p$ sensitivities and related summary statistics of a given matrix. In particular, for a given $n \times d$ matrix, we compute an $\alpha$-approximation to its $\ell_1$ sensitivities at the cost of $O(n/\alpha)$ sensitivity computations. For estimating the total $\ell_p$ sensitivity (i.e., the sum of $\ell_p$ sensitivities), we provide an algorithm based on importance sampling of $\ell_p$ Lewis weights, which computes a constant-factor approximation to the total sensitivity at the cost of roughly $O(\sqrt{d})$ sensitivity computations. Furthermore, we estimate the maximum $\ell_1$ sensitivity, up to a $\sqrt{d}$ factor, using $O(d)$ sensitivity computations. We generalize all these results to $\ell_p$ norms for $p > 1$. Lastly, we experimentally show that for a wide class of matrices in real-world datasets, the total sensitivity can be quickly approximated and is significantly smaller than the theoretical prediction, demonstrating that real-world datasets have low intrinsic effective dimensionality.
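For intuition, the $\ell_2$ special case mentioned in the abstract is easy to compute exactly: the $\ell_2$ sensitivities are the leverage scores, obtainable from a thin QR factorization, and their sum is bounded by the column dimension $d$. The general $\ell_p$ algorithms of the paper rely on Lewis-weight sampling and are not reproduced here; the snippet below only illustrates the quantity being approximated.

```python
import numpy as np

def l2_sensitivities(A):
    """Exact ell_2 sensitivities (leverage scores) of the rows of A:
    sigma_i = max_x (a_i @ x)^2 / ||A x||_2^2, which for full-column-rank A
    equals ||q_i||_2^2 where A = Q R is a thin QR factorization."""
    Q, _ = np.linalg.qr(A)
    return np.sum(Q ** 2, axis=1)

rng = np.random.default_rng(0)
A = rng.normal(size=(1000, 10))
scores = l2_sensitivities(A)
print(scores.sum())   # equals d = 10 here: the total ell_2 sensitivity is at most d
```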
+
+
+
+
+ + ♻ ☆ Sample Efficient Reward Augmentation in offline-to-online Reinforcement + Learning + + +
+ Offline-to-online RL can make full use of pre-collected offline datasets to initialize policies, resulting in higher sample efficiency and better performance than using online algorithms alone for policy training. However, directly fine-tuning the pre-trained policy tends to result in sub-optimal performance. A primary reason is that conservative offline RL methods diminish the agent's capability for exploration, thereby impacting online fine-tuning performance. To encourage the agent's exploration during online fine-tuning and enhance the overall online fine-tuning performance, we propose a generalized reward augmentation method called Sample Efficient Reward Augmentation (SERA). Specifically, SERA encourages the agent to explore by computing a Q-conditioned entropy as an intrinsic reward. The advantage of SERA is that it can extensively utilize the offline pre-trained Q-function to encourage the agent to cover the state space uniformly while accounting for the imbalance between the distributions of high-value and low-value states. Additionally, SERA can be effortlessly plugged into various RL algorithms to improve online fine-tuning and ensure sustained asymptotic improvement. Moreover, extensive experimental results demonstrate that on offline-to-online problems, SERA consistently and effectively enhances the performance of various offline algorithms.
+
+ comment: 23 pages, 11 Figures, and 6 Tables +
+
+
+
+
+ + ♻ ☆ How Can We Train Deep Learning Models Across Clouds and Continents? An + Experimental Study + + +
+ Training deep learning models in the cloud or on dedicated hardware is expensive. A more cost-efficient option is hyperscale clouds offering spot instances, a cheap but ephemeral alternative to on-demand resources. As spot instance availability can change depending on the time of day, continent, and cloud provider, it could be more cost-efficient to distribute resources across the world. Still, it has not been investigated whether geo-distributed, data-parallel spot deep learning training could be a more cost-efficient alternative to centralized training. This paper aims to answer the question: Can deep learning models be cost-efficiently trained on a global market of spot VMs spanning different data centers and cloud providers? To provide guidance, we extensively evaluate the cost and throughput implications of training in different zones, continents, and clouds for representative CV, NLP, and ASR models. To expand the current training options further, we compare the scalability potential of hybrid-cloud scenarios by adding cloud resources to on-premise hardware to improve training throughput. Finally, we show how leveraging spot instance pricing enables a new cost-efficient way to train models with multiple cheap VMs, trumping both more centralized and powerful hardware and even on-demand cloud offerings at competitive prices.
+
+ comment: Currently in review. Artifacts and Code: + https://github.com/cirquit/hivemind-multi-cloud +
+
+
+
+
+ + ♻ ☆ Personalized Federated Learning with Multi-branch Architecture IJCNN 2023 + + +
+ Federated learning (FL) is a decentralized machine learning technique that enables multiple clients to collaboratively train models without requiring clients to reveal their raw data to each other. Although traditional FL trains a single global model with average performance among clients, statistical data heterogeneity across clients has resulted in the development of personalized FL (PFL), which trains personalized models with good performance on each client's data. A key challenge in PFL is how to facilitate clients with similar data to collaborate more in a situation where each client has data from a complex distribution and cannot determine one another's distribution. In this paper, we propose a new PFL method (pFedMB) using a multi-branch architecture, which achieves personalization by splitting each layer of a neural network into multiple branches and assigning client-specific weights to each branch. We also design an aggregation method to improve communication efficiency and model performance, in which each branch is globally updated by weighted averaging with the client-specific weights assigned to that branch. pFedMB is simple but effective in facilitating each client to share knowledge with similar clients by adjusting the weights assigned to each branch. We experimentally show that pFedMB performs better than state-of-the-art PFL methods on the CIFAR10 and CIFAR100 datasets.
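A minimal PyTorch sketch of the multi-branch idea described above: one layer is split into parallel branches whose outputs are mixed by client-specific weights. The softmax parameterization, sizes, and names are assumptions for illustration, not the authors' implementation; on the server, each branch would then be aggregated by weighted averaging with the clients' branch weights, as the abstract describes.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class MultiBranchLinear(nn.Module):
    """A linear layer split into parallel branches mixed by client-specific
    weights (illustrative sketch of a pFedMB-style layer)."""

    def __init__(self, in_features, out_features, num_branches=3):
        super().__init__()
        self.branches = nn.ModuleList(
            [nn.Linear(in_features, out_features) for _ in range(num_branches)]
        )
        # Client-specific mixing logits; each client keeps and trains its own copy.
        self.branch_logits = nn.Parameter(torch.zeros(num_branches))

    def forward(self, x):
        w = F.softmax(self.branch_logits, dim=0)            # client-specific weights
        outs = torch.stack([b(x) for b in self.branches])   # (branches, batch, out)
        return (w.view(-1, 1, 1) * outs).sum(dim=0)

layer = MultiBranchLinear(16, 8)
y = layer(torch.randn(4, 16))   # -> shape (4, 8)
```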
+
+ comment: Published at IJCNN 2023 +
+
+
+
+
+ + ♻ ☆ Approximating Two-Layer Feedforward Networks for Efficient Transformers EMNLP 2023 + + +
+ How to reduce compute and memory requirements of neural networks (NNs) +without sacrificing performance? Many recent works use sparse Mixtures of +Experts (MoEs) to build resource-efficient large language models (LMs). Here we +introduce several novel perspectives on MoEs, presenting a general framework +that unifies various methods to approximate two-layer NNs (e.g., feedforward +blocks of Transformers), including product-key memories (PKMs). Leveraging +insights from this framework, we propose methods to improve both MoEs and PKMs. +Unlike prior work that compares MoEs with dense baselines under the +compute-equal condition, our evaluation condition is parameter-equal, which is +crucial to properly evaluate LMs. We show that our MoEs are competitive with +the dense Transformer-XL on both the WikiText-103 and enwiki8 datasets at two +different scales, while being much more resource efficient. This demonstrates +that MoEs are relevant not only to extremely large LMs but also to any-scale +resource-efficient LMs. Our code is public. + +
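The kind of construction being unified here is easiest to see in code. Below is a generic top-k Mixture-of-Experts replacement for a Transformer feedforward block; the routing scheme, sizes, and names are illustrative and do not reproduce the paper's specific improvements to MoEs and PKMs.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class TopKMoEFeedForward(nn.Module):
    """Sparse MoE feedforward block: each expert is a small two-layer MLP and
    every token is routed to its top-k experts (generic sketch)."""

    def __init__(self, d_model, d_ff, num_experts=8, k=2):
        super().__init__()
        self.k = k
        self.router = nn.Linear(d_model, num_experts)
        self.experts = nn.ModuleList(
            [nn.Sequential(nn.Linear(d_model, d_ff), nn.ReLU(), nn.Linear(d_ff, d_model))
             for _ in range(num_experts)]
        )

    def forward(self, x):                        # x: (tokens, d_model)
        scores = self.router(x)                  # (tokens, num_experts)
        topv, topi = scores.topk(self.k, dim=-1)
        gates = F.softmax(topv, dim=-1)          # renormalize over the selected experts
        out = torch.zeros_like(x)
        for slot in range(self.k):
            for e, expert in enumerate(self.experts):
                mask = topi[:, slot] == e        # tokens whose slot-th choice is expert e
                if mask.any():
                    out[mask] += gates[mask, slot].unsqueeze(-1) * expert(x[mask])
        return out

moe = TopKMoEFeedForward(d_model=64, d_ff=256)
y = moe(torch.randn(10, 64))   # -> shape (10, 64)
```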
+
+ comment: Accepted to EMNLP 2023 Findings +
+
+
+
+
+ + ♻ ☆ Layer-wise Auto-Weighting for Non-Stationary Test-Time Adaptation WACV 2024 + + +
+ Given the inevitability of domain shifts during inference in real-world +applications, test-time adaptation (TTA) is essential for model adaptation +after deployment. However, the real-world scenario of continuously changing +target distributions presents challenges including catastrophic forgetting and +error accumulation. Existing TTA methods for non-stationary domain shifts, +while effective, incur excessive computational load, making them impractical +for on-device settings. In this paper, we introduce a layer-wise auto-weighting +algorithm for continual and gradual TTA that autonomously identifies layers for +preservation or concentrated adaptation. By leveraging the Fisher Information +Matrix (FIM), we first design the learning weight to selectively focus on +layers associated with log-likelihood changes while preserving unrelated ones. +Then, we further propose an exponential min-max scaler to make certain layers +nearly frozen while mitigating outliers. This minimizes forgetting and error +accumulation, leading to efficient adaptation to non-stationary target +distribution. Experiments on CIFAR-10C, CIFAR-100C, and ImageNet-C show our +method outperforms conventional continual and gradual TTA approaches while +significantly reducing computational load, highlighting the importance of +FIM-based learning weight in adapting to continuously or gradually shifting +target domains. + +
+
+ comment: Accepted to WACV 2024 +
+
+
+
+
+ + ♻ ☆ Active Bird2Vec: Towards End-to-End Bird Sound Monitoring with + Transformers ECAI2023 + + +
+ We propose a shift towards end-to-end learning in bird sound monitoring by +combining self-supervised (SSL) and deep active learning (DAL). Leveraging +transformer models, we aim to bypass traditional spectrogram conversions, +enabling direct raw audio processing. ActiveBird2Vec is set to generate +high-quality bird sound representations through SSL, potentially accelerating +the assessment of environmental changes and decision-making processes for wind +farms. Additionally, we seek to utilize the wide variety of bird vocalizations +through DAL, reducing the reliance on extensively labeled datasets by human +experts. We plan to curate a comprehensive set of tasks through Huggingface +Datasets, enhancing future comparability and reproducibility of bioacoustic +research. A comparative analysis between various transformer models will be +conducted to evaluate their proficiency in bird sound recognition tasks. We aim +to accelerate the progression of avian bioacoustic research and contribute to +more effective conservation strategies. + +
+
+ comment: Accepted @AI4S ECAI2023. This is the author's version of the work +
+
+
+
+
+ + ♻ ☆ Heterogeneous Domain Adaptation with Positive and Unlabeled Data + + +
+ Heterogeneous unsupervised domain adaptation (HUDA) is the most challenging +domain adaptation setting where the feature spaces of source and target domains +are heterogeneous, and the target domain has only unlabeled data. Existing HUDA +methods assume that both positive and negative examples are available in the +source domain, which may not be satisfied in some real applications. This paper +addresses a new challenging setting called positive and unlabeled heterogeneous +unsupervised domain adaptation (PU-HUDA), a HUDA setting where the source +domain only has positives. PU-HUDA can also be viewed as an extension of PU +learning where the positive and unlabeled examples are sampled from different +domains. A naive combination of existing HUDA and PU learning methods is +ineffective in PU-HUDA due to the gap in label distribution between the source +and target domains. To overcome this issue, we propose a novel method, +predictive adversarial domain adaptation (PADA), which can predict likely +positive examples from the unlabeled target data and simultaneously align the +feature spaces to reduce the distribution divergence between the whole source +data and the likely positive target data. PADA achieves this by a unified +adversarial training framework for learning a classifier to predict positive +examples and a feature transformer to transform the target feature space to +that of the source. Specifically, they are both trained to fool a common +discriminator that determines whether the likely positive examples are from the +target or source domain. We experimentally show that PADA outperforms several +baseline methods, such as the naive combination of HUDA and PU learning. + +
+
+ comment: Accepted by IEEE Big Data 2023 as a regular paper +
+
+
+
+
+ + ♻ ☆ Exploring the Trie of Rules: a fast data structure for the + representation of association rules + + +
+ Association rule mining techniques can generate a large volume of sequential data when implemented on transactional databases. Extracting insights from a large set of association rules has been found to be a challenging process. When examining a ruleset, the fundamental question is how to summarise and represent meaningful mined knowledge efficiently. Many algorithms and strategies have been developed to address the issue of knowledge extraction; however, the effectiveness of this process can be limited by the data structures used. A better data structure can significantly affect the speed of the knowledge extraction process. This paper proposes a novel data structure, called the Trie of rules, for storing a ruleset that is generated by association rule mining. The resulting data structure is a prefix-tree graph structure made of pre-mined rules. This graph stores the rules as paths within the prefix-tree in a way that similar rules overlay each other. Each node in the tree represents a rule in which the consequent is that node and the antecedent is the path from that node to the root of the tree. The evaluation showed that the proposed representation technique is promising. It compresses a ruleset with almost no data loss and benefits in terms of time for basic operations such as searching for a specific rule and sorting, which are the basis for many knowledge discovery methods. Moreover, our method demonstrated a significant improvement in traversal time, achieving an 8-fold speed-up compared to traditional data structures.
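A small Python sketch of the data structure as described: rules are stored as paths in a prefix tree, with antecedent items (in a canonical order) forming the path and the consequent attached at the terminal node, so rules that share antecedent prefixes overlay each other. The dictionary-based implementation and method names are simplifications for illustration, not the paper's code.

```python
class TrieOfRules:
    """Prefix tree over antecedent itemsets; consequents hang off the terminal
    node of each antecedent path together with a rule metric (e.g. confidence)."""

    def __init__(self):
        self.root = {}

    def insert(self, antecedent, consequent, confidence):
        node = self.root
        for item in sorted(antecedent):   # canonical order -> shared prefixes overlay
            node = node.setdefault(item, {})
        node.setdefault("_rules", {})[consequent] = confidence

    def lookup(self, antecedent):
        """Return {consequent: confidence} for rules with exactly this antecedent."""
        node = self.root
        for item in sorted(antecedent):
            if item not in node:
                return {}
            node = node[item]
        return node.get("_rules", {})

trie = TrieOfRules()
trie.insert({"bread", "butter"}, "milk", 0.8)
trie.insert({"bread"}, "butter", 0.6)
print(trie.lookup({"butter", "bread"}))   # {'milk': 0.8}
```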
+
+ comment: 12 pages, 13 figures, preprint of journal article +
+
+
+
+
+ + ♻ To Compress or Not to Compress - Self-Supervised Learning and Information Theory: A Review + +
+ Deep neural networks excel in supervised learning tasks but are constrained +by the need for extensive labeled data. Self-supervised learning emerges as a +promising alternative, allowing models to learn without explicit labels. +Information theory, and notably the information bottleneck principle, has been +pivotal in shaping deep neural networks. This principle focuses on optimizing +the trade-off between compression and preserving relevant information, +providing a foundation for efficient network design in supervised contexts. +However, its precise role and adaptation in self-supervised learning remain +unclear. In this work, we scrutinize various self-supervised learning +approaches from an information-theoretic perspective, introducing a unified +framework that encapsulates the \textit{self-supervised information-theoretic +learning problem}. We weave together existing research into a cohesive +narrative, delve into contemporary self-supervised methodologies, and spotlight +potential research avenues and inherent challenges. Additionally, we discuss +the empirical evaluation of information-theoretic quantities and their +estimation methods. Overall, this paper furnishes an exhaustive review of the +intersection of information theory, self-supervised learning, and deep neural +networks. + +
+
+
+
+
+ + ♻ ☆ Predict, Refine, Synthesize: Self-Guiding Diffusion Models for + Probabilistic Time Series Forecasting + + +
+ Diffusion models have achieved state-of-the-art performance in generative +modeling tasks across various domains. Prior works on time series diffusion +models have primarily focused on developing conditional models tailored to +specific forecasting or imputation tasks. In this work, we explore the +potential of task-agnostic, unconditional diffusion models for several time +series applications. We propose TSDiff, an unconditionally-trained diffusion +model for time series. Our proposed self-guidance mechanism enables +conditioning TSDiff for downstream tasks during inference, without requiring +auxiliary networks or altering the training procedure. We demonstrate the +effectiveness of our method on three different time series tasks: forecasting, +refinement, and synthetic data generation. First, we show that TSDiff is +competitive with several task-specific conditional forecasting methods +(predict). Second, we leverage the learned implicit probability density of +TSDiff to iteratively refine the predictions of base forecasters with reduced +computational overhead over reverse diffusion (refine). Notably, the generative +performance of the model remains intact -- downstream forecasters trained on +synthetic samples from TSDiff outperform forecasters that are trained on +samples from other state-of-the-art generative time series models, occasionally +even outperforming models trained on real data (synthesize). + +
+
+
+
+
+ + ♻ ☆ YolOOD: Utilizing Object Detection Concepts for Multi-Label + Out-of-Distribution Detection + + +
+ Out-of-distribution (OOD) detection has attracted a large amount of attention +from the machine learning research community in recent years due to its +importance in deployed systems. Most of the previous studies focused on the +detection of OOD samples in the multi-class classification task. However, OOD +detection in the multi-label classification task, a more common real-world use +case, remains an underexplored domain. In this research, we propose YolOOD - a +method that utilizes concepts from the object detection domain to perform OOD +detection in the multi-label classification task. Object detection models have +an inherent ability to distinguish between objects of interest +(in-distribution) and irrelevant objects (e.g., OOD objects) in images that +contain multiple objects belonging to different class categories. These +abilities allow us to convert a regular object detection model into an image +classifier with inherent OOD detection capabilities with just minor changes. We +compare our approach to state-of-the-art OOD detection methods and demonstrate +YolOOD's ability to outperform these methods on a comprehensive suite of +in-distribution and OOD benchmark datasets. + +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ LGL-BCI: A Lightweight Geometric Learning Framework for Motor + Imagery-Based Brain-Computer Interfaces + + +
+ Brain-Computer Interfaces (BCIs) are a groundbreaking technology for +interacting with external devices using brain signals. Despite advancements, +electroencephalogram (EEG)-based Motor Imagery (MI) tasks face challenges like +amplitude and phase variability, and complex spatial correlations, with a need +for smaller model size and faster inference. This study introduces the LGL-BCI +framework, employing a Geometric Deep Learning Framework for EEG processing in +non-Euclidean metric spaces, particularly the Symmetric Positive Definite (SPD) +Manifold space. LGL-BCI offers robust EEG data representation and captures +spatial correlations. We propose an EEG channel selection solution via a +feature decomposition algorithm to reduce SPD matrix dimensionality, with a +lossless transformation boosting inference speed. Extensive experiments show +LGL-BCI's superior accuracy and efficiency compared to current solutions, +highlighting geometric deep learning's potential in MI-BCI applications. The +efficiency, assessed on two public EEG datasets and two real-world EEG devices, +significantly outperforms the state-of-the-art solution in accuracy ($82.54\%$ +versus $62.22\%$) with fewer parameters (64.9M compared to 183.7M). + +
+
+
+
+
+ + ♻ ☆ An actor-critic algorithm with policy gradients to solve the job shop + scheduling problem using deep double recurrent agents + + +
+ There is a growing interest in integrating machine learning techniques and optimization to solve challenging optimization problems. In this work, we propose a deep reinforcement learning methodology for the job shop scheduling problem (JSSP). The aim is to build a greedy-like heuristic able to learn on some distribution of JSSP instances, differing in the number of jobs and machines. The need for fast scheduling methods is well known, and it arises in many areas, from transportation to healthcare. We model the JSSP as a Markov Decision Process and then exploit the efficacy of reinforcement learning to solve the problem. We adopt an actor-critic scheme, where the action taken by the agent is influenced by policy considerations on the state-value function. The procedures are adapted to take into account the challenging nature of the JSSP, where the state and the action space change not only for every instance but also after each decision. To tackle the variability in the number of jobs and operations in the input, we modeled the agent using two incident LSTM models, a special type of deep neural network. Experiments show that the algorithm reaches good solutions in a short time, showing that it is possible to generate new greedy heuristics just from learning-based methodologies. Benchmarks have been generated in comparison with the commercial solver CPLEX. As expected, the model can generalize, to some extent, to larger problems or to instances originating from a distribution different from the one used in training.
+
+
+
+
+ + ♻ ☆ Langevin dynamics based algorithm e-TH$\varepsilon$O POULA for + stochastic optimization problems with discontinuous stochastic gradient + + +
+ We introduce a new Langevin dynamics based algorithm, called +e-TH$\varepsilon$O POULA, to solve optimization problems with discontinuous +stochastic gradients which naturally appear in real-world applications such as +quantile estimation, vector quantization, CVaR minimization, and regularized +optimization problems involving ReLU neural networks. We demonstrate both +theoretically and numerically the applicability of the e-TH$\varepsilon$O POULA +algorithm. More precisely, under the conditions that the stochastic gradient is +locally Lipschitz in average and satisfies a certain convexity at infinity +condition, we establish non-asymptotic error bounds for e-TH$\varepsilon$O +POULA in Wasserstein distances and provide a non-asymptotic estimate for the +expected excess risk, which can be controlled to be arbitrarily small. Three +key applications in finance and insurance are provided, namely, multi-period +portfolio optimization, transfer learning in multi-period portfolio +optimization, and insurance claim prediction, which involve neural networks +with (Leaky)-ReLU activation functions. Numerical experiments conducted using +real-world datasets illustrate the superior empirical performance of +e-TH$\varepsilon$O POULA compared to SGLD, TUSLA, ADAM, and AMSGrad in terms of +model accuracy. + +
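To ground the discussion, the sketch below shows one step of a generic tamed Langevin update of the kind this family of algorithms builds on, applied to a quantile-estimation toy problem whose stochastic subgradient is discontinuous. The taming rule, step sizes, and names are illustrative; the actual e-TH$\varepsilon$O POULA update differs in the precise form of its drift correction.

```python
import numpy as np

def tamed_langevin_step(theta, stochastic_grad, step_size, beta, rng):
    """One update of a tamed unadjusted Langevin scheme:
        theta <- theta - lam * g / (1 + lam * ||g||) + sqrt(2 * lam / beta) * xi,
    with g a stochastic (sub)gradient sample and xi standard Gaussian noise.
    The taming keeps the drift bounded even for irregular gradients."""
    g = stochastic_grad(theta)
    drift = step_size * g / (1.0 + step_size * np.linalg.norm(g))
    noise = np.sqrt(2.0 * step_size / beta) * rng.normal(size=theta.shape)
    return theta - drift + noise

# Toy quantile (median) estimation: minimize E|theta - X| with X ~ N(1, 0.5);
# the stochastic subgradient sign(theta - X) is discontinuous.
rng = np.random.default_rng(0)
theta = np.array([5.0])
subgrad = lambda th: np.sign(th - rng.normal(loc=1.0, scale=0.5))
for _ in range(5000):
    theta = tamed_langevin_step(theta, subgrad, step_size=1e-2, beta=1e4, rng=rng)
print(theta)   # close to the median, 1.0
```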
+
+
+
+
+ + ♻ ☆ Is TinyML Sustainable? Assessing the Environmental Impacts of Machine + Learning on Microcontrollers + + +
+ The sustained growth of carbon emissions and global waste elicits significant +sustainability concerns for our environment's future. The growing Internet of +Things (IoT) has the potential to exacerbate this issue. However, an emerging +area known as Tiny Machine Learning (TinyML) has the opportunity to help +address these environmental challenges through sustainable computing practices. +TinyML, the deployment of machine learning (ML) algorithms onto low-cost, +low-power microcontroller systems, enables on-device sensor analytics that +unlocks numerous always-on ML applications. This article discusses both the +potential of these TinyML applications to address critical sustainability +challenges, as well as the environmental footprint of this emerging technology. +Through a complete life cycle analysis (LCA), we find that TinyML systems +present opportunities to offset their carbon emissions by enabling applications +that reduce the emissions of other sectors. Nevertheless, when globally scaled, +the carbon footprint of TinyML systems is not negligible, necessitating that +designers factor in environmental impact when formulating new devices. Finally, +we outline research directions to enable further sustainable contributions of +TinyML. + +
+
+ comment: Communications of the ACM (CACM) November 2023 Issue +
+
+
+
+
+ + ♻ ☆ Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation + + +
+ One primary topic of multi-modal learning is to jointly incorporate heterogeneous information from different modalities. However, most models often suffer from unsatisfactory multi-modal cooperation, in which not all modalities are jointly utilized well. Some methods have been proposed to identify and enhance the worse-learnt modality, but they can rarely provide fine-grained, theoretically supported observations of multi-modal cooperation at the sample level. Hence, it is essential to reasonably observe and improve the fine-grained cooperation between modalities, especially when facing realistic scenarios where the modality discrepancy could vary across different samples. To this end, we introduce a fine-grained modality valuation metric to evaluate the contribution of each modality at the sample level. Via modality valuation, we observe that the multi-modal model tends to rely on one specific modality, resulting in other modalities being low-contributing. We further analyze this issue and improve cooperation between modalities by enhancing the discriminative ability of low-contributing modalities in a targeted manner. Overall, our method reasonably observes the fine-grained uni-modal contribution at the sample level and achieves considerable improvement on different multi-modal models.
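As a rough illustration of what a sample-level modality contribution signal looks like, the sketch below uses a simple leave-one-modality-out ablation: the contribution of a modality for a sample is how much the loss degrades when that modality is zeroed out. This is a simplified proxy, not the valuation metric proposed in the paper, and all module and variable names are made up for the example.

```python
import torch
import torch.nn as nn

def leave_one_modality_out_valuation(model, audio, video, label, loss_fn):
    """Per-sample proxy for modality contribution: loss increase when a
    modality is ablated (zeroed) for this sample."""
    with torch.no_grad():
        base = loss_fn(model(audio, video), label)
        no_audio = loss_fn(model(torch.zeros_like(audio), video), label)
        no_video = loss_fn(model(audio, torch.zeros_like(video)), label)
    return {"audio": (no_audio - base).item(), "video": (no_video - base).item()}

class TinyFusion(nn.Module):
    # Minimal two-stream fusion model used only to make the example runnable.
    def __init__(self):
        super().__init__()
        self.fa, self.fv = nn.Linear(8, 4), nn.Linear(8, 4)
        self.head = nn.Linear(8, 2)
    def forward(self, a, v):
        return self.head(torch.cat([self.fa(a), self.fv(v)], dim=-1))

model, loss_fn = TinyFusion(), nn.CrossEntropyLoss()
vals = leave_one_modality_out_valuation(
    model, torch.randn(1, 8), torch.randn(1, 8), torch.tensor([1]), loss_fn)
print(vals)   # larger value -> that modality contributes more for this sample
```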
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ Bayesian Methods for Media Mix Modelling with shape and funnel effects + + +
+ In recent years, significant progress in generative AI has highlighted the important role of physics-inspired models that utilize advanced mathematical concepts based on fundamental physics principles to enhance artificial intelligence capabilities. Among these models, those based on diffusion equations have greatly improved image quality. This study aims to explore the potential uses of the Maxwell-Boltzmann equation, which forms the basis of the kinetic theory of gases, and the Michaelis-Menten model in Marketing Mix Modelling (MMM) applications. We propose incorporating these equations into Hierarchical Bayesian models to analyse consumer behaviour in the context of advertising. These equations excel at accurately describing the random dynamics in complex systems such as social interactions and consumer-advertising interactions.
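For readers unfamiliar with the Michaelis-Menten form, the snippet below shows how it yields a saturating shape effect for media spend, which is the role such curves typically play inside a hierarchical Bayesian MMM; the parameter names and values are illustrative, and the full Bayesian model from the paper is not reproduced.

```python
import numpy as np

def michaelis_menten_response(spend, v_max, k_half):
    """Saturating response curve: roughly linear for small spend and
    approaching v_max as spend grows, i.e. diminishing returns."""
    return v_max * spend / (k_half + spend)

spend = np.linspace(0.0, 100.0, 5)
print(michaelis_menten_response(spend, v_max=50.0, k_half=20.0))
# [ 0.    27.78  35.71  39.47  41.67]  (approximately)
```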
+
+
+
+
+ + ♻ ☆ Feature Engineering with Regularity Structures + + +
+ We investigate the use of models from the theory of regularity structures as +features in machine learning tasks. A model is a polynomial function of a +space-time signal designed to well-approximate solutions to partial +differential equations (PDEs), even in low regularity regimes. Models can be +seen as natural multi-dimensional generalisations of signatures of paths; our +work therefore aims to extend the recent use of signatures in data science +beyond the context of time-ordered data. We provide a flexible definition of a +model feature vector associated to a space-time signal, along with two +algorithms which illustrate ways in which these features can be combined with +linear regression. We apply these algorithms in several numerical experiments +designed to learn solutions to PDEs with a given forcing and boundary data. Our +experiments include semi-linear parabolic and wave equations with forcing, and +Burgers' equation with no forcing. We find an advantage in favour of our +algorithms when compared to several alternative methods. Additionally, in the +experiment with Burgers' equation, we find non-trivial predictive power when +noise is added to the observations. + +
+
+ comment: 33 pages, 7 figures, 7 tables. Improved presentation of model feature + vector (Section 2) and experiments (Section 3). Added new experiment in 2D + spatial domain (Section 3.1.2). To appear in Journal of Scientific Computing +
+
+
+
+
+ + ♻ ☆ PyDaddy: A Python package for discovering stochastic dynamical equations + from timeseries data + + +
+ Stochastic differential equations (SDEs) are an important framework for modelling dynamics with randomness, as is common in most biological systems. The inverse problem of integrating these models with empirical data remains a major challenge. Here, we present a software package, PyDaddy (Python Library for Data Driven Dynamics), that takes time series data as input and outputs an interpretable SDE. We achieve this by combining traditional approaches from the stochastic calculus literature with state-of-the-art equation discovery techniques. We validate our approach on synthetic datasets, and demonstrate the generality and applicability of the method on two real-world datasets of vastly different spatiotemporal scales: (i) the collective movement of a fish school, where stochasticity plays a crucial role, and (ii) the confined migration of a single cell, primarily following a relaxed oscillation. We make the method available as an easy-to-use, open-source Python package, PyDaddy.
+
+ comment: 15 pages (+ 9 page appendix), 6 figures (+ 8 appendix figures) +
+
+
+
+
+ + ♻ ☆ Composite Score for Anomaly Detection in Imbalanced Real-World + Industrial Dataset + + +
+ In recent years, the industrial sector has evolved towards its fourth revolution. The quality control domain is particularly interested in advanced machine learning for computer vision anomaly detection. Nevertheless, several challenges have to be faced, including imbalanced datasets, image complexity, and the zero-false-negative (ZFN) constraint to guarantee the high-quality requirement. This paper illustrates a use case for an industrial partner, where Printed Circuit Board Assembly (PCBA) images are first reconstructed with a Vector Quantized Generative Adversarial Network (VQGAN) trained on normal products. Then, several multi-level metrics are extracted on a few normal and abnormal images, highlighting anomalies through reconstruction differences. Finally, a classifier is trained to build a composite anomaly score from the extracted metrics. This three-step approach is performed on the public MVTec-AD dataset and on the partner's PCBA dataset, where it achieves a regular accuracy of 95.69% and 87.93% under the ZFN constraint.
+
+ comment: This version of the article has been accepted for publication, after + peer review and is subject to Springer Nature AM terms of use, but is not the + Version of Record and does not reflect post-acceptance improvements, or any + corrections. The Version of Record is available online at: + https://doi.org/10.1007/s10994-023-06415-9 +
+
+
+
+
+ + ♻ ☆ Unsupervised discovery of Interpretable Visual Concepts + + +
+ Providing interpretability of deep-learning models to non-experts, while fundamental for responsible real-world usage, is challenging. Attribution maps from xAI techniques, such as Integrated Gradients, are a typical example of a visualization technique containing a high level of information, but with difficult interpretation. In this paper, we propose two methods, Maximum Activation Groups Extraction (MAGE) and Multiscale Interpretable Visualization (Ms-IV), to explain the model's decision, enhancing global interpretability. MAGE finds, for a given CNN, combinations of features which, globally, form a semantic meaning, which we call concepts. We group these similar feature patterns by clustering them into ``concepts'', which we visualize through Ms-IV. This last method is inspired by Occlusion and Sensitivity analysis (incorporating causality), and uses a novel metric, called Class-aware Order Correlation (CaOC), to globally evaluate the most important image regions according to the model's decision space. We compare our approach to xAI methods such as LIME and Integrated Gradients. Experimental results show that Ms-IV achieves higher localization and faithfulness values. Finally, a qualitative evaluation of combined MAGE and Ms-IV demonstrates humans' ability to agree, based on the visualization, with the decisions suggested by the clusters' concepts, and to detect, among a given set of networks, the existence of bias.
+
+
+
+
+ + ♻ ☆ Empirical Risk Minimization with Relative Entropy Regularization + + +
+ The empirical risk minimization (ERM) problem with relative entropy regularization (ERM-RER) is investigated under the assumption that the reference measure is a $\sigma$-finite measure, and not necessarily a probability measure. Under this assumption, which leads to a generalization of the ERM-RER problem allowing a larger degree of flexibility for incorporating prior knowledge, numerous relevant properties are stated. Among these properties, the solution to this problem, if it exists, is shown to be a unique probability measure, often mutually absolutely continuous with the reference measure. Such a solution exhibits a probably-approximately-correct guarantee for the ERM problem independently of whether the latter possesses a solution. For a fixed dataset, the empirical risk is shown to be a sub-Gaussian random variable when the models are sampled from the solution to the ERM-RER problem. The generalization capabilities of the solution to the ERM-RER problem (the Gibbs algorithm) are studied via the sensitivity of the expected empirical risk to deviations from such a solution towards alternative probability measures. Finally, an interesting connection between sensitivity, generalization error, and lautum information is established.
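For orientation, the unique solution referred to in the abstract is a Gibbs measure; in standard notation (which may differ from the paper's exact symbols and normalization) it can be written as:

```latex
% Gibbs-measure form of the ERM-RER solution (notation illustrative):
% Q is the sigma-finite reference measure, L_z(theta) the empirical risk on the
% dataset z, and lambda > 0 the regularization parameter.
\frac{\mathrm{d}P^{\star}}{\mathrm{d}Q}(\theta)
  = \frac{\exp\!\left(-\tfrac{1}{\lambda}\, L_{z}(\theta)\right)}
         {\displaystyle\int \exp\!\left(-\tfrac{1}{\lambda}\, L_{z}(\nu)\right)\mathrm{d}Q(\nu)}
```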
+
comment: Submitted to the Transactions on Information Theory on June 12, 2023. Also available as: Research Report, INRIA, No. RR-9454, Centre Inria d'Université Côte d'Azur, Sophia Antipolis, France, Feb. 2022. This version contains the revision for the Transactions on Information Theory of November 21, 2023.
+
+
+
+
+ + ♻ ☆ One-shot backpropagation for multi-step prediction in physics-based + system identification -- EXTENDED VERSION + + +
+ The aim of this paper is to present a novel physics-based framework for the +identification of dynamical systems, in which the physical and structural +insights are reflected directly into a backpropagation-based learning +algorithm. The main result is a method to compute in closed form the gradient +of a multi-step loss function, while enforcing physical properties and +constraints. The derived algorithm has been exploited to identify the unknown +inertia matrix of a space debris, and the results show the reliability of the +method in capturing the physical adherence of the estimated parameters. + +
+
+
+
+
+ + ♻ ☆ Personas as a Way to Model Truthfulness in Language Models + + +
+ Large Language Models (LLMs) are trained on vast amounts of text from the +internet, which contains both factual and misleading information about the +world. Can language models discern truth from falsehood in this contradicting +data? Expanding on the view that LLMs can model different communicative agents, +we present the persona hypothesis: LLMs can cluster agents into personas using +common features of their generations. For instance, a truthful persona is a +group of agents that are likely to produce truthful text and that share similar +features like formal writing styles and scientific references. By modeling this +persona, LLMs can generalize truthfulness beyond the specific contexts in which +each agent generated the training text. For example, the model can infer that +the agent ``Wikipedia'' will behave truthfully on topics that were only +generated by ``Science'' because they both belong to the truthful persona. We +show evidence for the persona hypothesis via two observations: (1) we can probe +whether a model's answer will be truthful before it is generated; (2) +finetuning a model on a set of facts improves its truthfulness on unseen +topics. Next, using arithmetics as a synthetic environment, we show that +language models can separate true and false statements, and generalize +truthfulness across agents; but only if agents in the training data share a +truthful generative process that enables the creation of a truthful persona. +Overall, our findings suggest that models can exploit hierarchical structures +in the data to learn abstract concepts like truthfulness. + +
+
+
+
+
+ + ♻ ☆ On Counterfactual Data Augmentation Under Confounding + + +
+ Counterfactual data augmentation has recently emerged as a method to mitigate +confounding biases in the training data. These biases, such as spurious +correlations, arise due to various observed and unobserved confounding +variables in the data generation process. In this paper, we formally analyze +how confounding biases impact downstream classifiers and present a causal +viewpoint to the solutions based on counterfactual data augmentation. We +explore how removing confounding biases serves as a means to learn invariant +features, ultimately aiding in generalization beyond the observed data +distribution. Additionally, we present a straightforward yet powerful algorithm +for generating counterfactual images, which effectively mitigates the influence +of confounding effects on downstream classifiers. Through experiments on MNIST +variants and the CelebA datasets, we demonstrate how our simple augmentation +method helps existing state-of-the-art methods achieve good results. + +
+
+
+
+
+ + ♻ ☆ Beyond Labeling Oracles: What does it mean to steal ML models? + + +
+ Model extraction attacks are designed to steal trained models with only query +access, as is often provided through APIs that ML-as-a-Service providers offer. +ML models are expensive to train, in part because data is hard to obtain, and a +primary incentive for model extraction is to acquire a model while incurring +less cost than training from scratch. Literature on model extraction commonly +claims or presumes that the attacker is able to save on both data acquisition +and labeling costs. We show that the attacker often does not. This is because +current attacks implicitly rely on the adversary being able to sample from the +victim model's data distribution. We thoroughly evaluate factors influencing +the success of model extraction. We discover that prior knowledge of the +attacker, i.e. access to in-distribution data, dominates other factors like the +attack policy the adversary follows to choose which queries to make to the +victim model API. Thus, an adversary looking to develop an equally capable +model with a fixed budget has little practical incentive to perform model +extraction, since for the attack to work they need to collect in-distribution +data, saving only on the cost of labeling. With low labeling costs in the +current market, the usefulness of such attacks is questionable. Ultimately, we +demonstrate that the effect of prior knowledge needs to be explicitly decoupled +from the attack policy. To this end, we propose a benchmark to evaluate attack +policy directly. + +
+
+
+
+
+ + ♻ ☆ Self supervised convolutional kernel based handcrafted feature + harmonization: Enhanced left ventricle hypertension disease phenotyping on + echocardiography + + +
+ Radiomics, a medical imaging technique, extracts quantitative handcrafted features from images to predict diseases. Harmonization of these features ensures consistent feature extraction across various imaging devices and protocols. Methods for harmonization include standardized imaging protocols, statistical adjustments, and evaluating feature robustness. Myocardial diseases such as Left Ventricular Hypertrophy (LVH) and Hypertensive Heart Disease (HHD) are diagnosed via echocardiography, but variable imaging settings pose challenges. Harmonization techniques are crucial for applying handcrafted features to disease diagnosis in such scenarios. Self-supervised learning (SSL) enhances data understanding within limited datasets and adapts to diverse data settings. ConvNeXt-V2 integrates convolutional layers into SSL, displaying superior performance in various tasks. This study focuses on convolutional filters within SSL, using them as a preprocessing step to convert images into feature maps for handcrafted feature harmonization. Our proposed method excelled in the harmonization evaluation and exhibited superior LVH classification performance compared to existing methods.
+
+ comment: 11 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Interpreting Black-box Machine Learning Models for High Dimensional + Datasets + + +
+ Deep neural networks (DNNs) have been shown to outperform traditional machine learning algorithms in a broad variety of application domains due to their effectiveness in modeling complex problems and handling high-dimensional datasets. Many real-life datasets, however, are of increasingly high dimensionality, where a large number of features may be irrelevant for both supervised and unsupervised learning tasks. The inclusion of such features would not only introduce unwanted noise but also increase computational complexity. Furthermore, due to high non-linearity and dependency among a large number of features, DNN models tend to be unavoidably opaque and are perceived as black-box methods because their internal functioning is not well understood. Their algorithmic complexity is often simply beyond the capacity of humans to understand the interplay among myriads of hyperparameters. A well-interpretable model can identify statistically significant features and explain the way they affect the model's outcome. In this paper, we propose an efficient method to improve the interpretability of black-box models for classification tasks in the case of high-dimensional datasets. First, we train a black-box model on a high-dimensional dataset to learn the embeddings on which the classification is performed. To decompose the inner working principles of the black-box model and to identify the top-k important features, we employ different probing and perturbing techniques. We then approximate the behavior of the black-box model by means of an interpretable surrogate model on the top-k feature space. Finally, we derive decision rules and local explanations from the surrogate model to explain individual decisions. Our approach outperforms state-of-the-art methods such as TabNet and XGBoost when tested on datasets with dimensionality ranging from 50 to 20,000, in terms of both predictive metrics and explainability.
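A minimal scikit-learn sketch of the overall pipeline described above: train a black-box model, identify the top-k features by perturbation, fit an interpretable surrogate on that reduced feature space, and read off decision rules. The specific estimators, the permutation-importance probe, and the synthetic data are stand-ins; the paper's embedding step and its particular probing/perturbing techniques are simplified away.

```python
import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.inspection import permutation_importance
from sklearn.tree import DecisionTreeClassifier, export_text

X, y = make_classification(n_samples=2000, n_features=200, n_informative=10,
                           random_state=0)
black_box = GradientBoostingClassifier(random_state=0).fit(X, y)

# 1) Identify the top-k important features by perturbing inputs.
imp = permutation_importance(black_box, X, y, n_repeats=5, random_state=0)
top_k = np.argsort(imp.importances_mean)[::-1][:10]

# 2) Approximate the black box with an interpretable surrogate on that subspace,
#    fitting it to the black box's own predictions.
surrogate = DecisionTreeClassifier(max_depth=3, random_state=0)
surrogate.fit(X[:, top_k], black_box.predict(X))

# 3) Read decision rules off the surrogate.
print(export_text(surrogate, feature_names=[f"f{i}" for i in top_k]))
```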
+
+ comment: This paper is currently under review in a journal +
+
+
+
+
+ + ♻ ☆ Tensor Train for Global Optimization Problems in Robotics + + +
+ The convergence of many numerical optimization techniques is highly dependent +on the initial guess given to the solver. To address this issue, we propose a +novel approach that utilizes tensor methods to initialize existing optimization +solvers near global optima. Our method does not require access to a database of +good solutions. We first transform the cost function, which depends on both +task parameters and optimization variables, into a probability density +function. Unlike existing approaches, the joint probability distribution of the +task parameters and optimization variables is approximated using the Tensor +Train model, which enables efficient conditioning and sampling. We treat the +task parameters as random variables, and for a given task, we generate samples +for decision variables from the conditional distribution to initialize the +optimization solver. Our method can produce multiple solutions (when they +exist) faster than existing methods. We first evaluate the approach on +benchmark functions for numerical optimization that are hard to solve using +gradient-based optimization solvers with a naive initialization. The results +show that the proposed method can generate samples close to global optima and +from multiple modes. We then demonstrate the generality and relevance of our +framework to robotics by applying it to inverse kinematics with obstacles and +motion planning problems with a 7-DoF manipulator. + +
+
+ comment: 25 pages, 21 figures +
+
+
+
+
+ + ♻ ☆ A Data-Free Approach to Mitigate Catastrophic Forgetting in Federated + Class Incremental Learning for Vision Tasks NeurIPS 2023 + + +
+ Deep learning models often suffer from forgetting previously learned +information when trained on new data. This problem is exacerbated in federated +learning (FL), where the data is distributed and can change independently for +each user. Many solutions are proposed to resolve this catastrophic forgetting +in a centralized setting. However, they do not apply directly to FL because of +its unique complexities, such as privacy concerns and resource limitations. To +overcome these challenges, this paper presents a framework for +$\textbf{federated class incremental learning}$ that utilizes a generative +model to synthesize samples from past distributions. This data can be later +exploited alongside the training data to mitigate catastrophic forgetting. To +preserve privacy, the generative model is trained on the server using data-free +methods at the end of each task without requesting data from clients. Moreover, +our solution does not demand the users to store old data or models, which gives +them the freedom to join/leave the training at any time. Additionally, we +introduce SuperImageNet, a new regrouping of the ImageNet dataset specifically +tailored for federated continual learning. We demonstrate significant +improvements compared to existing baselines through extensive experiments on +multiple datasets. + +
+
+ comment: Accepted in NeurIPS 2023. arXiv admin note: text overlap with + arXiv:2307.00497 +
+
+
+
+
+ + ♻ ☆ ROOT-SGD: Sharp Nonasymptotics and Asymptotic Efficiency in a Single + Algorithm COLT 2022 + + +
+ We study the problem of solving strongly convex and smooth unconstrained +optimization problems using stochastic first-order algorithms. We devise a +novel algorithm, referred to as \emph{Recursive One-Over-T SGD} (\ROOTSGD), +based on an easily implementable, recursive averaging of past stochastic +gradients. We prove that it simultaneously achieves state-of-the-art +performance in both a finite-sample, nonasymptotic sense and an asymptotic +sense. On the nonasymptotic side, we prove risk bounds on the last iterate of +\ROOTSGD with leading-order terms that match the optimal statistical risk with +a unity pre-factor, along with a higher-order term that scales at the sharp +rate of $O(n^{-3/2})$ under the Lipschitz condition on the Hessian matrix. On +the asymptotic side, we show that when a mild, one-point Hessian continuity +condition is imposed, the rescaled last iterate of (multi-epoch) \ROOTSGD +converges asymptotically to a Gaussian limit with the Cram\'{e}r-Rao optimal +asymptotic covariance, for a broad range of step-size choices. + +
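A hedged sketch of the recursive one-over-t averaging idea on a strongly convex toy problem is given below; the recursion shown is a standard way to write such a variance-reduced estimator and may not match the paper's exact update or step-size schedule, and all names are illustrative.

```python
import numpy as np

def root_sgd_like(grad_sample, x0, step_size, num_steps, rng):
    """Recursive one-over-t averaging of stochastic gradients:
        v_t = g_t(x_t) + (1 - 1/t) * (v_{t-1} - g_t(x_{t-1}))
        x_{t+1} = x_t - eta * v_t
    where g_t reuses the same data sample at the current and previous iterate."""
    x_prev = x = np.array(x0, dtype=float)
    v = np.zeros_like(x)
    for t in range(1, num_steps + 1):
        sample = rng.normal(loc=1.0, scale=1.0, size=x.shape)  # data for this step
        g_curr = grad_sample(x, sample)
        g_prev = grad_sample(x_prev, sample)
        v = g_curr + (1.0 - 1.0 / t) * (v - g_prev)
        x_prev, x = x, x - step_size * v
    return x

# Strongly convex toy problem: minimize E[(x - s)^2 / 2] with s ~ N(1, 1).
grad = lambda x, s: x - s
rng = np.random.default_rng(0)
print(root_sgd_like(grad, x0=[5.0], step_size=0.1, num_steps=2000, rng=rng))  # ~[1.0]
```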
+
+ comment: Camera Ready, COLT 2022 +
+
+
+
+
+ + ♻ ☆ Towards Data-Algorithm Dependent Generalization: a Case Study on + Overparameterized Linear Regression + + +
+ One of the major open problems in machine learning is to characterize generalization in the overparameterized regime, where most traditional generalization bounds become inconsistent even for overparameterized linear regression. In many scenarios, this failure can be attributed to obscuring the crucial interplay between the training algorithm and the underlying data distribution. This paper demonstrates that the generalization behavior of overparameterized models should be analyzed in both a data-relevant and an algorithm-relevant manner. To make a formal characterization, we introduce a notion called data-algorithm compatibility, which considers the generalization behavior of the entire data-dependent training trajectory, instead of the traditional last-iterate analysis. We validate our claim by studying the setting of solving overparameterized linear regression with gradient descent. Specifically, we perform a data-dependent trajectory analysis and derive a sufficient condition for compatibility in such a setting. Our theoretical results demonstrate that if we take early-stopping iterates into consideration, generalization can hold with significantly weaker restrictions on the problem instance than in the previous last-iterate analysis.
+
+
+
+
+ + ♻ ☆ Digital Twin Assisted Deep Reinforcement Learning for Online Admission + Control in Sliced Network + + +
+ The proliferation of diverse wireless services in 5G and beyond has led to +the emergence of network slicing technologies. Among these, admission control +plays a crucial role in achieving service-oriented optimization goals through +the selective acceptance of service requests. Although deep reinforcement +learning (DRL) forms the foundation in many admission control approaches thanks +to its effectiveness and flexibility, initial instability with excessive +convergence delay of DRL models hinders their deployment in real-world +networks. We propose a digital twin (DT) accelerated DRL solution to address +this issue. Specifically, we first formulate the admission decision-making +process as a semi-Markov decision process, which is subsequently simplified +into an equivalent discrete-time Markov decision process to facilitate the +implementation of DRL methods. A neural network-based DT is established with a +customized output layer for queuing systems, trained through supervised +learning, and then employed to assist the training phase of the DRL model. +Extensive simulations show that the DT-accelerated DRL improves resource +utilization by over 40% compared to the directly trained state-of-the-art +dueling deep Q-learning model. This improvement is achieved while preserving +the model's capability to optimize the long-term rewards of the admission +process. + +
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ LASER: A Neuro-Symbolic Framework for Learning Spatial-Temporal Scene + Graphs with Weak Supervision + + +
+ We propose LASER, a neuro-symbolic approach to learn semantic video +representations that capture rich spatial and temporal properties in video data +by leveraging high-level logic specifications. In particular, we formulate the +problem in terms of alignment between raw videos and spatio-temporal logic +specifications. The alignment algorithm leverages a differentiable symbolic +reasoner and a combination of contrastive, temporal, and semantics losses. It +effectively and efficiently trains low-level perception models to extract +fine-grained video representation in the form of a spatio-temporal scene graph +that conforms to the desired high-level specification. In doing so, we explore +a novel methodology that weakly supervises the learning of video semantic +representations through logic specifications. We evaluate our method on two +datasets with rich spatial and temporal specifications: +20BN-Something-Something and MUGEN. We demonstrate that our method learns +better fine-grained video semantics than existing baselines. + +
+
+
+
+
+ + ♻ ☆ Long-term Causal Effects Estimation via Latent Surrogates Representation + Learning + + +
+ Estimating long-term causal effects based on short-term surrogates is a significant but challenging problem in many real-world applications, e.g., marketing and medicine. Despite their success in certain domains, most existing methods estimate causal effects in an idealistic and simplistic way, ignoring the causal structure among short-term outcomes and treating all of them as surrogates. However, such methods cannot be well applied to real-world scenarios, in which the partially observed surrogates are mixed with their proxies among short-term outcomes. To this end, we develop a flexible method, Laser, to estimate long-term causal effects in the more realistic situation where the surrogates are observed or have observed proxies. Given the indistinguishability between the surrogates and their proxies, we utilize an identifiable variational auto-encoder (iVAE) to recover the valid surrogates from all surrogate candidates, without needing to distinguish the observed surrogates from the proxies of latent surrogates. With the help of the recovered surrogates, we further devise an unbiased estimator of long-term causal effects. Extensive experimental results on real-world and semi-synthetic datasets demonstrate the effectiveness of the proposed method. +
+
+
+
+
+ + ♻ ☆ A Systematic Review of Aspect-based Sentiment Analysis (ABSA): Domains, + Methods, and Trends + + +
+ Aspect-based Sentiment Analysis (ABSA) is a type of fine-grained sentiment +analysis (SA) that identifies aspects and the associated opinions from a given +text. In the digital era, ABSA gained increasing popularity and applications in +mining opinionated text data to obtain insights and support decisions. ABSA +research employs linguistic, statistical, and machine-learning approaches and +utilises resources such as labelled datasets, aspect and sentiment lexicons and +ontology. By its nature, ABSA is domain-dependent and can be sensitive to the +impact of misalignment between the resource and application domains. However, +to our knowledge, this topic has not been explored by the existing ABSA +literature reviews. In this paper, we present a Systematic Literature Review +(SLR) of ABSA studies with a focus on the research application domain, dataset +domain, and the research methods to examine their relationships and identify +trends over time. Our results suggest a number of potential systemic issues in +the ABSA research literature, including the predominance of the +``product/service review'' dataset domain among the majority of studies that +did not have a specific research application domain, coupled with the +prevalence of dataset-reliant methods such as supervised machine learning. This +review makes a number of unique contributions to the ABSA research field: 1) To +our knowledge, it is the first SLR that links the research domain, dataset +domain, and research method through a systematic perspective; 2) it is one of +the largest scoped SLR on ABSA, with 519 eligible studies filtered from 4191 +search results without time constraint; and 3) our review methodology adopted +an innovative automatic filtering process based on PDF-mining, which enhanced +screening quality and reliability. Suggestions and our review limitations are +also discussed. + +
+
+
+
+
+ + ♻ ☆ Exponentially Faster Language Modelling + + +
+ Language models only really need to use an exponential fraction of their +neurons for individual inferences. As proof, we present UltraFastBERT, a BERT +variant that uses 0.3% of its neurons during inference while performing on par +with similar BERT models. UltraFastBERT selectively engages just 12 out of 4095 +neurons for each layer inference. This is achieved by replacing feedforward +networks with fast feedforward networks (FFFs). While no truly efficient +implementation currently exists to unlock the full acceleration potential of +conditional neural execution, we provide high-level CPU code achieving 78x +speedup over the optimized baseline feedforward implementation, and a PyTorch +implementation delivering 40x speedup over the equivalent batched feedforward +inference. We publish our training code, benchmarking setup, and model weights. + +
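The sketch below illustrates the conditional-execution idea as we read it from the abstract: with decision neurons arranged in a balanced binary tree of depth 12 (2^12 - 1 = 4095 nodes), each input touches only the 12 neurons on one root-to-leaf path. The routing rule, leaf transforms, and training procedure here are illustrative placeholders, not the UltraFastBERT implementation.

```python
import numpy as np

rng = np.random.default_rng(0)
depth, d_model = 12, 32
n_nodes = 2 ** depth - 1                       # 4095 decision neurons in total

# One decision neuron (weight vector + bias) per internal tree node.
W_node = rng.normal(size=(n_nodes, d_model)) / np.sqrt(d_model)
b_node = np.zeros(n_nodes)
# One small linear transform per leaf (2**depth leaves) as a stand-in for the leaf networks.
W_leaf = rng.normal(size=(2 ** depth, d_model, d_model)) / np.sqrt(d_model)

def fff_forward(x):
    """Conditional execution: only `depth` (= 12) decision neurons are evaluated per input."""
    node = 0
    for _ in range(depth):
        go_right = (W_node[node] @ x + b_node[node]) > 0.0   # evaluate a single neuron
        node = 2 * node + (2 if go_right else 1)             # heap-style child index
    leaf = node - n_nodes                                    # leaves are numbered after the nodes
    return x @ W_leaf[leaf]

out = fff_forward(rng.normal(size=d_model))
print(out.shape)   # (32,)
```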
+
+
+
+
+ + ♻ ☆ Verified Compositional Neuro-Symbolic Control for Stochastic Systems + with Temporal Logic Tasks + + +
+ Several methods have been proposed recently to learn neural network (NN) +controllers for autonomous agents, with unknown and stochastic dynamics, tasked +with complex missions captured by Linear Temporal Logic (LTL). Due to the +sample-inefficiency of the majority of these works, compositional learning +methods have been proposed decomposing the LTL specification into smaller +sub-tasks. Then, separate controllers are learned and composed to satisfy the +original task. A key challenge within these approaches is that they often lack +safety guarantees or the provided guarantees are impractical. This paper aims +to address this challenge. Particularly, we consider autonomous systems with +unknown and stochastic dynamics and LTL-encoded tasks. We assume that the +system is equipped with a finite set of base skills modeled by trained NN +feedback controllers. Our goal is to check if there exists a temporal +composition of the trained NN controllers - and if so, to compute it - that +will yield a composite system behavior that satisfies the assigned LTL task +with probability one. We propose a new approach that relies on a novel +integration of automata theory and data-driven reachability analysis tools for +NN-controlled stochastic systems. The resulting neuro-symbolic controller +allows the agent to generate safe behaviors for unseen complex temporal logic +tasks in a zero-shot fashion by leveraging its base skills. We show correctness +of the proposed method and we provide conditions under which it is complete. To +the best of our knowledge, this is the first work that designs verified +temporal compositions of NN controllers for unknown and stochastic systems. +Finally, we provide extensive numerical simulations and hardware experiments on +robot navigation tasks to demonstrate the proposed method. + +
+
+ comment: The paper was withdrawn as it did not include the correct author + list, credit was given to the wrong author +
+
+
+
+
+ + ♻ ☆ Better with Less: A Data-Active Perspective on Pre-Training Graph Neural + Networks + + +
+ Pre-training on graph neural networks (GNNs) aims to learn transferable +knowledge for downstream tasks with unlabeled data, and it has recently become +an active research area. The success of graph pre-training models is often +attributed to the massive amount of input data. In this paper, however, we +identify the curse of big data phenomenon in graph pre-training: more training +data do not necessarily lead to better downstream performance. Motivated by +this observation, we propose a better-with-less framework for graph +pre-training: fewer, but carefully chosen data are fed into a GNN model to +enhance pre-training. The proposed pre-training pipeline is called the +data-active graph pre-training (APT) framework, and is composed of a graph +selector and a pre-training model. The graph selector chooses the most +representative and instructive data points based on the inherent properties of +graphs as well as predictive uncertainty. The proposed predictive uncertainty, +as feedback from the pre-training model, measures the confidence level of the +model in the data. When fed with the chosen data, on the other hand, the +pre-training model grasps an initial understanding of the new, unseen data, and +at the same time attempts to remember the knowledge learned from previous data. +Therefore, the integration and interaction between these two components form a +unified framework (APT), in which graph pre-training is performed in a +progressive and iterative way. Experiment results show that the proposed APT is +able to obtain an efficient pre-training model with fewer training data and +better downstream performance. + +
+
+
+
+
+ + ♻ ☆ Multifidelity Deep Operator Networks For Data-Driven and + Physics-Informed Problems + + +
+ Operator learning for complex nonlinear systems is increasingly common in +modeling multi-physics and multi-scale systems. However, training such +high-dimensional operators requires a large amount of expensive, high-fidelity +data, either from experiments or simulations. In this work, we present a +composite Deep Operator Network (DeepONet) for learning using two datasets with +different levels of fidelity to accurately learn complex operators when +sufficient high-fidelity data is not available. Additionally, we demonstrate +that the presence of low-fidelity data can improve the predictions of +physics-informed learning with DeepONets. We demonstrate the new multi-fidelity +training in diverse examples, including modeling of the ice-sheet dynamics of +the Humboldt glacier, Greenland, using two different fidelity models and also +using the same physical model at two different resolutions. + +
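The composite idea can be illustrated with a much simpler stand-in than a DeepONet: fit a surrogate on abundant low-fidelity data, then feed its prediction as an extra input feature to a second model fit on scarce high-fidelity data. The sketch below does exactly that with linear least squares; the actual operator-learning architecture and training details are in the paper.

```python
import numpy as np

rng = np.random.default_rng(0)
f_hi = lambda x: np.sin(8 * x) + 0.3 * x             # expensive "high-fidelity" truth
f_lo = lambda x: np.sin(8 * x)                       # cheap, biased "low-fidelity" model

x_lo, x_hi = rng.uniform(0, 1, 200), rng.uniform(0, 1, 15)
y_lo, y_hi = f_lo(x_lo), f_hi(x_hi)

def features(x, extra=None):
    cols = [np.ones_like(x), x, np.sin(8 * x), np.cos(8 * x)]
    if extra is not None:
        cols.append(extra)
    return np.stack(cols, axis=1)

# Step 1: fit the low-fidelity surrogate on abundant data.
w_lo, *_ = np.linalg.lstsq(features(x_lo), y_lo, rcond=None)
lo_pred = lambda x: features(x) @ w_lo

# Step 2: fit the high-fidelity model, feeding the low-fidelity prediction as a feature.
w_hi, *_ = np.linalg.lstsq(features(x_hi, lo_pred(x_hi)), y_hi, rcond=None)
hi_pred = lambda x: features(x, lo_pred(x)) @ w_hi

x_test = np.linspace(0, 1, 200)
print("test RMSE:", np.sqrt(np.mean((hi_pred(x_test) - f_hi(x_test)) ** 2)))
```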
+
+
+
+
+ + ♻ ☆ Stacked networks improve physics-informed training: applications to + neural networks and deep operator networks + + +
+ Physics-informed neural networks and operator networks have shown promise for +effectively solving equations modeling physical systems. However, these +networks can be difficult or impossible to train accurately for some systems of +equations. We present a novel multifidelity framework for stacking +physics-informed neural networks and operator networks that facilitates +training. We successively build a chain of networks, where the output at one +step can act as a low-fidelity input for training the next step, gradually +increasing the expressivity of the learned model. The equations imposed at each +step of the iterative process can be the same or different (akin to simulated +annealing). The iterative (stacking) nature of the proposed method allows us to +progressively learn features of a solution that are hard to learn directly. +Through benchmark problems including a nonlinear pendulum, the wave equation, +and the viscous Burgers equation, we show how stacking can be used to improve +the accuracy and reduce the required size of physics-informed neural networks +and operator networks. + +
+
+
+
+
+ + ♻ ☆ Supervised and Unsupervised Deep Learning Approaches for EEG Seizure + Prediction + + +
+ Epilepsy affects more than 50 million people worldwide, making it one of the world's most prevalent neurological diseases. The main symptom of epilepsy is seizures, which occur abruptly and can cause serious injury or death. The ability to predict the occurrence of an epileptic seizure could alleviate many risks and stresses people with epilepsy face. We formulate the problem of detecting preictal (pre-seizure) EEG, with reference to normal EEG, as a precursor to an incoming seizure. To this end, we developed several supervised deep learning models to identify preictal EEG from normal EEG. We further develop novel unsupervised deep learning approaches that train models on only normal EEG and detect pre-seizure EEG as an anomalous event. These deep learning models were trained and evaluated on two large EEG seizure datasets in a person-specific manner. We found that both supervised and unsupervised approaches are feasible; however, their performance varies depending on the patient, approach and architecture. This new line of research has the potential to develop therapeutic interventions and save human lives. +
+
+ comment: 16 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Causal Reinforcement Learning: A Survey + + +
+ Reinforcement learning is an essential paradigm for solving sequential +decision problems under uncertainty. Despite many remarkable achievements in +recent decades, applying reinforcement learning methods in the real world +remains challenging. One of the main obstacles is that reinforcement learning +agents lack a fundamental understanding of the world and must therefore learn +from scratch through numerous trial-and-error interactions. They may also face +challenges in providing explanations for their decisions and generalizing the +acquired knowledge. Causality, however, offers a notable advantage as it can +formalize knowledge in a systematic manner and leverage invariance for +effective knowledge transfer. This has led to the emergence of causal +reinforcement learning, a subfield of reinforcement learning that seeks to +enhance existing algorithms by incorporating causal relationships into the +learning process. In this survey, we comprehensively review the literature on +causal reinforcement learning. We first introduce the basic concepts of +causality and reinforcement learning, and then explain how causality can +address core challenges in non-causal reinforcement learning. We categorize and +systematically review existing causal reinforcement learning approaches based +on their target problems and methodologies. Finally, we outline open issues and +future directions in this emerging field. + +
+
+ comment: 52 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Data is often loadable in short depth: Quantum circuits from tensor + networks for finance, images, fluids, and proteins + + +
+ Though there has been substantial progress in developing quantum algorithms +to study classical datasets, the cost of simply loading classical data is an +obstacle to quantum advantage. When the amplitude encoding is used, loading an +arbitrary classical vector requires up to exponential circuit depths with +respect to the number of qubits. Here, we address this "input problem" with two +contributions. First, we introduce a circuit compilation method based on tensor +network (TN) theory. Our method -- AMLET (Automatic Multi-layer Loader +Exploiting TNs) -- proceeds via careful construction of a specific TN topology +and can be tailored to arbitrary circuit depths. Second, we perform numerical +experiments on real-world classical data from four distinct areas: finance, +images, fluid mechanics, and proteins. To the best of our knowledge, this is +the broadest numerical analysis to date of loading classical data into a +quantum computer. Consistent with other recent work in this area, the required +circuit depths are often several orders of magnitude lower than the +exponentially-scaling general loading algorithm would require. Besides +introducing a more efficient loading algorithm, this work demonstrates that +many classical datasets are loadable in depths that are much shorter than +previously expected, which has positive implications for speeding up classical +workloads on quantum computers. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Multi-Resolution Diffusion for Privacy-Sensitive Recommender Systems + + +
+ While recommender systems have become an integral component of the Web +experience, their heavy reliance on user data raises privacy and security +concerns. Substituting user data with synthetic data can address these +concerns, but accurately replicating these real-world datasets has been a +notoriously challenging problem. Recent advancements in generative AI have +demonstrated the impressive capabilities of diffusion models in generating +realistic data across various domains. In this work we introduce a Score-based +Diffusion Recommendation Module (SDRM), which captures the intricate patterns +of real-world datasets required for training highly accurate recommender +systems. SDRM allows for the generation of synthetic data that can replace +existing datasets to preserve user privacy, or augment existing datasets to +address excessive data sparsity. Our method outperforms competing baselines +such as generative adversarial networks, variational autoencoders, and recently +proposed diffusion models in synthesizing various datasets to replace or +augment the original data by an average improvement of 4.30% in Recall@$k$ and +4.65% in NDCG@$k$. + +
+
+ comment: 10 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Absolute Policy Optimization + + +
+ In recent years, trust region on-policy reinforcement learning has achieved impressive results in addressing complex control tasks and gaming scenarios. However, contemporary state-of-the-art algorithms in this category primarily emphasize improvement in expected performance and lack the ability to control worst-case performance outcomes. To address this limitation, we introduce a novel objective function whose optimization guarantees monotonic improvement in the lower bound of near-total performance samples (absolute performance). Building on this theoretical advancement, we then refine the theoretically grounded algorithm through a series of approximations, resulting in a practical solution called Absolute Policy Optimization (APO). Our experiments demonstrate the effectiveness of our approach across challenging continuous control benchmark tasks and extend its applicability to mastering Atari games. Our findings reveal that APO significantly outperforms state-of-the-art policy gradient algorithms, resulting in substantial improvements in both expected performance and worst-case performance. +
+
+ comment: I submitted this article to the Journal of Machine Learning Research. The manuscript will undergo a major revision and I don't want the reviewers to know who I am. I will re-upload after the JMLR review is released +
+
+
+
+
+ + ♻ ☆ Extraction and Summarization of Explicit Video Content using Multi-Modal + Deep Learning + + +
+ With the increase in video-sharing platforms across the internet, it is +difficult for humans to moderate the data for explicit content. Hence, an +automated pipeline to scan through video data for explicit content has become +the need of the hour. We propose a novel pipeline that uses multi-modal deep +learning to first extract the explicit segments of input videos and then +summarize their content using text to determine its age appropriateness and age +rating. We also evaluate our pipeline's effectiveness in the end using standard +metrics. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ A Randomized Approach for Tight Privacy Accounting NeurIPS 2023 + + +
+ Bounding privacy leakage over compositions, i.e., privacy accounting, is a key challenge in differential privacy (DP). The privacy parameter ($\varepsilon$ or $\delta$) is often easy to estimate but hard to bound. In this paper, we propose a new differential privacy paradigm called estimate-verify-release (EVR), which addresses the challenge of providing a strict upper bound for the privacy parameter in DP compositions by converting an estimate of the privacy parameter into a formal guarantee. The EVR paradigm first estimates the privacy parameter of a mechanism, then verifies whether the mechanism meets this estimated guarantee, and finally releases the query output based on the verification result. The core component of EVR is privacy verification. We develop a randomized privacy verifier using Monte Carlo (MC) techniques. Furthermore, we propose an MC-based DP accountant that outperforms existing DP accounting techniques in accuracy and efficiency. Our empirical evaluation shows that the newly proposed EVR paradigm improves the utility-privacy tradeoff for privacy-preserving machine learning. +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Why Shallow Networks Struggle with Approximating and Learning High + Frequency: A Numerical Study + + +
+ In this work, a comprehensive numerical study involving analysis and +experiments shows why a two-layer neural network has difficulties handling high +frequencies in approximation and learning when machine precision and +computation cost are important factors in real practice. In particular, the +following basic computational issues are investigated: (1) the minimal +numerical error one can achieve given a finite machine precision, (2) the +computation cost to achieve a given accuracy, and (3) stability with respect to +perturbations. The key to the study is the conditioning of the representation +and its learning dynamics. Explicit answers to the above questions with +numerical verifications are presented. + +
+
+
+
+
+ + ♻ ☆ GeoCLIP: Clip-Inspired Alignment between Locations and Images for + Effective Worldwide Geo-localization NeurIPS 2023 + + +
+ Worldwide Geo-localization aims to pinpoint the precise location of images +taken anywhere on Earth. This task has considerable challenges due to immense +variation in geographic landscapes. The image-to-image retrieval-based +approaches fail to solve this problem on a global scale as it is not feasible +to construct a large gallery of images covering the entire world. Instead, +existing approaches divide the globe into discrete geographic cells, +transforming the problem into a classification task. However, their performance +is limited by the predefined classes and often results in inaccurate +localizations when an image's location significantly deviates from its class +center. To overcome these limitations, we propose GeoCLIP, a novel +CLIP-inspired Image-to-GPS retrieval approach that enforces alignment between +the image and its corresponding GPS locations. GeoCLIP's location encoder +models the Earth as a continuous function by employing positional encoding +through random Fourier features and constructing a hierarchical representation +that captures information at varying resolutions to yield a semantically rich +high-dimensional feature suitable to use even beyond geo-localization. To the +best of our knowledge, this is the first work employing GPS encoding for +geo-localization. We demonstrate the efficacy of our method via extensive +experiments and ablations on benchmark datasets. We achieve competitive +performance with just 20% of training data, highlighting its effectiveness even +in limited-data settings. Furthermore, we qualitatively demonstrate +geo-localization using a text query by leveraging CLIP backbone of our image +encoder. The project webpage is available at: +https://vicentevivan.github.io/GeoCLIP + +
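The abstract mentions positional encoding of GPS coordinates through random Fourier features. Below is a generic multi-resolution random-Fourier-feature encoder for (latitude, longitude) pairs, with bandwidths, feature counts, and normalisation chosen arbitrarily for illustration rather than taken from GeoCLIP.

```python
import numpy as np

rng = np.random.default_rng(0)
SIGMAS = (1.0, 16.0, 256.0)        # one frequency band per "resolution" (arbitrary choices)
N_PER_BAND = 64
# Fixed random projection matrices, drawn once so the encoding is consistent across calls.
BS = [rng.normal(scale=s, size=(2, N_PER_BAND)) for s in SIGMAS]

def rff_encode(latlon):
    """Map (lat, lon) in degrees to a multi-resolution random Fourier feature vector."""
    x = np.atleast_2d(latlon) / 90.0                 # crude normalisation of the coordinates
    feats = []
    for B in BS:
        proj = 2 * np.pi * x @ B
        feats += [np.cos(proj), np.sin(proj)]
    return np.concatenate(feats, axis=1)             # shape (n, 2 * N_PER_BAND * len(SIGMAS))

z = rff_encode(np.array([[40.7128, -74.0060],        # New York
                         [48.8566,   2.3522]]))      # Paris
print(z.shape)                                       # (2, 384)
```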
+
+ comment: Accepted at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Using Taylor-Approximated Gradients to Improve the Frank-Wolfe Method + for Empirical Risk Minimization + + +
+ The Frank-Wolfe method has become increasingly useful in statistical and +machine learning applications, due to the structure-inducing properties of the +iterates, and especially in settings where linear minimization over the +feasible set is more computationally efficient than projection. In the setting +of Empirical Risk Minimization -- one of the fundamental optimization problems +in statistical and machine learning -- the computational effectiveness of +Frank-Wolfe methods typically grows linearly in the number of data observations +$n$. This is in stark contrast to the case for typical stochastic projection +methods. In order to reduce this dependence on $n$, we look to second-order +smoothness of typical smooth loss functions (least squares loss and logistic +loss, for example) and we propose amending the Frank-Wolfe method with Taylor +series-approximated gradients, including variants for both deterministic and +stochastic settings. Compared with current state-of-the-art methods in the +regime where the optimality tolerance $\varepsilon$ is sufficiently small, our +methods are able to simultaneously reduce the dependence on large $n$ while +obtaining optimal convergence rates of Frank-Wolfe methods, in both the convex +and non-convex settings. We also propose a novel adaptive step-size approach +for which we have computational guarantees. Last of all, we present +computational experiments which show that our methods exhibit very significant +speed-ups over existing methods on real-world datasets for both convex and +non-convex binary classification problems. + +
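For context, here is a plain Frank-Wolfe loop for least squares over an l1 ball, where the linear minimization oracle simply selects one signed coordinate vertex; the paper's Taylor-approximated gradients and adaptive step sizes are not reproduced in this sketch.

```python
import numpy as np

rng = np.random.default_rng(0)
n, d, radius = 200, 50, 5.0
A = rng.normal(size=(n, d))
b = A @ (radius * rng.dirichlet(np.ones(d))) + 0.1 * rng.normal(size=n)

x = np.zeros(d)
for t in range(500):
    grad = A.T @ (A @ x - b) / n
    # Linear minimization oracle over the l1 ball: a signed vertex of the ball.
    i = int(np.argmax(np.abs(grad)))
    s = np.zeros(d)
    s[i] = -radius * np.sign(grad[i])
    gamma = 2.0 / (t + 2.0)                  # classic Frank-Wolfe step-size schedule
    x = (1.0 - gamma) * x + gamma * s        # iterate stays a sparse convex combination
print("objective:", 0.5 * np.mean((A @ x - b) ** 2))
```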
+
+ comment: 30 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Advancements in Generative AI: A Comprehensive Review of GANs, GPT, + Autoencoders, Diffusion Model, and Transformers + + +
+ The launch of ChatGPT has garnered global attention, marking a significant milestone in the field of Generative Artificial Intelligence. While Generative AI has been developing for the past decade, the introduction of ChatGPT has ignited a new wave of research and innovation in the AI domain. This surge in interest has led to the development and release of numerous cutting-edge tools, such as Bard, Stable Diffusion, DALL-E, Make-A-Video, Runway ML, and Jukebox, among others. These tools exhibit remarkable capabilities, encompassing tasks ranging from text generation and music composition to image creation, video production, code generation, and even scientific work. They are built upon various state-of-the-art models, including Stable Diffusion, transformer models such as GPT-3 (and more recently GPT-4), variational autoencoders, and generative adversarial networks. This advancement in Generative AI presents a wealth of exciting opportunities and, simultaneously, unprecedented challenges. Throughout this paper, we explore these state-of-the-art models, the diverse array of tasks they can accomplish, the challenges they pose, and the promising future of Generative Artificial Intelligence. +
+
+
+
+
+ + ♻ ☆ HINT: Healthy Influential-Noise based Training to Defend against Data + Poisoning Attacks + + +
+ While numerous defense methods have been proposed to prohibit potential poisoning attacks from untrusted data sources, most research works only defend against specific attacks, which leaves many avenues for an adversary to exploit. In this work, we propose an efficient and robust training approach to defend against data poisoning attacks based on influence functions, named Healthy Influential-Noise based Training. Using influence functions, we craft healthy noise that helps to harden the classification model against poisoning attacks without significantly affecting the generalization ability on test data. In addition, our method performs effectively when only a subset of the training data is modified, instead of adding noise to all examples as has been done in several previous works. We conduct comprehensive evaluations over two image datasets with state-of-the-art poisoning attacks under different realistic attack scenarios. Our empirical results show that HINT can efficiently protect deep learning models against the effect of both untargeted and targeted poisoning attacks. +
+
+
+
+
+ + ♻ ☆ Epsilon*: Privacy Metric for Machine Learning Models + + +
+ We introduce Epsilon*, a new privacy metric for measuring the privacy risk of +a single model instance prior to, during, or after deployment of privacy +mitigation strategies. The metric requires only black-box access to model +predictions, does not require training data re-sampling or model re-training, +and can be used to measure the privacy risk of models not trained with +differential privacy. Epsilon* is a function of true positive and false +positive rates in a hypothesis test used by an adversary in a membership +inference attack. We distinguish between quantifying the privacy loss of a +trained model instance, which we refer to as empirical privacy, and quantifying +the privacy loss of the training mechanism which produces this model instance. +Existing approaches in the privacy auditing literature provide lower bounds for +the latter, while our metric provides an empirical lower bound for the former +by relying on an (${\epsilon}$, ${\delta}$)-type of quantification of the +privacy of the trained model instance. We establish a relationship between +these lower bounds and show how to implement Epsilon* to avoid numerical and +noise amplification instability. We further show in experiments on benchmark +public data sets that Epsilon* is sensitive to privacy risk mitigation by +training with differential privacy (DP), where the value of Epsilon* is reduced +by up to 800% compared to the Epsilon* values of non-DP trained baseline +models. This metric allows privacy auditors to be independent of model owners, +and enables visualizing the privacy-utility landscape to make informed +decisions regarding the trade-offs between model privacy and utility. + +
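Epsilon* is described as a function of the true and false positive rates of a membership-inference hypothesis test. As a rough, assumption-laden illustration (the paper's exact estimator, confidence handling, and stability fixes are not reproduced), the snippet below converts empirical TPR/FPR at a single attack-score threshold into the standard (epsilon, delta) hypothesis-testing lower bound.

```python
import numpy as np

def empirical_epsilon_lower_bound(scores_members, scores_nonmembers, threshold, delta=1e-5):
    """One standard hypothesis-testing lower bound on epsilon from a thresholded attack."""
    tpr = np.mean(scores_members >= threshold)      # members flagged as members
    fpr = np.mean(scores_nonmembers >= threshold)   # non-members flagged as members
    eps = max(
        np.log(max(tpr - delta, 1e-12) / max(fpr, 1e-12)),
        np.log(max(1.0 - fpr - delta, 1e-12) / max(1.0 - tpr, 1e-12)),
    )
    return max(eps, 0.0), tpr, fpr

rng = np.random.default_rng(0)
members = rng.normal(1.0, 1.0, 10_000)       # e.g. per-example loss-based attack scores (toy)
nonmembers = rng.normal(0.0, 1.0, 10_000)
eps, tpr, fpr = empirical_epsilon_lower_bound(members, nonmembers, threshold=2.0)
print(f"eps >= {eps:.2f} (TPR={tpr:.3f}, FPR={fpr:.3f})")
```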
+
+
+
+
+ + ♻ ☆ Reliable Generation of EHR Time Series via Diffusion Models + + +
+ Electronic Health Records (EHRs) are rich sources of patient-level data, +including laboratory tests, medications, and diagnoses, offering valuable +resources for medical data analysis. However, concerns about privacy often +restrict access to EHRs, hindering downstream analysis. Researchers have +explored various methods for generating privacy-preserving EHR data. In this +study, we introduce a new method for generating diverse and realistic synthetic +EHR time series data using Denoising Diffusion Probabilistic Models (DDPM). We +conducted experiments on six datasets, comparing our proposed method with eight +existing methods. Our results demonstrate that our approach significantly +outperforms all existing methods in terms of data utility while requiring less +training effort. Our approach also enhances downstream medical data analysis by +providing diverse and realistic synthetic EHR data. + +
+
+
+
+
+ + ♻ ☆ Inferring Actual Treatment Pathways from Patient Records + + +
+ Treatment pathways are step-by-step plans outlining the recommended medical +care for specific diseases; they get revised when different treatments are +found to improve patient outcomes. Examining health records is an important +part of this revision process, but inferring patients' actual treatments from +health data is challenging due to complex event-coding schemes and the absence +of pathway-related annotations. This study aims to infer the actual treatment +steps for a particular patient group from administrative health records (AHR) - +a common form of tabular healthcare data - and address several technique- and +methodology-based gaps in treatment pathway-inference research. We introduce +Defrag, a method for examining AHRs to infer the real-world treatment steps for +a particular patient group. Defrag learns the semantic and temporal meaning of +healthcare event sequences, allowing it to reliably infer treatment steps from +complex healthcare data. To our knowledge, Defrag is the first +pathway-inference method to utilise a neural network (NN), an approach made +possible by a novel, self-supervised learning objective. We also developed a +testing and validation framework for pathway inference, which we use to +characterise and evaluate Defrag's pathway inference ability and compare +against baselines. We demonstrate Defrag's effectiveness by identifying +best-practice pathway fragments for breast cancer, lung cancer, and melanoma in +public healthcare records. Additionally, we use synthetic data experiments to +demonstrate the characteristics of the Defrag method, and to compare Defrag to +several baselines where it significantly outperforms non-NN-based methods. +Defrag significantly outperforms several existing pathway-inference methods and +offers an innovative and effective approach for inferring treatment pathways +from AHRs. Open-source code is provided to encourage further research in this +area. + +
+
+
+
+
+ + ♻ ☆ Stability and Generalization of Stochastic Compositional Gradient + Descent Algorithms + + +
+ Many machine learning tasks, such as reinforcement learning, AUC maximization, and meta-learning, can be formulated as a stochastic compositional optimization (SCO) problem, where the objective function involves a nested composition associated with an expectation. While a significant number of studies have been devoted to the convergence behavior of SCO algorithms, there is little work on understanding their generalization, i.e., how these learning algorithms built from training examples would behave on future test examples. In this paper, we provide a stability and generalization analysis of stochastic compositional gradient descent algorithms through the lens of algorithmic stability in the framework of statistical learning theory. First, we introduce a stability concept called compositional uniform stability and establish its quantitative relation with generalization for SCO problems. Then, we establish compositional uniform stability results for two popular stochastic compositional gradient descent algorithms, namely SCGD and SCSC. Finally, we derive dimension-independent excess risk bounds for SCGD and SCSC by trading off their stability results and optimization errors. To the best of our knowledge, these are the first known results on the stability and generalization of stochastic compositional gradient descent algorithms. +
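For readers unfamiliar with the algorithm being analyzed, SCGD maintains a running estimate of the inner expectation and takes chain-rule gradient steps through that estimate. The toy sketch below applies this two-timescale update to a simple compositional objective; the step sizes and test problem are arbitrary choices, and the paper's SCSC variant and stability analysis are not shown.

```python
import numpy as np

rng = np.random.default_rng(0)
d = 5
c = rng.normal(size=d)                 # with E[A] = I below, the minimizer is x* = c

def sample_g(x):
    """Stochastic inner map g(x; xi) = A_xi @ x with E[A_xi] = I; returns value and Jacobian."""
    A = np.eye(d) + 0.3 * rng.normal(size=(d, d))
    return A @ x, A

grad_f = lambda y: y - c               # outer function f(y) = 0.5 * ||y - c||^2

x = np.zeros(d)
y = np.zeros(d)                        # running estimate of the inner expectation E[g(x; xi)]
for t in range(1, 20001):
    alpha, beta = 0.5 * t ** -0.75, t ** -0.5     # two-timescale step sizes (arbitrary constants)
    g_val, _ = sample_g(x)
    y = (1.0 - beta) * y + beta * g_val           # track the inner expectation
    _, A = sample_g(x)                            # fresh sample for the Jacobian
    x = x - alpha * A.T @ grad_f(y)               # chain-rule step through the tracked estimate
print("distance to minimizer:", np.linalg.norm(x - c))
```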
+
+
+
+
+ + ♻ ☆ Safety-aware Causal Representation for Trustworthy Reinforcement + Learning in Autonomous Driving + + +
+ In the domain of autonomous driving, the Learning from Demonstration (LfD) +paradigm has exhibited notable efficacy in addressing sequential +decision-making problems. However, consistently achieving safety in varying +traffic contexts, especially in safety-critical scenarios, poses a significant +challenge due to the long-tailed and unforeseen scenarios absent from offline +datasets. In this paper, we introduce the saFety-aware strUctured Scenario +representatION (FUSION), a pioneering methodology conceived to facilitate the +learning of an adaptive end-to-end driving policy by leveraging structured +scenario information. FUSION capitalizes on the causal relationships between +decomposed reward, cost, state, and action space, constructing a framework for +structured sequential reasoning under dynamic traffic environments. We conduct +rigorous evaluations in two typical real-world settings of distribution shift +in autonomous vehicles, demonstrating the good balance between safety cost and +utility reward of FUSION compared to contemporary state-of-the-art safety-aware +LfD baselines. Empirical evidence under diverse driving scenarios attests that +FUSION significantly enhances the safety and generalizability of autonomous +driving agents, even in the face of challenging and unseen environments. +Furthermore, our ablation studies reveal noticeable improvements in the +integration of causal representation into the safe offline RL problem. + +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Towards Natural Language-Guided Drones: GeoText-1652 Benchmark with Spatial Relation Matching + +
+ Drone navigation through natural language commands remains a significant challenge due to the lack of publicly available multi-modal datasets and the intricate demands of fine-grained visual-text alignment. In response to this pressing need, we present a new human-computer interaction annotation benchmark called GeoText-1652, meticulously curated through a robust Large Language Model (LLM)-based data generation framework and the expertise of pre-trained vision models. This new dataset seamlessly extends the existing image dataset, i.e., University-1652, with spatial-aware text annotations, encompassing intricate image-text-bounding-box associations. In addition, we introduce a new optimization objective to leverage fine-grained spatial associations, called blending spatial matching, for region-level spatial relation matching. Extensive experiments reveal that our approach maintains an exceptional recall rate under varying description complexities. This underscores the promising potential of our approach in elevating drone control and navigation through the seamless integration of natural language commands in real-world scenarios. +
+
+ comment: 10 pages, 6 figures +
+
+
+
+
+ + ☆ HierSpeech++: Bridging the Gap between Semantic and Acoustic + Representation of Speech by Hierarchical Variational Inference for Zero-shot + Speech Synthesis + + +
+ Large language model (LLM)-based speech synthesis has been widely adopted for zero-shot speech synthesis. However, such models require large-scale data and share the limitations of previous autoregressive speech models, including slow inference speed and lack of robustness. This paper proposes HierSpeech++, a fast and strong zero-shot speech synthesizer for text-to-speech (TTS) and voice conversion (VC). We verified that hierarchical speech synthesis frameworks can significantly improve the robustness and expressiveness of synthetic speech. Furthermore, we significantly improve the naturalness and speaker similarity of synthetic speech even in zero-shot speech synthesis scenarios. For text-to-speech, we adopt the text-to-vec framework, which generates a self-supervised speech representation and an F0 representation based on text representations and prosody prompts. Then, HierSpeech++ generates speech from the generated vector, F0, and voice prompt. We further introduce a highly efficient speech super-resolution framework from 16 kHz to 48 kHz. The experimental results demonstrate that a hierarchical variational autoencoder can be a strong zero-shot speech synthesizer, as it outperforms LLM-based and diffusion-based models. Moreover, we achieve the first human-level-quality zero-shot speech synthesis. Audio samples and source code are available at https://github.com/sh-lee-prml/HierSpeechpp. +
+
+ comment: 16 pages, 9 figures, 12 tables +
+
+
+
+
+ + ☆ CASR: Refining Action Segmentation via Marginalizing Frame-level Causal Relationships + +
+ Integrating deep learning and causal discovery has increased the interpretability of Temporal Action Segmentation (TAS) tasks. However, frame-level causal relationships contain much complicated noise beyond the segment level, making it infeasible to directly express macro action semantics. Thus, we propose Causal Abstraction Segmentation Refiner (CASR), which can refine TAS results from various models by enhancing video causality through marginalizing frame-level causal relationships. Specifically, we define equivalent frame-level and segment-level causal models, so that the causal adjacency matrix constructed from marginalized frame-level causal relationships is able to represent the segment-level causal relationships. CASR works by reducing the difference between the causal adjacency matrix we construct and that of the pre-segmentation results of backbone models. In addition, we propose a novel evaluation metric, Causal Edit Distance (CED), to evaluate causal interpretability. Extensive experimental results on mainstream datasets indicate that CASR significantly surpasses various existing methods in action segmentation performance, as well as in causal explainability and generalization. Our code will be available soon. +
+
+
+
+
+ + ☆ Equipping Pretrained Unconditional Music Transformers with Instrument + and Genre Controls + + +
+ The "pretraining-and-finetuning" paradigm has become the norm for training domain-specific models in natural language processing and computer vision. In this work, we examine this paradigm for symbolic music generation by leveraging the largest-ever symbolic music dataset, sourced from the MuseScore forum. We first pretrain a large unconditional transformer model on 1.5 million songs. We then propose a simple technique to equip this pretrained unconditional music transformer with instrument and genre controls by finetuning the model with additional control tokens. Our proposed representation offers improved high-level controllability and expressiveness compared to two existing representations. The experimental results show that the proposed model can successfully generate music with user-specified instruments and genres. In a subjective listening test, the proposed model outperforms the pretrained baseline model in terms of coherence, harmony, arrangement, and overall quality. +
+
+
+
+
+ + ☆ Attribute-Aware Deep Hashing with Self-Consistency for Large-Scale + Fine-Grained Image Retrieval + + +
+ Our work tackles large-scale fine-grained image retrieval, i.e., ranking highest the images depicting the concept of interest (the same sub-category labels) based on the fine-grained details in the query. For such a practical task, it is desirable to alleviate both the challenge of the fine-grained nature of the data (small inter-class variations with large intra-class variations) and the explosive growth of fine-grained data. In this paper, we propose attribute-aware hashing networks with self-consistency for generating attribute-aware hash codes, which not only make the retrieval process efficient but also establish explicit correspondences between hash codes and visual attributes. Specifically, based on visual representations captured by attention, we develop an encoder-decoder network for a reconstruction task that distills, without attribute annotations, high-level attribute-specific vectors from the appearance-specific visual representations. Our models are also equipped with a feature decorrelation constraint on these attribute vectors to strengthen their representative abilities. Then, driven by preserving the original entities' similarity, the required hash codes can be generated from these attribute-specific vectors and thus become attribute-aware. Furthermore, to combat simplicity bias in deep hashing, we consider the model design from the perspective of the self-consistency principle and propose to further enhance the models' self-consistency by adding an image reconstruction path. Comprehensive quantitative experiments under diverse empirical settings on six fine-grained retrieval datasets and two generic retrieval datasets show the superiority of our models over competing methods. +
+
+ comment: Accepted by IEEE TPAMI +
+
+
+
+
+ + ♻ ☆ Enhancing Multi-modal Cooperation via Fine-grained Modality Valuation + + +
+ One primary topic of multi-modal learning is to jointly incorporate heterogeneous information from different modalities. However, most models often suffer from unsatisfactory multi-modal cooperation and cannot jointly utilize all modalities well. Some methods have been proposed to identify and enhance the worse-learnt modality, but they are often unable to provide a fine-grained, theoretically supported observation of multi-modal cooperation at the sample level. Hence, it is essential to reasonably observe and improve the fine-grained cooperation between modalities, especially when facing realistic scenarios where the modality discrepancy can vary across samples. To this end, we introduce a fine-grained modality valuation metric to evaluate the contribution of each modality at the sample level. Via modality valuation, we regretfully observe that multi-modal models tend to rely on one specific modality, leaving the other modalities low-contributing. We further analyze this issue and improve cooperation between modalities by enhancing the discriminative ability of low-contributing modalities in a targeted manner. Overall, our methods reasonably observe the fine-grained uni-modal contribution at the sample level and achieve considerable improvement on different multi-modal models. +
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ Exploring User Perceptions of Virtual Reality Scene Design in Metaverse + Learning Environments + + +
+ Metaverse learning environments allow for a seamless and intuitive transition +between activities compared to Virtual Reality (VR) learning environments, due +to their interconnected design. The design of VR scenes is important for +creating effective learning experiences in the Metaverse. However, there is +limited research on the impact of different design elements on user's learning +experiences in VR scenes. To address this, a study was conducted with 16 +participants who interacted with two VR scenes, each with varying design +elements such as style, color, texture, object, and background, while watching +a short tutorial. Participant rankings of the scenes for learning were obtained +using a seven-point Likert scale, and the Mann-Whitney U test was used to +validate differences in preference between the scenes. The results showed a +significant difference in preference between the scenes. Further analysis using +the NASA TLX questionnaire was conducted to examine the impact of this +difference on cognitive load, and participant feedback was also considered. The +study emphasizes the importance of careful VR scene design to improve the +user's learning experience. + +
+
+ comment: 6 pages,3 figures, accepted to present at IEEE 42nd International + Conference on Consumer Electronics +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 57 + +
+
+
+ + ☆ LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient + Language Model Finetuning + + +
+ We propose a simple approach for memory-efficient adaptation of pretrained +language models. Our approach uses an iterative algorithm to decompose each +pretrained matrix into a high-precision low-rank component and a +memory-efficient quantized component. During finetuning, the quantized +component remains fixed and only the low-rank component is updated. We present +an integer linear programming formulation of the quantization component which +enables dynamic configuration of quantization parameters (e.g., bit-width, +block size) for each matrix given an overall target memory budget. We further +explore a data-aware version of the algorithm which uses an approximation of +the Fisher information matrix to weight the reconstruction objective during +matrix decomposition. Experiments on adapting RoBERTa and LLaMA-2 (7B and 70B) +demonstrate that our low-rank plus quantized matrix decomposition approach +(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and moreover enables +more aggressive quantization. For example, on the OpenAssistant benchmark +LQ-LoRA is able to learn a 2.5-bit LLaMA-2 model that is competitive with a +model finetuned with 4-bit QLoRA. When finetuned on a language modeling +calibration dataset, LQ-LoRA can also be used for model compression; in this +setting our 2.75-bit LLaMA-2-70B model (which has 2.85 bits on average when +including the low-rank components and requires 27GB of GPU memory) is +competitive with the original model in full precision. + +
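A minimal NumPy sketch of the kind of iterative low-rank-plus-quantized decomposition described above: alternate between taking the best rank-r approximation of the residual via SVD and re-quantizing what remains. The uniform quantizer here is a crude stand-in, and the paper's actual quantization scheme, ILP-based configuration, and Fisher-weighted objective are not reproduced.

```python
import numpy as np

def quantize_uniform(W, bits=3):
    """Crude symmetric uniform quantizer (stand-in for the paper's quantization scheme)."""
    levels = 2 ** (bits - 1) - 1
    scale = np.max(np.abs(W)) / levels
    return np.round(W / scale).clip(-levels, levels) * scale

def lowrank_plus_quant(W, rank=16, bits=3, iters=10):
    """Alternate: L <- best rank-r approximation of (W - Q);  Q <- quantize(W - L)."""
    Q = np.zeros_like(W)
    for _ in range(iters):
        U, s, Vt = np.linalg.svd(W - Q, full_matrices=False)
        L = (U[:, :rank] * s[:rank]) @ Vt[:rank]
        Q = quantize_uniform(W - L, bits=bits)
    return L, Q

rng = np.random.default_rng(0)
W = rng.normal(size=(256, 256))
L, Q = lowrank_plus_quant(W)
err = np.linalg.norm(W - (L + Q)) / np.linalg.norm(W)
print(f"relative reconstruction error: {err:.3f}")
# During finetuning, Q would stay frozen and only a low-rank factorization of L is updated.
```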
+
+
+
+
+ + ☆ GPQA: A Graduate-Level Google-Proof Q&A Benchmark + + +
+ We present GPQA, a challenging dataset of 448 multiple-choice questions +written by domain experts in biology, physics, and chemistry. We ensure that +the questions are high-quality and extremely difficult: experts who have or are +pursuing PhDs in the corresponding domains reach 65% accuracy (74% when +discounting clear mistakes the experts identified in retrospect), while highly +skilled non-expert validators only reach 34% accuracy, despite spending on +average over 30 minutes with unrestricted access to the web (i.e., the +questions are "Google-proof"). The questions are also difficult for +state-of-the-art AI systems, with our strongest GPT-4 based baseline achieving +39% accuracy. If we are to use future AI systems to help us answer very hard +questions, for example, when developing new scientific knowledge, we need to +develop scalable oversight methods that enable humans to supervise their +outputs, which may be difficult even if the supervisors are themselves skilled +and knowledgeable. The difficulty of GPQA both for skilled non-experts and +frontier AI systems should enable realistic scalable oversight experiments, +which we hope can help devise ways for human experts to reliably get truthful +information from AI systems that surpass human capabilities. + +
+
+ comment: 28 pages, 5 figures, 7 tables +
+
+
+
+
+ + ☆ GPT-4V(ision) for Robotics: Multimodal Task Planning from Human + Demonstration + + +
+ We introduce a pipeline that enhances a general-purpose Vision Language +Model, GPT-4V(ision), by integrating observations of human actions to +facilitate robotic manipulation. This system analyzes videos of humans +performing tasks and creates executable robot programs that incorporate +affordance insights. The computation starts by analyzing the videos with GPT-4V +to convert environmental and action details into text, followed by a +GPT-4-empowered task planner. In the following analyses, vision systems +reanalyze the video with the task plan. Object names are grounded using an +open-vocabulary object detector, while focus on the hand-object relation helps +to detect the moment of grasping and releasing. This spatiotemporal grounding +allows the vision systems to further gather affordance data (e.g., grasp type, +way points, and body postures). Experiments across various scenarios +demonstrate this method's efficacy in achieving real robots' operations from +human demonstrations in a zero-shot manner. The prompts of GPT-4V/GPT-4 are +available at this project page: +https://microsoft.github.io/GPT4Vision-Robot-Manipulation-Prompts/ + +
+
+ comment: 8 pages, 10 figures, 1 table. Last updated on November 20th, 2023 +
+
+
+
+
+ + ☆ H-COAL: Human Correction of AI-Generated Labels for Biomedical Named + Entity Recognition + + +
+ With the rapid advancement of machine learning models for NLP tasks, +collecting high-fidelity labels from AI models is a realistic possibility. +Firms now make AI available to customers via predictions as a service (PaaS). +This includes PaaS products for healthcare. It is unclear whether these labels +can be used for training a local model without expensive annotation checking by +in-house experts. In this work, we propose a new framework for Human Correction +of AI-Generated Labels (H-COAL). By ranking AI-generated outputs, one can +selectively correct labels and approach gold standard performance (100% human +labeling) with significantly less human effort. We show that correcting 5% of +labels can close the AI-human performance gap by up to 64% relative +improvement, and correcting 20% of labels can close the performance gap by up +to 86% relative improvement. + +
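The selective-correction idea can be sketched with a simple ranking rule: send only the least-confident fraction of AI-generated labels to a human annotator. The confidence-based ranking and the simulated data below are illustrative assumptions, not necessarily the ranking criterion used in H-COAL.

```python
import numpy as np

def correct_least_confident(ai_labels, confidences, gold_labels, frac=0.05):
    """Send the least-confident fraction of AI labels to a human (simulated by gold labels)."""
    k = int(np.ceil(frac * len(ai_labels)))
    to_fix = np.argsort(confidences)[:k]          # lowest-confidence predictions first
    corrected = np.array(ai_labels)
    corrected[to_fix] = np.array(gold_labels)[to_fix]
    return corrected, to_fix

rng = np.random.default_rng(0)
gold = rng.integers(0, 5, 1000)                   # hypothetical 5-class entity labels
conf = rng.uniform(0.3, 1.0, 1000)
# Simulated AI labels: more likely to be wrong when the model is less confident.
ai = np.where(rng.uniform(size=1000) < conf, gold, rng.integers(0, 5, 1000))
fixed, _ = correct_least_confident(ai, conf, gold, frac=0.05)
print("accuracy before:", np.mean(ai == gold), "after:", np.mean(fixed == gold))
```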
+
+ comment: Presented at Conference on Information Systems and Technology (CIST) + 2023 +
+
+
+
+
+ + ☆ On the Potential and Limitations of Few-Shot In-Context Learning to + Generate Metamorphic Specifications for Tax Preparation Software EMNLP 2023 + + +
+ Due to the ever-increasing complexity of income tax laws in the United +States, the number of US taxpayers filing their taxes using tax preparation +software (henceforth, tax software) continues to increase. According to the +U.S. Internal Revenue Service (IRS), in FY22, nearly 50% of taxpayers filed +their individual income taxes using tax software. Given the legal consequences +of incorrectly filing taxes for the taxpayer, ensuring the correctness of tax +software is of paramount importance. Metamorphic testing has emerged as a +leading solution to test and debug legal-critical tax software due to the +absence of correctness requirements and trustworthy datasets. The key idea +behind metamorphic testing is to express the properties of a system in terms of +the relationship between one input and its slightly metamorphosed twinned +input. Extracting metamorphic properties from IRS tax publications is a tedious +and time-consuming process. As a response, this paper formulates the task of +generating metamorphic specifications as a translation task between properties +extracted from tax documents - expressed in natural language - to a contrastive +first-order logic form. We perform a systematic analysis on the potential and +limitations of in-context learning with Large Language Models(LLMs) for this +task, and outline a research agenda towards automating the generation of +metamorphic specifications for tax preparation software. + +
+
+ comment: Accepted to the Proceedings of the Natural Legal Language Processing + Workshop, EMNLP 2023 +
+
+
+
+
+ + ☆ Context-aware Neural Machine Translation for English-Japanese Business + Scene Dialogues + + +
+ Despite the remarkable advancements in machine translation, the current +sentence-level paradigm faces challenges when dealing with highly-contextual +languages like Japanese. In this paper, we explore how context-awareness can +improve the performance of the current Neural Machine Translation (NMT) models +for English-Japanese business dialogues translation, and what kind of context +provides meaningful information to improve translation. As business dialogue +involves complex discourse phenomena but offers scarce training resources, we +adapted a pretrained mBART model, finetuning on multi-sentence dialogue data, +which allows us to experiment with different contexts. We investigate the +impact of larger context sizes and propose novel context tokens encoding +extra-sentential information, such as speaker turn and scene type. We make use +of Conditional Cross-Mutual Information (CXMI) to explore how much of the +context the model uses and generalise CXMI to study the impact of the +extra-sentential context. Overall, we find that models leverage both preceding +sentences and extra-sentential context (with CXMI increasing with context size) +and we provide a more focused analysis on honorifics translation. Regarding +translation quality, increased source-side context paired with scene and +speaker information improves the model performance compared to previous work +and our context-agnostic baselines, measured in BLEU and COMET metrics. + +
+
+ comment: MT Summit 2023, research track, link to paper in proceedings: + https://aclanthology.org/2023.mtsummit-research.23/ +
+
+
+
+
+ + ☆ Adaptive Training Distributions with Scalable Online Bilevel + Optimization + + +
+ Large neural networks pretrained on web-scale corpora are central to modern +machine learning. In this paradigm, the distribution of the large, +heterogeneous pretraining data rarely matches that of the application domain. +This work considers modifying the pretraining distribution in the case where +one has a small sample of data reflecting the targeted test conditions. We +propose an algorithm motivated by a recent formulation of this setting as an +online, bilevel optimization problem. With scalability in mind, our algorithm +prioritizes computing gradients at training points which are likely to most +improve the loss on the targeted distribution. Empirically, we show that in +some cases this approach is beneficial over existing strategies from the domain +adaptation literature but may not succeed in other cases. We propose a simple +test to evaluate when our approach can be expected to work well and point +towards further research to address current limitations. + +
+
+
+
+
+ + ☆ Automatic Analysis of Substantiation in Scientific Peer Reviews EMNLP 2023 + + +
+ With the increasing number of problematic peer reviews in top AI conferences, the community is urgently in need of automatic quality control measures. In this paper, we restrict our attention to substantiation -- one popular quality aspect indicating whether the claims in a review are sufficiently supported by evidence -- and provide a solution automating this evaluation process. To achieve this goal, we first formulate the problem as claim-evidence pair extraction in scientific peer reviews, and collect SubstanReview, the first annotated dataset for this task. SubstanReview consists of 550 reviews from NLP conferences annotated by domain experts. On the basis of this dataset, we train an argument mining system to automatically analyze the level of substantiation in peer reviews. We also perform data analysis on the SubstanReview dataset to obtain meaningful insights into peer reviewing quality in NLP conferences over recent years. +
+
+ comment: Accepted to EMNLP 2023 Findings +
+
+
+
+
+ + ☆ FinanceBench: A New Benchmark for Financial Question Answering + + +
+ FinanceBench is a first-of-its-kind test suite for evaluating the performance of LLMs on open book financial question answering (QA). It comprises 10,231 questions about publicly traded companies, with corresponding answers and evidence strings. The questions in FinanceBench are ecologically valid and cover a diverse set of scenarios. They are intended to be clear-cut and straightforward to answer to serve as a minimum performance standard. We test 16 state-of-the-art model configurations (including GPT-4-Turbo, Llama2 and Claude2, with vector stores and long context prompts) on a sample of 150 cases from FinanceBench, and manually review their answers (n=2,400). The cases are available open-source. We show that existing LLMs have clear limitations for financial QA. Notably, GPT-4-Turbo used with a retrieval system incorrectly answered or refused to answer 81% of questions. While augmentation techniques such as using a longer context window to feed in relevant evidence improve performance, they are unrealistic for enterprise settings due to increased latency and cannot support larger financial documents. We find that all models examined exhibit weaknesses, such as hallucinations, that limit their suitability for use by enterprises.
+
+ comment: Dataset is available at: + https://huggingface.co/datasets/PatronusAI/financebench +
+
+
+
+
+ + ☆ LLMs as Visual Explainers: Advancing Image Classification with Evolving + Visual Descriptions + + +
+ Vision-language models (VLMs) offer a promising paradigm for image classification by comparing the similarity between images and class embeddings. A critical challenge lies in crafting precise textual representations for class names. While previous studies have leveraged recent advancements in large language models (LLMs) to enhance these descriptors, their outputs often suffer from ambiguity and inaccuracy. We identify two primary causes: 1) the prevalent reliance on textual interactions with LLMs, leading to a mismatch between the generated text and the visual content in VLMs' latent space - a phenomenon we term the "explain without seeing" dilemma; and 2) the oversight of inter-class relationships, resulting in descriptors that fail to differentiate similar classes effectively. To address these issues, we propose a novel image classification framework combining VLMs with LLMs, named Iterative Optimization with Visual Feedback. In particular, our method develops an LLM-based agent, employing an evolutionary optimization strategy to refine class descriptors. Crucially, we incorporate visual feedback from VLM classification metrics, thereby guiding the optimization process with concrete visual data. Our method improves accuracy on a wide range of image classification benchmarks, with average gains of 3.47\% over state-of-the-art methods. We also show that the resulting descriptions serve as explainable and robust features that consistently improve performance across various backbone models.
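The evolutionary loop can be sketched in a few lines: keep the best-scoring descriptors, ask the LLM for mutations conditioned on their measured accuracy (the visual feedback), and repeat. `vlm_accuracy` and `llm_propose_variants` below are hypothetical stand-ins, not the authors' API.

def evolve_descriptors(initial, vlm_accuracy, llm_propose_variants,
                       generations=5, population=8, keep=2):
    # initial: list of candidate class descriptors (strings)
    # vlm_accuracy(descriptor) -> float, classification accuracy with that descriptor
    # llm_propose_variants(descriptor, accuracy) -> list of mutated descriptors
    pool = list(initial)
    for _ in range(generations):
        scored = sorted(pool, key=vlm_accuracy, reverse=True)
        parents = scored[:keep]                         # elitism: keep the best descriptors
        children = []
        for parent in parents:
            # the LLM sees the parent and its measured accuracy (the visual feedback)
            children.extend(llm_propose_variants(parent, vlm_accuracy(parent)))
        pool = parents + children[: population - keep]
    return max(pool, key=vlm_accuracy)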
+
+
+
+
+ + ☆ Generating Valid and Natural Adversarial Examples with Large Language + Models + + +
+ Deep learning-based natural language processing (NLP) models, particularly pre-trained language models (PLMs), have been revealed to be vulnerable to adversarial attacks. However, the adversarial examples generated by many mainstream word-level adversarial attack models are neither valid nor natural, failing to preserve semantics, grammaticality, and human imperceptibility. Building on the exceptional language understanding and generation capacity of large language models (LLMs), we propose LLM-Attack, which aims at generating both valid and natural adversarial examples with LLMs. The method consists of two stages: word importance ranking (which searches for the most vulnerable words) and word synonym replacement (which substitutes them with their synonyms obtained from LLMs). Experimental results on the Movie Review (MR), IMDB, and Yelp Review Polarity datasets against the baseline adversarial attack models illustrate the effectiveness of LLM-Attack, which outperforms the baselines in human and GPT-4 evaluation by a significant margin. The model can generate adversarial examples that are typically valid and natural, preserving semantic meaning, grammaticality, and human imperceptibility.
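A minimal sketch of the two-stage recipe (importance ranking by word deletion, then LLM-supplied synonym substitution), assuming hypothetical `victim_confidence` and `llm_synonyms` functions standing in for the attacked classifier and the LLM synonym prompt respectively:

def rank_word_importance(words, label, victim_confidence):
    # score each word by how much deleting it drops the classifier's confidence
    base = victim_confidence(" ".join(words), label)
    drops = []
    for i in range(len(words)):
        ablated = words[:i] + words[i + 1:]
        drops.append((base - victim_confidence(" ".join(ablated), label), i))
    return [i for _, i in sorted(drops, reverse=True)]      # most vulnerable first

def llm_attack(text, label, victim_confidence, llm_synonyms, budget=5):
    words = text.split()
    for i in rank_word_importance(words, label, victim_confidence)[:budget]:
        best_word = words[i]
        best_conf = victim_confidence(" ".join(words), label)
        for candidate in llm_synonyms(words[i], text):       # LLM-proposed synonyms
            trial = words[:i] + [candidate] + words[i + 1:]
            conf = victim_confidence(" ".join(trial), label)
            if conf < best_conf:
                best_word, best_conf = candidate, conf
        words[i] = best_word
        if best_conf < 0.5:                                   # prediction no longer the true label
            break
    return " ".join(words)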
+
+ comment: Submitted to the IEEE for possible publication +
+
+
+
+
+ + ☆ Evil Geniuses: Delving into the Safety of LLM-based Agents + + +
+ The rapid advancements in large language models (LLMs) have led to a resurgence in LLM-based agents, which demonstrate impressive human-like behaviors and cooperative capabilities in various interactions and strategy formulations. However, evaluating the safety of LLM-based agents remains a complex challenge. This paper carefully constructs a series of manual jailbreak prompts together with a virtual chat-powered evil plan development team, dubbed Evil Geniuses, to thoroughly probe the safety of these agents. Our investigation reveals three notable phenomena: 1) LLM-based agents exhibit reduced robustness against malicious attacks; 2) the attacked agents can provide more nuanced responses; and 3) the improper responses they produce are more difficult to detect. These insights prompt us to question the effectiveness of LLM-based attacks on agents, highlighting vulnerabilities at various levels and within different role specializations of LLM-based agent systems. Extensive evaluation and discussion reveal that LLM-based agents face significant challenges in safety and yield insights for future research. Our code is available at https://github.com/T1aNS1R/Evil-Geniuses.
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Deepparse : An Extendable, and Fine-Tunable State-Of-The-Art Library for + Parsing Multinational Street Addresses EMNLP 2024 + + +
+ Segmenting an address into meaningful components, also known as address parsing, is an essential step in many applications from record linkage to geocoding and package delivery. Consequently, a lot of work has been dedicated to developing accurate address parsing techniques, with machine learning and neural network methods leading the state-of-the-art scoreboard. However, most of the work on address parsing has been confined to academic endeavours, with little availability of free and easy-to-use open-source solutions. This paper presents Deepparse, an open-source, extendable, fine-tunable Python address parsing solution under the LGPL-3.0 licence that parses multinational addresses using state-of-the-art deep learning algorithms, evaluated on over 60 countries. It can parse addresses written in any language and using any address standard. The pre-trained model achieves an average parsing accuracy of $99~\%$ on the countries used for training, with no pre-processing or post-processing needed. Moreover, the library supports fine-tuning with new data to generate a custom address parser.
+
+ comment: Accepted in EMNLP 2024 NLP-OSS workshop. arXiv admin note: text + overlap with arXiv:2006.16152, arXiv:2112.04008 +
+
+
+
+
+ + ☆ How to Use Large Language Models for Text Coding: The Case of Fatherhood + Roles in Public Policy Documents + + +
+ Recent advances in large language models (LLMs) like GPT-3 and GPT-4 have +opened up new opportunities for text analysis in political science. They +promise automation with better results and less programming. In this study, we +evaluate LLMs on three original coding tasks of non-English political science +texts, and we provide a detailed description of a general workflow for using +LLMs for text coding in political science research. Our use case offers a +practical guide for researchers looking to incorporate LLMs into their research +on text analysis. We find that, when provided with detailed label definitions +and coding examples, an LLM can be as good as or even better than a human +annotator while being much faster (up to hundreds of times), considerably +cheaper (costing up to 60% less than human coding), and much easier to scale to +large amounts of text. Overall, LLMs present a viable option for most text +coding projects. + +
+
+
+
+
+ + ☆ System 2 Attention (is something you might need too) + + +
+ Soft attention in Transformer-based Large Language Models (LLMs) is susceptible to incorporating irrelevant information from the context into its latent representations, which adversely affects next-token generation. To help rectify these issues, we introduce System 2 Attention (S2A), which leverages the ability of LLMs to reason in natural language and follow instructions in order to decide what to attend to. S2A regenerates the input context to include only the relevant portions, before attending to the regenerated context to elicit the final response. In experiments, S2A outperforms standard attention-based LLMs on three tasks containing opinion or irrelevant information: QA, math word problems, and longform generation. S2A increases factuality and objectivity, and decreases sycophancy.
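The two-step procedure can be sketched with two calls to any instruction-following model: first rewrite the context so that only query-relevant, objective material remains, then answer from the rewrite. `llm` below is a hypothetical text-in/text-out completion function, and the prompts are illustrative rather than the paper's.

def system2_attention(llm, context: str, question: str) -> str:
    rewrite_prompt = (
        "Extract only the parts of the following text that are relevant and "
        "objective for answering the question; drop opinions and distractors.\n"
        f"Text: {context}\nQuestion: {question}\nRelevant text:"
    )
    filtered_context = llm(rewrite_prompt)       # step 1: regenerate the context
    answer_prompt = f"Context: {filtered_context}\nQuestion: {question}\nAnswer:"
    return llm(answer_prompt)                    # step 2: attend only to the rewrite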
+
+
+
+
+ + ☆ Efficient Grammatical Error Correction Via Multi-Task Training and + Optimized Training Schedule EMNLP 2023 + + +
+ Progress in neural grammatical error correction (GEC) is hindered by the lack +of annotated training data. Sufficient amounts of high-quality manually +annotated data are not available, so recent research has relied on generating +synthetic data, pretraining on it, and then fine-tuning on real datasets; +performance gains have been achieved either by ensembling or by using huge +pretrained models such as XXL-T5 as the backbone. In this work, we explore an +orthogonal direction: how to use available data more efficiently. First, we +propose auxiliary tasks that exploit the alignment between the original and +corrected sentences, such as predicting a sequence of corrections. We formulate +each task as a sequence-to-sequence problem and perform multi-task training. +Second, we discover that the order of datasets used for training and even +individual instances within a dataset may have important effects on the final +performance, so we set out to find the best training schedule. Together, these +two ideas lead to significant improvements, producing results that improve +state of the art with much smaller models; in particular, we outperform the +best models based on T5-XXL (11B parameters) with a BART-based model (400M +parameters). + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ☆ Igniting Language Intelligence: The Hitchhiker's Guide From + Chain-of-Thought Reasoning to Language Agents + + +
+ Large language models (LLMs) have dramatically enhanced the field of language +intelligence, as demonstrably evidenced by their formidable empirical +performance across a spectrum of complex reasoning tasks. Additionally, +theoretical proofs have illuminated their emergent reasoning capabilities, +providing a compelling showcase of their advanced cognitive abilities in +linguistic contexts. Critical to their remarkable efficacy in handling complex +reasoning tasks, LLMs leverage the intriguing chain-of-thought (CoT) reasoning +techniques, obliging them to formulate intermediate steps en route to deriving +an answer. The CoT reasoning approach has not only exhibited proficiency in +amplifying reasoning performance but also in enhancing interpretability, +controllability, and flexibility. In light of these merits, recent research +endeavors have extended CoT reasoning methodologies to nurture the development +of autonomous language agents, which adeptly adhere to language instructions +and execute actions within varied environments. This survey paper orchestrates +a thorough discourse, penetrating vital research dimensions, encompassing: (i) +the foundational mechanics of CoT techniques, with a focus on elucidating the +circumstances and justification behind its efficacy; (ii) the paradigm shift in +CoT; and (iii) the burgeoning of language agents fortified by CoT approaches. +Prospective research avenues envelop explorations into generalization, +efficiency, customization, scaling, and safety. This paper caters to a wide +audience, including beginners seeking comprehensive knowledge of CoT reasoning +and language agents, as well as experienced researchers interested in +foundational mechanics and engaging in cutting-edge discussions on these +topics. A repository for the related papers is available at +https://github.com/Zoeyyao27/CoT-Igniting-Agent. + +
+
+
+
+
+ + ☆ Beyond Boundaries: A Comprehensive Survey of Transferable Attacks on AI + Systems + + +
+ Artificial Intelligence (AI) systems such as autonomous vehicles, facial +recognition, and speech recognition systems are increasingly integrated into +our daily lives. However, despite their utility, these AI systems are +vulnerable to a wide range of attacks such as adversarial, backdoor, data +poisoning, membership inference, model inversion, and model stealing attacks. +In particular, numerous attacks are designed to target a particular model or +system, yet their effects can spread to additional targets, referred to as +transferable attacks. Although considerable efforts have been directed toward +developing transferable attacks, a holistic understanding of the advancements +in transferable attacks remains elusive. In this paper, we comprehensively +explore learning-based attacks from the perspective of transferability, +particularly within the context of cyber-physical security. We delve into +different domains -- the image, text, graph, audio, and video domains -- to +highlight the ubiquitous and pervasive nature of transferable attacks. This +paper categorizes and reviews the architecture of existing attacks from various +viewpoints: data, process, model, and system. We further examine the +implications of transferable attacks in practical scenarios such as autonomous +driving, speech recognition, and large language models (LLMs). Additionally, we +outline the potential research directions to encourage efforts in exploring the +landscape of transferable attacks. This survey offers a holistic understanding +of the prevailing transferable attacks and their impacts across different +domains. + +
+
+
+
+
+ + ☆ Encoding Speaker-Specific Latent Speech Feature for Speech Synthesis + + +
+ In this work, we propose a novel method for modeling numerous speakers, which +enables expressing the overall characteristics of speakers in detail like a +trained multi-speaker model without additional training on the target speaker's +dataset. Although various works with similar purposes have been actively +studied, their performance has not yet reached that of trained multi-speaker +models due to their fundamental limitations. To overcome previous limitations, +we propose effective methods for feature learning and representing target +speakers' speech characteristics by discretizing the features and conditioning +them to a speech synthesis model. Our method obtained a significantly higher +similarity mean opinion score (SMOS) in subjective similarity evaluation than +seen speakers of a best-performing multi-speaker model, even with unseen +speakers. The proposed method also outperforms a zero-shot method by +significant margins. Furthermore, our method shows remarkable performance in +generating new artificial speakers. In addition, we demonstrate that the +encoded latent features are sufficiently informative to reconstruct an original +speaker's speech completely. It implies that our method can be used as a +general methodology to encode and reconstruct speakers' characteristics in +various tasks. + +
+
+
+
+
+ + ☆ Control in Hybrid Chatbots + + +
+ Customer data is typically held in database systems, which can be seen as rule-based knowledge bases, whereas businesses increasingly want to benefit from the capabilities of large, pre-trained language models. In this technical report, we describe a case study of how a commercial rule engine and a neural chatbot may be integrated, and what level of control that particular integration mode leads to. We also discuss alternative ways (including approaches realized in other systems) in which researchers strive to maintain control and avoid what has recently been called model "hallucination".
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ☆ Sparse Low-rank Adaptation of Pre-trained Language Models EMNLP 2023 + + +
+ Fine-tuning pre-trained large language models in a parameter-efficient manner +is widely studied for its effectiveness and efficiency. The popular method of +low-rank adaptation (LoRA) offers a notable approach, hypothesizing that the +adaptation process is intrinsically low-dimensional. Although LoRA has +demonstrated commendable performance, it is implemented with a fixed and +unalterable intrinsic rank that might not always be the ideal choice. +Recognizing the need for more flexible adaptation, we extend the methodology of +LoRA to an innovative approach we call sparse low-rank adaptation (SoRA) that +enables dynamic adjustments to the intrinsic rank during the adaptation +process. We achieve this through the incorporation of a gate unit optimized +with proximal gradient method in the training stage, controlling the +cardinality of rank under the sparsity of the gate. In the subsequent inference +stage, we eliminate the parameter blocks corresponding to the zeroed-out ranks, +to reduce each SoRA module back to a concise yet rank-optimal LoRA. Our +approach strengthens the representation power of LoRA by initializing it with a +higher rank, while efficiently taming a temporarily increased number of +parameters via updating in a sparse way. We further introduce a sparsifying +scheduler for SoRA, aiming to examine the impact of the number of non-zero +parameters on the model's memorization and generalization. Our experimental +results demonstrate that SoRA can outperform other baselines even with 70% +retained parameters and 70% training time. + +
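The gating idea can be pictured as a LoRA update of the form B·diag(g)·A whose gate g is driven to exact zeros by a proximal (soft-thresholding) step, so that zeroed entries prune ranks. The sketch below is a conceptual, numpy-only illustration under assumed dimensions and a placeholder gradient; it is not the authors' implementation.

import numpy as np

def soft_threshold(g, lam):
    # proximal operator of lam * ||g||_1: shrinks small entries to exactly zero
    return np.sign(g) * np.maximum(np.abs(g) - lam, 0.0)

d, r = 16, 8                                  # hidden size and initial (maximal) rank
rng = np.random.default_rng(0)
A = rng.normal(size=(r, d)) * 0.01
B = np.zeros((d, r))
gate = np.ones(r)

def adapted_weight(W0):
    # zeroed gate entries remove the corresponding rank-1 components
    return W0 + B @ np.diag(gate) @ A

# inside the training loop, after the usual gradient step on `gate`:
lr, lam = 1e-2, 1e-3
grad_gate = rng.normal(size=r)                # placeholder for the real gradient
gate = soft_threshold(gate - lr * grad_gate, lr * lam)
effective_rank = int(np.count_nonzero(gate))  # ranks kept after sparsification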
+
+ comment: Accepted to EMNLP 2023 (Main Conference) +
+
+
+
+
+ + ☆ Refactoring Programs Using Large Language Models with Few-Shot Examples + + +
+ Less complex and more straightforward programs are easier to maintain and make it easier to write secure and bug-free code. However, because of the heavy workload involved and the risk of breaking working programs, programmers are reluctant to refactor their code, which also deprives them of potential learning experiences. To mitigate this, we demonstrate the application of a large language model (LLM), GPT-3.5, to suggest less complex versions of user-written Python programs, aiming to encourage users to learn how to write better programs. We propose a method that leverages few-shot prompting of the LLM by selecting the best-suited code refactoring examples for each target programming problem, based on a prior evaluation of prompting with a one-shot example. The quantitative evaluation shows that 95.68% of programs can be refactored by generating 10 candidates each, resulting in a 17.35% reduction in the average cyclomatic complexity and a 25.84% decrease in the average number of lines after filtering only generated programs that are semantically correct. Furthermore, the qualitative evaluation shows outstanding capability in code formatting, while unnecessary behaviors such as deleting or translating comments are also observed.
+
+ comment: 10 pages, 10 figures, accepted to the 30th Asia-Pacific Software + Engineering Conference (APSEC 2023) +
+
+
+
+
+ + ☆ Taiyi: A Bilingual Fine-Tuned Large Language Model for Diverse + Biomedical Tasks + + +
+ Recent advancements in large language models (LLMs) have shown promising results across a variety of natural language processing (NLP) tasks. The application of LLMs to specific domains, such as biomedicine, has attracted increasing attention. However, most biomedical LLMs focus on enhancing performance in monolingual biomedical question answering and conversation tasks. To further investigate the effectiveness of LLMs on diverse biomedical NLP tasks in different languages, we present Taiyi, a bilingual (English and Chinese) fine-tuned LLM for diverse biomedical tasks. In this work, we first curated a comprehensive collection of 140 existing biomedical text mining datasets across over 10 task types. Subsequently, a two-stage strategy is proposed for supervised fine-tuning to optimize the model performance across varied tasks. Experimental results on 13 test sets covering named entity recognition, relation extraction, text classification, and question answering tasks demonstrate that Taiyi achieves superior performance compared to general LLMs. A case study involving additional biomedical NLP tasks further shows Taiyi's considerable potential for bilingual biomedical multi-tasking. The source code, datasets, and model for Taiyi are freely available at https://github.com/DUTIR-BioNLP/Taiyi-LLM.
+
+
+
+
+ + ☆ Addressing the Length Bias Problem in Document-Level Neural Machine + Translation EMNLP2023 + + +
+ Document-level neural machine translation (DNMT) has shown promising results by incorporating more context information. However, this approach also introduces a length bias problem, whereby DNMT suffers from significant translation quality degradation when decoding documents that are much shorter or longer than the maximum sequence length seen during training. To solve the length bias problem, we propose improvements to the DNMT model's training method, attention mechanism, and decoding strategy. Firstly, we propose to sample the training data dynamically to ensure a more uniform distribution across different sequence lengths. Then, we introduce a length-normalized attention mechanism to aid the model in focusing on target information, mitigating the issue of attention divergence when processing longer sequences. Lastly, we propose a sliding window strategy during decoding that integrates as much context information as possible without exceeding the maximum sequence length. The experimental results indicate that our method brings significant improvements on several open datasets, and further analysis shows that it significantly alleviates the length bias problem.
+
+ comment: Accepted by EMNLP2023 Findings +
+
+
+
+
+ + ☆ Filling the Image Information Gap for VQA: Prompting Large Language + Models to Proactively Ask Questions EMNLP2023 + + +
+ Large Language Models (LLMs) demonstrate impressive reasoning ability and retention of world knowledge not only in natural language tasks, but also in some vision-language tasks such as open-domain knowledge-based visual question answering (OK-VQA). As images are invisible to LLMs, researchers convert images to text to engage LLMs in the visual question reasoning procedure. This leads to discrepancies between images and their textual representations presented to LLMs, which consequently impedes final reasoning performance. To fill the information gap and better leverage the reasoning capability, we design a framework that enables LLMs to proactively ask relevant questions to unveil more details in the image, along with filters for refining the generated information. We validate our idea on OK-VQA and A-OKVQA. Our method consistently boosts the performance of baseline methods by an average gain of 2.15% on OK-VQA, and achieves improvements across different LLMs.
+
+ comment: Accepted to EMNLP2023 Findings +
+
+
+
+
+ + ☆ How well ChatGPT understand Malaysian English? An Evaluation on Named + Entity Recognition and Relation Extraction EMNLP + 2023 + + +
+ Recently, ChatGPT has attracted a lot of interest from both researchers and the general public. While the performance of ChatGPT in named entity recognition and relation extraction from Standard English texts is satisfactory, it remains to be seen whether it can perform similarly for Malaysian English. Malaysian English is unique as it exhibits morphosyntactic and semantic adaptation from local contexts. In this study, we assess ChatGPT's capability in extracting entities and relations from the Malaysian English News (MEN) dataset. We propose a three-step methodology referred to as \textbf{\textit{educate-predict-evaluate}}. The performance of ChatGPT is assessed using F1-Score across 18 unique prompt settings, which were carefully engineered for a comprehensive review. From our evaluation, we found that ChatGPT does not perform well in extracting entities from Malaysian English news articles, with the highest F1-Score being 0.497. Further analysis shows that the morphosyntactic adaptation in Malaysian English causes this limitation. Interestingly, however, this morphosyntactic adaptation does not impact the performance of ChatGPT on relation extraction.
+
+ comment: Accepted in Generation, Evaluation & Metrics (GEM) Workshop at EMNLP + 2023 +
+
+
+
+
+ + ☆ KBioXLM: A Knowledge-anchored Biomedical Multilingual Pretrained + Language Model + + +
+ Most biomedical pretrained language models are monolingual and cannot handle the growing cross-lingual requirements. The scarcity of non-English domain corpora, not to mention parallel data, poses a significant hurdle in training multilingual biomedical models. Since knowledge forms the core of domain-specific corpora and can be translated into various languages accurately, we propose a model called KBioXLM, which transforms the multilingual pretrained model XLM-R into the biomedical domain using a knowledge-anchored approach. We construct a biomedical multilingual corpus by incorporating knowledge alignments at three granularities (entity, fact, and passage level) into monolingual corpora. Then we design three corresponding training tasks (entity masking, relation masking, and passage relation prediction) and continue training on top of the XLM-R model to enhance its cross-lingual ability in the domain. To validate the effectiveness of our model, we translate the English benchmarks of multiple tasks into Chinese. Experimental results demonstrate that our model significantly outperforms monolingual and multilingual pretrained models in cross-lingual zero-shot and few-shot scenarios, achieving improvements of up to 10+ points. Our code is publicly available at https://github.com/ngwlh-gl/KBioXLM.
+
+
+
+
+ + ☆ Exploring Prompting Large Language Models as Explainable Metrics + + +
+ This paper describes the IUST NLP Lab submission to the Prompting Large +Language Models as Explainable Metrics Shared Task at the Eval4NLP 2023 +Workshop on Evaluation & Comparison of NLP Systems. We have proposed a +zero-shot prompt-based strategy for explainable evaluation of the summarization +task using Large Language Models (LLMs). The conducted experiments demonstrate +the promising potential of LLMs as evaluation metrics in Natural Language +Processing (NLP), particularly in the field of summarization. Both few-shot and +zero-shot approaches are employed in these experiments. The performance of our +best provided prompts achieved a Kendall correlation of 0.477 with human +evaluations in the text summarization task on the test data. Code and results +are publicly available on GitHub. + +
+
+ comment: 9 pages, Eval4NLP 2023 +
+
+
+
+
+ + ☆ Adapt in Contexts: Retrieval-Augmented Domain Adaptation via In-Context + Learning EMNLP 2023 + + +
+ Large language models (LLMs) have showcased their capability for few-shot inference, known as in-context learning. However, in-domain demonstrations are not always readily available in real scenarios, leading to cross-domain in-context learning. Moreover, LLMs still face challenges with long-tail knowledge in unseen and unfamiliar domains. The above limitations demonstrate the necessity of Unsupervised Domain Adaptation (UDA). In this paper, we study the UDA problem under an in-context learning setting to adapt language models from the source domain to the target domain without any target labels. The core idea is to retrieve a subset of cross-domain elements that are the most similar to the query, and elicit the language model to adapt in an in-context manner by learning both the target domain distribution and the discriminative task signal simultaneously with the augmented cross-domain in-context examples. We devise different prompting and training strategies, accounting for different LM architectures, to learn the target distribution via language modeling. With extensive experiments on Sentiment Analysis (SA) and Named Entity Recognition (NER) tasks, we thoroughly study the effectiveness of ICL for domain transfer and demonstrate significant improvements over baseline models.
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ☆ Multi-teacher Distillation for Multilingual Spelling Correction + + +
+ Accurate spelling correction is a critical step in modern search interfaces, especially in an era of mobile devices and speech-to-text interfaces. For services that are deployed around the world, this poses a significant challenge for multilingual NLP: spelling errors need to be caught and corrected in all languages, and even in queries that use multiple languages. In this paper, we tackle this challenge using multi-teacher distillation. In our approach, a monolingual teacher model is trained for each language/locale, and these individual models are distilled into a single multilingual student model intended to serve all languages/locales. In experiments using open-source data as well as user data from a worldwide search service, we show that this leads to highly effective spelling correction models that can meet the tight latency requirements of deployed services.
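Schematically, the distillation objective routes each training example to the teacher for its locale and trains the single student to match that teacher's output distribution. The numpy sketch below shows one way to write such a loss; the `teachers[locale]` and `student` callables returning logits are hypothetical stand-ins, not the system described in the paper.

import numpy as np

def softmax(z):
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def distillation_loss(batch, teachers, student, temperature=2.0):
    # batch: iterable of (example, locale) pairs
    losses = []
    for example, locale in batch:
        p_teacher = softmax(teachers[locale](example) / temperature)
        log_p_student = np.log(softmax(student(example) / temperature) + 1e-12)
        losses.append(-(p_teacher * log_p_student).sum())   # cross-entropy to the matching teacher
    return float(np.mean(losses))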
+
+
+
+
+ + ☆ GPT in Data Science: A Practical Exploration of Model Selection + + +
+ There is an increasing interest in leveraging Large Language Models (LLMs) +for managing structured data and enhancing data science processes. Despite the +potential benefits, this integration poses significant questions regarding +their reliability and decision-making methodologies. It highlights the +importance of various factors in the model selection process, including the +nature of the data, problem type, performance metrics, computational resources, +interpretability vs accuracy, assumptions about data, and ethical +considerations. Our objective is to elucidate and express the factors and +assumptions guiding GPT-4's model selection recommendations. We employ a +variability model to depict these factors and use toy datasets to evaluate both +the model and the implementation of the identified heuristics. By contrasting +these outcomes with heuristics from other platforms, our aim is to determine +the effectiveness and distinctiveness of GPT-4's methodology. This research is +committed to advancing our comprehension of AI decision-making processes, +especially in the realm of model selection within data science. Our efforts are +directed towards creating AI systems that are more transparent and +comprehensible, contributing to a more responsible and efficient practice in +data science. + +
+
+ comment: 11 pages. To appear in IEEE BigData 2023 +
+
+
+
+
+ + ☆ Token-Level Adversarial Prompt Detection Based on Perplexity Measures + and Contextual Information + + +
+ In recent years, Large Language Models (LLM) have emerged as pivotal tools in +various applications. However, these models are susceptible to adversarial +prompt attacks, where attackers can carefully curate input strings that lead to +undesirable outputs. The inherent vulnerability of LLMs stems from their +input-output mechanisms, especially when presented with intensely +out-of-distribution (OOD) inputs. This paper proposes a token-level detection +method to identify adversarial prompts, leveraging the LLM's capability to +predict the next token's probability. We measure the degree of the model's +perplexity and incorporate neighboring token information to encourage the +detection of contiguous adversarial prompt sequences. As a result, we propose +two methods: one that identifies each token as either being part of an +adversarial prompt or not, and another that estimates the probability of each +token being part of an adversarial prompt. + +
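The detection idea reduces to thresholding per-token negative log-likelihoods after smoothing them over a small neighborhood, so that contiguous high-perplexity spans stand out. The sketch below assumes the per-token NLLs have already been computed with some causal LM; the numbers in the example are made up for illustration.

import numpy as np

def flag_adversarial_tokens(nlls, window=5, threshold=6.0):
    # nlls: one negative log-likelihood per token, from any causal LM
    nlls = np.asarray(nlls, dtype=float)
    kernel = np.ones(window) / window
    smoothed = np.convolve(nlls, kernel, mode="same")   # incorporate neighboring tokens
    return smoothed > threshold                          # True = token likely adversarial

# Example: a fluent prefix followed by a high-perplexity suffix.
nlls = [2.1, 1.8, 2.4, 2.0, 9.5, 11.2, 10.8, 12.0]
flags = flag_adversarial_tokens(nlls, window=3, threshold=6.0)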
+
+
+
+
+ + ☆ Meta Prompting for AGI Systems + + +
+ This paper presents an in-depth exploration of Meta Prompting, a novel +technique that revolutionizes the way large language models (LLMs), multi-modal +foundation models, and AI systems approach problem-solving and data +interpretation. Meta Prompting, rooted in type theory and category theory, +prioritizes the structure and syntax of information, providing a unique +framework that transcends traditional content-focused methods. We delve into +the formal definitions of Meta Prompting, contrasting it with Few-Shot +Prompting, and highlight its applicability and superiority in various AI +applications. + Key to this exploration is the expansion of Meta Prompting into the realm of +complex reasoning. Here, we demonstrate how this technique adeptly breaks down +intricate problems into manageable sub-problems, facilitating a step-by-step, +detailed approach to problem-solving. This method proves especially +advantageous in terms of token efficiency and offering a fair comparison in +problem-solving scenarios, standing out against few-shot example approaches. + Furthermore, the paper breaks new ground by extending Meta Prompting into +multi-modal foundation model settings. This extension addresses the integration +of diverse data types, such as images, audio, and video, within the structured +framework of Meta Prompting, highlighting both the challenges and the vast +potential of this approach in handling complex, multi-faceted data (The code is +available at https://github.com/meta-prompting/meta-prompting). + +
+
+
+
+
+ + ☆ What's left can't be right -- The remaining positional incompetence of + contrastive vision-language models + + +
+ Contrastive vision-language models like CLIP have been found to lack spatial +understanding capabilities. In this paper we discuss the possible causes of +this phenomenon by analysing both datasets and embedding space. By focusing on +simple left-right positional relations, we show that this behaviour is entirely +predictable, even with large-scale datasets, demonstrate that these relations +can be taught using synthetic data and show that this approach can generalise +well to natural images - improving the performance on left-right relations on +Visual Genome Relations. + +
+
+
+
+
+ + ☆ Unifying Corroborative and Contributive Attributions in Large Language + Models NeurIPS + + +
+ As businesses, products, and services spring up around large language models, +the trustworthiness of these models hinges on the verifiability of their +outputs. However, methods for explaining language model outputs largely fall +across two distinct fields of study which both use the term "attribution" to +refer to entirely separate techniques: citation generation and training data +attribution. In many modern applications, such as legal document generation and +medical question answering, both types of attributions are important. In this +work, we argue for and present a unified framework of large language model +attributions. We show how existing methods of different types of attribution +fall under the unified framework. We also use the framework to discuss +real-world use cases where one or both types of attributions are required. We +believe that this unified framework will guide the use case driven development +of systems that leverage both types of attribution, as well as the +standardization of their evaluation. + +
+
+ comment: NeurIPS ATTRIB Workshop 2023 +
+
+
+
+
+ + ☆ Leveraging Closed-Access Multilingual Embedding for Automatic Sentence + Alignment in Low Resource Languages + + +
+ The importance of high-quality parallel data in machine translation has long been established, but it has always been very difficult to obtain it in sufficient quantity for the majority of the world's languages, mainly because of the associated cost and the lack of accessibility to these languages. Despite the potential for obtaining parallel datasets from online articles using automatic approaches, forensic investigations have found many quality-related issues, such as misalignment and wrong language codes. In this work, we present a simple but high-quality parallel sentence aligner that carefully leverages the closed-access Cohere multilingual embedding, a solution that ranked second in the just concluded #CoHereAIHack 2023 Challenge (see https://ai6lagos.devpost.com). The proposed approach achieved F1 scores of $94.96$ and $54.83$ on FLORES and MAFAND-MT, compared to $3.64$ and $0.64$ for LASER, respectively. Our method also achieved an improvement of more than 5 BLEU points over LASER when the resulting datasets were used with the MAFAND-MT dataset to train translation models. Our code and data are available for research purposes here (https://github.com/abumafrim/Cohere-Align).
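A simplified sketch of the embedding-based alignment step, assuming a hypothetical `embed` function (returning a 2-D numpy array) standing in for the Cohere multilingual encoder; real margin-based scoring and filtering would refine this greedy cosine matching.

import numpy as np

def align(src_sentences, tgt_sentences, embed, threshold=0.7):
    src = embed(src_sentences)                              # shape (n_src, d)
    tgt = embed(tgt_sentences)                              # shape (n_tgt, d)
    src = src / np.linalg.norm(src, axis=1, keepdims=True)
    tgt = tgt / np.linalg.norm(tgt, axis=1, keepdims=True)
    sim = src @ tgt.T                                       # cosine similarity matrix
    pairs = []
    for i, row in enumerate(sim):
        j = int(row.argmax())
        if row[j] >= threshold:                             # keep only confident matches
            pairs.append((src_sentences[i], tgt_sentences[j], float(row[j])))
    return pairs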
+
+ comment: To appear in the proceedings of ICCAIT 2023. 6 pages, 2 figures +
+
+
+
+
+ + ☆ Human Learning by Model Feedback: The Dynamics of Iterative Prompting + with Midjourney EMNLP23 + + +
+ Generating images with a Text-to-Image model often requires multiple trials, +where human users iteratively update their prompt based on feedback, namely the +output image. Taking inspiration from cognitive work on reference games and +dialogue alignment, this paper analyzes the dynamics of the user prompts along +such iterations. We compile a dataset of iterative interactions of human users +with Midjourney. Our analysis then reveals that prompts predictably converge +toward specific traits along these iterations. We further study whether this +convergence is due to human users, realizing they missed important details, or +due to adaptation to the model's ``preferences'', producing better images for a +specific language style. We show initial evidence that both possibilities are +at play. The possibility that users adapt to the model's preference raises +concerns about reusing user data for further training. The prompts may be +biased towards the preferences of a specific model, rather than align with +human intentions and natural manner of expression. + +
+
+ comment: EMNLP23 +
+
+
+
+
+ + ♻ ☆ Open-Ended Instructable Embodied Agents with Memory-Augmented Large + Language Models + + +
+ Pre-trained and frozen large language models (LLMs) can effectively map +simple scene rearrangement instructions to programs over a robot's visuomotor +functions through appropriate few-shot example prompting. To parse open-domain +natural language and adapt to a user's idiosyncratic procedures, not known +during prompt engineering time, fixed prompts fall short. In this paper, we +introduce HELPER, an embodied agent equipped with an external memory of +language-program pairs that parses free-form human-robot dialogue into action +programs through retrieval-augmented LLM prompting: relevant memories are +retrieved based on the current dialogue, instruction, correction, or VLM +description, and used as in-context prompt examples for LLM querying. The +memory is expanded during deployment to include pairs of user's language and +action plans, to assist future inferences and personalize them to the user's +language and routines. HELPER sets a new state-of-the-art in the TEACh +benchmark in both Execution from Dialog History (EDH) and Trajectory from +Dialogue (TfD), with a 1.7x improvement over the previous state-of-the-art for +TfD. Our models, code, and video results can be found in our project's website: +https://helper-agent-llm.github.io. + +
+
+ comment: Project page with code & videos: https://helper-agent-llm.github.io +
+
+
+
+
+ + ♻ ☆ A Language and Its Dimensions: Intrinsic Dimensions of Language Fractal + Structures + + +
+ The present paper introduces a novel object of study - a language fractal structure. We hypothesize that a set of embeddings of all $n$-grams of a natural language constitutes a representative sample of this fractal set. (We use the term Hailonakea to refer to the sum total of all language fractal structures, over all $n$.) The paper estimates the intrinsic (genuine) dimensions of language fractal structures for the Russian and English languages. To this end, we employ methods based on (1) topological data analysis and (2) a minimum spanning tree of a data graph for the cloud of points considered (Steele theorem). For both languages and for all $n$, the intrinsic dimensions appear to be non-integer values (typical of fractal sets), close to 9 for both Russian and English.
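A minimal version of the minimum-spanning-tree estimator in the spirit of the Steele-theorem approach mentioned above: since the total MST length scales roughly as n^((d-1)/d), the slope of log(length) against log(n) yields an estimate of the intrinsic dimension d. The embeddings below are random placeholders, not actual n-gram embeddings.

import numpy as np
from scipy.spatial import distance_matrix
from scipy.sparse.csgraph import minimum_spanning_tree

def mst_length(points):
    # total edge length of the minimum spanning tree over the point cloud
    return minimum_spanning_tree(distance_matrix(points, points)).sum()

def intrinsic_dimension(points, sizes=(200, 400, 800, 1600)):
    rng = np.random.default_rng(0)
    logs_n, logs_len = [], []
    for n in sizes:
        sample = points[rng.choice(len(points), size=n, replace=False)]
        logs_n.append(np.log(n))
        logs_len.append(np.log(mst_length(sample)))
    slope = np.polyfit(logs_n, logs_len, 1)[0]   # approximately (d - 1) / d
    return 1.0 / (1.0 - slope)

embeddings = np.random.default_rng(1).normal(size=(2000, 300))   # placeholder data
print(intrinsic_dimension(embeddings))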
+
+ comment: Preprint. Under review +
+
+
+
+
+ + ♻ ☆ Language Varieties of Italy: Technology Challenges and Opportunities ACL + + +
+ Italy is characterized by a one-of-a-kind linguistic diversity landscape in +Europe, which implicitly encodes local knowledge, cultural traditions, artistic +expressions and history of its speakers. However, most local languages and +dialects in Italy are at risk of disappearing within few generations. The NLP +community has recently begun to engage with endangered languages, including +those of Italy. Yet, most efforts assume that these varieties are +under-resourced language monoliths with an established written form and +homogeneous functions and needs, and thus highly interchangeable with each +other and with high-resource, standardized languages. In this paper, we +introduce the linguistic context of Italy and challenge the default +machine-centric assumptions of NLP for Italy's language varieties. We advocate +for a shift in the paradigm from machine-centric to speaker-centric NLP, and +provide recommendations and opportunities for work that prioritizes languages +and their speakers over technological advances. To facilitate the process, we +finally propose building a local community towards responsible, participatory +efforts aimed at supporting vitality of languages and dialects of Italy. + +
+
+ comment: Accepted to TACL. This arXiv version is a pre-MIT Press publication + version +
+
+
+
+
+ + ♻ ☆ Unsupervised Opinion Summarization Using Approximate Geodesics EMNLP 2023 + + +
+ Opinion summarization is the task of creating summaries capturing popular opinions from user reviews. In this paper, we introduce Geodesic Summarizer (GeoSumm), a novel system to perform unsupervised extractive opinion summarization. GeoSumm involves an encoder-decoder based representation learning model that generates representations of text as a distribution over latent semantic units. GeoSumm generates these representations by performing dictionary learning over pre-trained text representations at multiple decoder layers. We then use these representations to quantify the relevance of review sentences using a novel approximate geodesic distance based scoring mechanism. We use the relevance scores to identify popular opinions in order to compose general and aspect-specific summaries. Our proposed model, GeoSumm, achieves state-of-the-art performance on three opinion summarization datasets. We perform additional experiments to analyze the functioning of our model and showcase the generalization ability of GeoSumm across different domains.
+
+ comment: Findings of EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ MEAL: Stable and Active Learning for Few-Shot Prompting EMNLP 2023 + + +
+ Few-shot classification has made great strides due to foundation models that, +through priming and prompting, are highly effective few-shot learners. However, +this approach has high variance both across different sets of few shots (data +selection) and across different finetuning runs (run variability). This is +problematic not only because it impedes the fair comparison of different +approaches, but especially because it makes few-shot learning too unreliable +for many real-world applications. To alleviate these issues, we make two +contributions for more stable and effective few-shot learning: First, we +propose novel ensembling methods and show that they substantially reduce run +variability. Second, we introduce a new active learning (AL) criterion for data +selection and present the first AL-based approach specifically tailored towards +prompt-based learning. In our experiments, we show that our combined method, +MEAL (Multiprompt finetuning and prediction Ensembling with Active Learning), +improves overall performance of prompt-based finetuning by 2.3 points on five +diverse tasks. We publicly share our code and data splits in +https://github.com/akoksal/MEAL. + +
+
+ comment: EMNLP 2023 Findings +
+
+
+
+
+ + ♻ ☆ Language-Agnostic Bias Detection in Language Models with Bias Probing EMNLP 2023 + + +
+ Pretrained language models (PLMs) are key components in NLP, but they contain +strong social biases. Quantifying these biases is challenging because current +methods focusing on fill-the-mask objectives are sensitive to slight changes in +input. To address this, we propose a bias probing technique called LABDet, for +evaluating social bias in PLMs with a robust and language-agnostic method. For +nationality as a case study, we show that LABDet `surfaces' nationality bias by +training a classifier on top of a frozen PLM on non-nationality sentiment +detection. We find consistent patterns of nationality bias across monolingual +PLMs in six languages that align with historical and political context. We also +show for English BERT that bias surfaced by LABDet correlates well with bias in +the pretraining data; thus, our work is one of the few studies that directly +links pretraining data to PLM behavior. Finally, we verify LABDet's reliability +and applicability to different templates and languages through an extensive set +of robustness checks. We publicly share our code and dataset in +https://github.com/akoksal/LABDet. + +
+
+ comment: EMNLP 2023 Findings +
+
+
+
+
+ + ♻ ☆ Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation + + +
+ Large language models (LLMs) have emerged as a new paradigm for the Text-to-SQL task. However, the absence of a systematic benchmark inhibits the design of effective, efficient and economical LLM-based Text-to-SQL solutions. To address this challenge, in this paper, we first conduct a systematic and extensive comparison of existing prompt engineering methods, including question representation, example selection and example organization, and based on these experimental results we elaborate on their pros and cons. Based on these findings, we propose a new integrated solution, named DAIL-SQL, which refreshes the Spider leaderboard with 86.6% execution accuracy and sets a new bar. To explore the potential of open-source LLMs, we investigate them in various scenarios, and further enhance their performance with supervised fine-tuning. Our explorations highlight open-source LLMs' potential in Text-to-SQL, as well as the advantages and disadvantages of supervised fine-tuning. Additionally, towards an efficient and economical LLM-based Text-to-SQL solution, we emphasize token efficiency in prompt engineering and compare prior studies under this metric. We hope that our work provides a deeper understanding of Text-to-SQL with LLMs, and inspires further investigations and broader applications.
+
+ comment: We have released code on https://github.com/BeachWang/DAIL-SQL +
+
+
+
+
+ + ♻ ☆ A novel approach to measuring patent claim scope based on probabilities + obtained from (large) language models + + +
+ This work proposes to measure the scope of a patent claim as the reciprocal of the self-information contained in this claim. A probability of occurrence of the claim is obtained from a language model and this probability is used to compute the self-information. Grounded in information theory, this approach is based on the assumption that an unlikely concept is more informative than a usual concept, insofar as it is more surprising. In turn, the more surprising the information required to define the claim, the narrower its scope. Five language models are considered, ranging from the simplest models (each word or character is assigned an identical probability) to intermediate models (using average word or character frequencies), to a large language model (GPT2). Interestingly, the scope resulting from the simplest language models is proportional to the reciprocal of the number of words or characters involved in the claim, a metric already used in previous works. Application is made to multiple series of patent claims directed to distinct inventions, where each series consists of claims devised to have a gradually decreasing scope. The performance of the language models is assessed with respect to several ad hoc tests. The more sophisticated the model, the better the results: the GPT2 probability model outperforms models based on word and character frequencies, which themselves outdo the simplest models based on word or character counts. Still, the character count appears to be a more reliable indicator than the word count.
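As a worked example of the metric itself (not the paper's code), the scope can be computed as the reciprocal of self-information under any probability model; with the simplest uniform-over-characters model this reduces to being proportional to 1/length, as the abstract notes.

import math

def self_information_uniform(claim: str, alphabet_size: int = 27) -> float:
    # simplest model: each character is equally likely (26 letters + space),
    # so every character contributes log2(alphabet_size) bits
    return len(claim) * math.log2(alphabet_size)

def claim_scope(claim: str) -> float:
    # scope defined as the reciprocal of the claim's self-information
    return 1.0 / self_information_uniform(claim)

broad = "A chair."
narrow = "A chair with four legs, a reclining backrest and an integrated cup holder."
assert claim_scope(broad) > claim_scope(narrow)   # longer, more specific claim -> narrower scope

# Swapping in log-probabilities from a trained LM (e.g. GPT2) instead of the
# uniform model gives the more sophisticated variants discussed above.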
+
+ comment: 58 pages, 8 tables, 6 figures. Substantial changes made to version 2: + New section 4.1 added (including a new table); Minor normalization issue + corrected in values listed in Appendix B; Content of former appendix C now + moved to Section 3; and new Appendix C added. Minor changes made to version 3 + (style, typos, language) +
+
+
+
+
+ + ♻ ☆ Women Wearing Lipstick: Measuring the Bias Between an Object and Its + Related Gender EMNLP + + +
+ In this paper, we investigate the impact of objects on gender bias in image +captioning systems. Our results show that only gender-specific objects have a +strong gender bias (e.g., women-lipstick). In addition, we propose a visual +semantic-based gender score that measures the degree of bias and can be used as +a plug-in for any image captioning system. Our experiments demonstrate the +utility of the gender score, since we observe that our score can measure the +bias relation between a caption and its related gender; therefore, our score +can be used as an additional metric to the existing Object Gender Co-Occ +approach. Code and data are publicly available at +\url{https://github.com/ahmedssabir/GenderScore}. + +
+
+ comment: EMNLP Findings 2023 +
+
+
+
+
+ + ♻ ☆ Attribution Patching Outperforms Automated Circuit Discovery NeurIPS 2023 + + +
+ Automated interpretability research has recently attracted attention as a potential research direction that could scale explanations of neural network behavior to large models. Existing automated circuit discovery work applies activation patching to identify subnetworks responsible for solving specific tasks (circuits). In this work, we show that a simple method based on attribution patching outperforms all existing methods while requiring just two forward passes and a backward pass. We apply a linear approximation to activation patching to estimate the importance of each edge in the computational subgraph. Using this approximation, we prune the least important edges of the network. We survey the performance and limitations of this method, finding that, averaged over all tasks, our method achieves a greater AUC for circuit recovery than other methods.
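The linear approximation at the heart of attribution patching scores each candidate component as the gradient of the metric at the clean activations times the activation difference between the corrupted and clean runs, which is why only two forward passes and one backward pass are needed. The snippet below shows just this arithmetic with placeholder arrays; caching real activations and gradients from a model (e.g. via hooks) is omitted.

import numpy as np

def attribution_scores(clean_acts, corrupted_acts, grads_wrt_clean):
    # first-order estimate of the effect of patching each component:
    # grad_clean * (corrupted_activation - clean_activation), summed over features
    delta = corrupted_acts - clean_acts
    return (grads_wrt_clean * delta).sum(axis=-1)

rng = np.random.default_rng(0)
clean = rng.normal(size=(12, 64))        # e.g. 12 attention heads x 64-dim outputs (placeholders)
corrupted = rng.normal(size=(12, 64))
grads = rng.normal(size=(12, 64))
scores = attribution_scores(clean, corrupted, grads)
pruned = np.argsort(np.abs(scores))[: len(scores) // 2]   # least important half of the edges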
+
+ comment: 6 main paper pages, 6 additional pages. NeurIPS 2023 ATTRIB Workshop +
+
+
+
+
+ + ♻ ☆ StyleTTS: A Style-Based Generative Model for Natural and Diverse + Text-to-Speech Synthesis + + +
+ Text-to-Speech (TTS) has recently seen great progress in synthesizing +high-quality speech owing to the rapid development of parallel TTS systems, but +producing speech with naturalistic prosodic variations, speaking styles and +emotional tones remains challenging. Moreover, since duration and speech are +generated separately, parallel TTS models still have problems finding the best +monotonic alignments that are crucial for naturalistic speech synthesis. Here, +we propose StyleTTS, a style-based generative model for parallel TTS that can +synthesize diverse speech with natural prosody from a reference speech +utterance. With novel Transferable Monotonic Aligner (TMA) and +duration-invariant data augmentation schemes, our method significantly +outperforms state-of-the-art models on both single and multi-speaker datasets +in subjective tests of speech naturalness and speaker similarity. Through +self-supervised learning of the speaking styles, our model can synthesize +speech with the same prosodic and emotional tone as any given reference speech +without the need for explicitly labeling these categories. + +
+
+
+
+
+ + ♻ ☆ StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion + and Adversarial Training with Large Speech Language Models NeurIPS 2023 + + +
+ In this paper, we present StyleTTS 2, a text-to-speech (TTS) model that +leverages style diffusion and adversarial training with large speech language +models (SLMs) to achieve human-level TTS synthesis. StyleTTS 2 differs from its +predecessor by modeling styles as a latent random variable through diffusion +models to generate the most suitable style for the text without requiring +reference speech, achieving efficient latent diffusion while benefiting from +the diverse speech synthesis offered by diffusion models. Furthermore, we +employ large pre-trained SLMs, such as WavLM, as discriminators with our novel +differentiable duration modeling for end-to-end training, resulting in improved +speech naturalness. StyleTTS 2 surpasses human recordings on the single-speaker +LJSpeech dataset and matches it on the multispeaker VCTK dataset as judged by +native English speakers. Moreover, when trained on the LibriTTS dataset, our +model outperforms previous publicly available models for zero-shot speaker +adaptation. This work achieves the first human-level TTS on both single and +multispeaker datasets, showcasing the potential of style diffusion and +adversarial training with large SLMs. The audio demos and source code are +available at https://styletts2.github.io/. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Transferring Procedural Knowledge across Commonsense Tasks + + +
+ Stories about everyday situations are an essential part of human +communication, motivating the need to develop AI agents that can reliably +understand these stories. Despite the long list of supervised methods for story +completion and procedural understanding, current AI has no mechanisms to +automatically track and explain procedures in unseen stories. To bridge this +gap, we study the ability of AI models to transfer procedural knowledge to +novel narrative tasks in a transparent manner. We design LEAP: a comprehensive +framework that integrates state-of-the-art modeling architectures, training +regimes, and augmentation strategies based on both natural and synthetic +stories. To address the lack of densely annotated training data, we devise a +robust automatic labeler based on few-shot prompting to enhance the augmented +data. Our experiments with in- and out-of-domain tasks reveal insights into the +interplay of different architectures, training regimes, and augmentation +strategies. LEAP's labeler has a clear positive impact on out-of-domain +datasets, while the resulting dense annotation provides native explainability. + +
+
+
+
+
+ + ♻ ☆ Solving Math Word Problems with Reexamination NeurIPS2023 + + +
+ Math word problem (MWP) solving aims to understand a descriptive math problem and calculate the result, and previous efforts have mostly been devoted to upgrading different technical modules. This paper brings a different perspective, a \textit{reexamination process} during training, by introducing a pseudo-dual task to enhance MWP solving. We propose a pseudo-dual (PseDual) learning scheme to model such a process, which is model-agnostic and thus can be adapted to any existing MWP solver. The pseudo-dual task is specifically defined as filling the numbers in the expression back into the original word problem with the numbers masked. To facilitate the effective joint learning of the two tasks, we further design a scheduled fusion strategy for the number infilling task, which smoothly switches the input from the ground-truth math expressions to the predicted ones. Our pseudo-dual learning scheme has been tested and proven effective when equipped in several representative MWP solvers through empirical studies. \textit{The codes and trained models are available at:} \url{https://github.com/steven640pixel/PsedualMWP}.
+
+ comment: To appear at the NeurIPS 2023 Workshop on MATH-AI
+
+
+
+
+ + ♻ ☆ Camels in a Changing Climate: Enhancing LM Adaptation with Tulu 2 + + +
+ Since the release of T\"ULU [Wang et al., 2023b], open resources for +instruction tuning have developed quickly, from better base models to new +finetuning techniques. We test and incorporate a number of these advances into +T\"ULU, resulting in T\"ULU 2, a suite of improved T\"ULU models for advancing +the understanding and best practices of adapting pretrained language models to +downstream tasks and user preferences. Concretely, we release: (1) +T\"ULU-V2-mix, an improved collection of high-quality instruction datasets; (2) +T\"ULU 2, LLAMA-2 models finetuned on the V2 mixture; (3) T\"ULU 2+DPO, T\"ULU +2 models trained with direct preference optimization (DPO), including the +largest DPO-trained model to date (T\"ULU 2+DPO 70B); (4) CODE T\"ULU 2, CODE +LLAMA models finetuned on our V2 mix that outperform CODE LLAMA and its +instruction-tuned variant, CODE LLAMA-Instruct. Our evaluation from multiple +perspectives shows that the T\"ULU 2 suite achieves state-of-the-art +performance among open models and matches or exceeds the performance of +GPT-3.5-turbo-0301 on several benchmarks. We release all the checkpoints, data, +training and evaluation code to facilitate future open efforts on adapting +large language models. + +
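Because the TÜLU 2+DPO models are trained with direct preference optimization, a minimal sketch of the standard DPO objective may help the reader; the beta value and tensor names are illustrative, and this is not the authors' training code.

```python
import torch
import torch.nn.functional as F

def dpo_loss(policy_chosen_logps, policy_rejected_logps,
             ref_chosen_logps, ref_rejected_logps, beta=0.1):
    """Direct Preference Optimization loss. Each argument is a 1-D tensor of
    summed log-probabilities of the chosen / rejected completions under the
    trainable policy or the frozen reference model."""
    chosen_rewards = beta * (policy_chosen_logps - ref_chosen_logps)
    rejected_rewards = beta * (policy_rejected_logps - ref_rejected_logps)
    # Maximize the margin between the implicit rewards of chosen and rejected responses.
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()

# Toy usage with random log-probabilities for a batch of 4 preference pairs.
loss = dpo_loss(torch.randn(4), torch.randn(4), torch.randn(4), torch.randn(4))
```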
+
+ comment: technical report; fixed zephyr numbers +
+
+
+
+
+ + ♻ ☆ Effective Proxy for Human Labeling: Ensemble Disagreement Scores in + Large Language Models for Industrial NLP EMNLP + + +
+ Large language models (LLMs) have demonstrated significant capability to generalize across a large number of NLP tasks. For industry applications, it is imperative to assess the performance of the LLM on unlabeled production data from time to time to validate it in a real-world setting. Human labeling to assess model error incurs considerable expense and time delay. Here we demonstrate that ensemble disagreement scores work well as a proxy for human labeling of language models in zero-shot, few-shot, and fine-tuned settings, as evaluated on the keyphrase extraction (KPE) task. We measure the fidelity of the results by comparing them to the true error measured from human-labeled ground truth. We contrast this with the alternative of using another LLM as a source of machine labels, or silver labels. Results across various languages and domains show that disagreement scores provide a better estimate of model performance, with a mean average error (MAE) as low as 0.4% and on average 13.8% better than using silver labels.
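The abstract does not spell out the exact scoring rule; one plausible minimal version, shown below, measures per example the fraction of ensemble members that disagree with the majority vote and averages it as a proxy for the unknown error rate. The function and its formulation are assumptions for illustration only.

```python
import numpy as np

def ensemble_disagreement(predictions):
    """predictions: (n_models, n_examples) array of discrete labels.
    Returns the average fraction of members disagreeing with the majority vote."""
    predictions = np.asarray(predictions)
    n_models, n_examples = predictions.shape
    scores = []
    for j in range(n_examples):
        _, counts = np.unique(predictions[:, j], return_counts=True)
        scores.append(1.0 - counts.max() / n_models)
    return float(np.mean(scores))

# Three models labelling five tokens as keyphrase (1) or not (0).
preds = [[1, 0, 1, 1, 0],
         [1, 0, 0, 1, 0],
         [1, 1, 1, 1, 0]]
print(ensemble_disagreement(preds))  # ~0.133
```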
+
+ comment: Camera ready version for 2023 EMNLP (The Third Workshop on Natural + Language Generation, Evaluation, and Metrics (GEM)) +
+
+
+
+
+ + ♻ ☆ Landmark Attention: Random-Access Infinite Context Length for + Transformers NeurIPS 2023 + + +
+ While Transformers have shown remarkable success in natural language +processing, their attention mechanism's large memory requirements have limited +their ability to handle longer contexts. Prior approaches, such as recurrent +memory or retrieval-based augmentation, have either compromised the +random-access flexibility of attention (i.e., the capability to select any +token in the entire context) or relied on separate mechanisms for relevant +context retrieval, which may not be compatible with the model's attention. In +this paper, we present a novel approach that allows access to the complete +context while retaining random-access flexibility, closely resembling running +attention on the entire context. Our method uses a landmark token to represent +each block of the input and trains the attention to use it for selecting +relevant blocks, enabling retrieval of blocks directly through the attention +mechanism instead of by relying on a separate mechanism. Our approach +seamlessly integrates with specialized data structures and the system's memory +hierarchy, enabling processing of arbitrarily long context lengths. We +demonstrate that our method can obtain comparable performance with +Transformer-XL while significantly reducing the number of retrieved tokens in +each step. Finally, we show that fine-tuning LLaMA 7B with our method +successfully extends its context length capacity to over 32k tokens, allowing +for inference at the context lengths of GPT-4. We release the implementation of +landmark attention and the code to reproduce our experiments at +https://github.com/epfml/landmark-attention/. + +
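A toy sketch of the retrieval idea: represent each block of keys by a landmark vector, score blocks against the query, and attend only within the top-scoring blocks. In the actual method the landmarks are trained tokens selected by the attention itself; the block-mean landmarks and single-query form below are simplifications.

```python
import torch

def landmark_block_retrieval(q, keys, block_size=64, top_k=2):
    """Pick the most relevant key blocks via landmark scores, then run softmax
    attention only over the retrieved blocks (simplified, single query vector)."""
    n, d = keys.shape
    blocks = keys[: n - n % block_size].reshape(-1, block_size, d)
    landmarks = blocks.mean(dim=1)                 # stand-ins for trained landmark tokens
    block_scores = landmarks @ q                   # relevance of each block to the query
    chosen = block_scores.topk(min(top_k, len(blocks))).indices
    selected_keys = blocks[chosen].reshape(-1, d)  # only these keys are attended to
    attn = torch.softmax(selected_keys @ q / d ** 0.5, dim=0)
    return chosen, attn

q = torch.randn(32)
keys = torch.randn(512, 32)
chosen_blocks, attn_weights = landmark_block_retrieval(q, keys)
```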
+
+ comment: Published as a conference paper at NeurIPS 2023 - 37th Conference on + Neural Information Processing Systems +
+
+
+
+
+ + ♻ ☆ Lost in the Middle: How Language Models Use Long Contexts ACL + + +
+ While recent language models have the ability to take long contexts as input, +relatively little is known about how well they use longer context. We analyze +the performance of language models on two tasks that require identifying +relevant information in their input contexts: multi-document question answering +and key-value retrieval. We find that performance can degrade significantly +when changing the position of relevant information, indicating that current +language models do not robustly make use of information in long input contexts. +In particular, we observe that performance is often highest when relevant +information occurs at the beginning or end of the input context, and +significantly degrades when models must access relevant information in the +middle of long contexts, even for explicitly long-context models. Our analysis +provides a better understanding of how language models use their input context +and provides new evaluation protocols for future long-context language models. + +
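A minimal sketch of the kind of positional probe described above: build multi-document QA prompts with the answer-bearing document placed at different positions among distractors and compare accuracy per position. The prompt template is illustrative, not the paper's exact format.

```python
def build_prompt(question, gold_doc, distractor_docs, gold_position):
    """Insert the answer-bearing document at a chosen position among distractors."""
    docs = list(distractor_docs)
    docs.insert(gold_position, gold_doc)
    numbered = "\n\n".join(f"Document [{i + 1}]: {d}" for i, d in enumerate(docs))
    return (f"Answer the question using the documents below.\n\n"
            f"{numbered}\n\nQuestion: {question}\nAnswer:")

# Sweep the gold position from start to end; accuracy can then be compared per slot.
prompts = [build_prompt("Who wrote Dune?",
                        "Dune was written by Frank Herbert.",
                        ["(irrelevant passage)"] * 9, pos)
           for pos in range(10)]
```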
+
+ comment: 18 pages, 16 figures. Accepted for publication in Transactions of the + Association for Computational Linguistics (TACL), 2023 +
+
+
+
+
+ + ♻ ☆ Data Contamination Quiz: A Tool to Detect and Estimate Contamination in + Large Language Models + + +
+ We propose the Data Contamination Quiz, a simple and effective approach to detect data contamination in large language models (LLMs) and estimate the amount of it. Specifically, we frame data contamination detection as a series of multiple-choice questions. We devise a quiz format wherein three perturbed versions of each dataset instance are created. These changes include only word-level perturbations, replacing words with their contextual synonyms, ensuring that both the semantics and the sentence structure remain exactly the same as in the original instance. Together with the original instance, these perturbed versions constitute the choices in the quiz. Given that the only distinguishing signal among these choices is the exact wording, an LLM, when tasked with identifying the original instance from the choices, opts for the original if it has memorized it in its pre-training phase -- a trait intrinsic to LLMs. A dataset partition is then marked as contaminated if the LLM's performance on the quiz surpasses what random chance suggests. Our evaluation spans seven datasets and their respective splits (train and test/validation) on two state-of-the-art LLMs: GPT-4 and GPT-3.5. Although we lack access to the pre-training data, our results suggest that our approach not only enhances the detection of data contamination but also provides an accurate estimate of its extent, even when the contamination signal is weak.
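A schematic sketch of the quiz construction and the chance-level test; the synonym-perturbation function and the decision margin are placeholders rather than the paper's exact procedure.

```python
import random

def make_quiz_item(original, perturb, n_choices=4, seed=0):
    """One quiz item: the original instance plus three word-level perturbations,
    shuffled. `perturb` is any function replacing words with contextual synonyms."""
    rng = random.Random(seed)
    choices = [original] + [perturb(original) for _ in range(n_choices - 1)]
    rng.shuffle(choices)
    return choices, choices.index(original)

def is_contaminated(quiz_accuracy, n_choices=4, margin=0.05):
    """Flag a split as contaminated if quiz accuracy clearly exceeds the
    1/n_choices chance level (the margin is an illustrative threshold)."""
    return quiz_accuracy > 1.0 / n_choices + margin
```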
+
+ comment: v1.2 preprint +
+
+
+
+
+ + ♻ ☆ Generative Antibody Design for Complementary Chain Pairing Sequences + through Encoder-Decoder Language Model + + +
+ Current protein language models (pLMs) predominantly focus on single-chain +protein sequences and often have not accounted for constraints on generative +design imposed by protein-protein interactions. To address this gap, we present +paired Antibody T5 (pAbT5), an encoder-decoder model to generate complementary +heavy or light chain from its pairing partner. We show that our model respects +conservation in framework regions and variability in hypervariable domains, +demonstrated by agreement with sequence alignment and variable-length CDR +loops. We also show that our model captures chain pairing preferences through +the recovery of ground-truth chain type and gene families. Our results showcase +the potential of pAbT5 in generative antibody design, incorporating biological +constraints from chain pairing preferences. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 137 + +
+
+
+ + ☆ Hourglass Tokenizer for Efficient Transformer-Based 3D Human Pose + Estimation + + +
+ Transformers have been successfully applied in the field of video-based 3D human pose estimation. However, the high computational costs of these video pose transformers (VPTs) make them impractical on resource-constrained devices. In this paper, we present a plug-and-play pruning-and-recovering framework, called Hourglass Tokenizer (HoT), for efficient transformer-based 3D human pose estimation from videos. Our HoT begins with pruning the pose tokens of redundant frames and ends with recovering full-length tokens, resulting in only a few pose tokens in the intermediate transformer blocks and thus improving model efficiency. To achieve this effectively, we propose a token pruning cluster (TPC) that dynamically selects a few representative tokens with high semantic diversity while eliminating the redundancy of video frames. In addition, we develop a token recovering attention (TRA) to restore the detailed spatio-temporal information based on the selected tokens, thereby expanding the network output to the original full-length temporal resolution for fast inference. Extensive experiments on two benchmark datasets (i.e., Human3.6M and MPI-INF-3DHP) demonstrate that our method achieves both high efficiency and estimation accuracy compared to the original VPT models. For instance, applied to MotionBERT and MixSTE on Human3.6M, our HoT saves nearly 50% of FLOPs without sacrificing accuracy and nearly 40% of FLOPs with only a 0.2% accuracy drop, respectively. Our source code will be open-sourced.
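A simplified stand-in for the pruning step: greedily keep a small, diverse subset of frame-level pose tokens for the intermediate blocks. The real Token Pruning Cluster and Token Recovering Attention are learned modules; this only conveys the prune-then-recover idea.

```python
import torch

def prune_pose_tokens(tokens, keep=16):
    """Greedy diversity-based pruning: repeatedly keep the token farthest from
    the already selected ones (a crude proxy for selecting semantically diverse
    representative tokens)."""
    n = tokens.shape[0]
    dists = torch.cdist(tokens, tokens)                  # (n, n) pairwise distances
    selected = [0]
    while len(selected) < min(keep, n):
        min_dist_to_selected = dists[:, selected].min(dim=1).values
        selected.append(int(min_dist_to_selected.argmax()))
    return tokens[selected]

frames = torch.randn(243, 512)               # 243 frame-level pose tokens
pruned = prune_pose_tokens(frames, keep=16)  # tokens kept for the intermediate blocks
```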
+
+
+
+
+ + ☆ PF-LRM: Pose-Free Large Reconstruction Model for Joint Pose and Shape + Prediction + + +
+ We propose a Pose-Free Large Reconstruction Model (PF-LRM) for reconstructing +a 3D object from a few unposed images even with little visual overlap, while +simultaneously estimating the relative camera poses in ~1.3 seconds on a single +A100 GPU. PF-LRM is a highly scalable method utilizing the self-attention +blocks to exchange information between 3D object tokens and 2D image tokens; we +predict a coarse point cloud for each view, and then use a differentiable +Perspective-n-Point (PnP) solver to obtain camera poses. When trained on a huge +amount of multi-view posed data of ~1M objects, PF-LRM shows strong +cross-dataset generalization ability, and outperforms baseline methods by a +large margin in terms of pose prediction accuracy and 3D reconstruction quality +on various unseen evaluation datasets. We also demonstrate our model's +applicability in downstream text/image-to-3D task with fast feed-forward +inference. Our project website is at: https://totoro97.github.io/pf-lrm . + +
+
+ comment: Project website: https://totoro97.github.io/pf-lrm +
+
+
+
+
+ + ☆ GPT-4V(ision) for Robotics: Multimodal Task Planning from Human + Demonstration + + +
+ We introduce a pipeline that enhances a general-purpose Vision Language +Model, GPT-4V(ision), by integrating observations of human actions to +facilitate robotic manipulation. This system analyzes videos of humans +performing tasks and creates executable robot programs that incorporate +affordance insights. The computation starts by analyzing the videos with GPT-4V +to convert environmental and action details into text, followed by a +GPT-4-empowered task planner. In the following analyses, vision systems +reanalyze the video with the task plan. Object names are grounded using an +open-vocabulary object detector, while focus on the hand-object relation helps +to detect the moment of grasping and releasing. This spatiotemporal grounding +allows the vision systems to further gather affordance data (e.g., grasp type, +way points, and body postures). Experiments across various scenarios +demonstrate this method's efficacy in achieving real robots' operations from +human demonstrations in a zero-shot manner. The prompts of GPT-4V/GPT-4 are +available at this project page: +https://microsoft.github.io/GPT4Vision-Robot-Manipulation-Prompts/ + +
+
+ comment: 8 pages, 10 figures, 1 table. Last updated on November 20th, 2023 +
+
+
+
+
+ + ☆ Exploring Lip Segmentation Techniques in Computer Vision: A Comparative + Analysis + + +
+ Lip segmentation is crucial in computer vision, especially for lip reading. Despite extensive face segmentation research, lip segmentation has received limited attention. The aim of this study is to compare state-of-the-art lip segmentation models using a standardized setting and a publicly available dataset. Five techniques, namely EHANet, Mask2Former, BiSeNet V2, PIDNet, and STDC1, are qualitatively selected based on their reported performance, inference time, code availability, recency, and popularity. The CelebAMask-HQ dataset, comprising manually annotated face images, is used to fairly assess the lip segmentation performance of the selected models. Inference experiments are conducted on a Raspberry Pi4 to emulate limited computational resources. The results show that Mask2Former and EHANet have the best performance in terms of mIoU score. BiSeNet V2 demonstrates competitive performance, while PIDNet excels in recall but has lower precision. Most models exhibit inference times ranging from 1000 to around 3000 milliseconds on a Raspberry Pi4, with PIDNet having the lowest mean inference time. This study provides a comprehensive evaluation of lip segmentation models, highlighting their performance and inference times. The findings contribute to the development of lightweight techniques and establish benchmarks for future advances in lip segmentation, especially in IoT and edge computing scenarios.
+
+
+
+
+ + ☆ Categorizing the Visual Environment and Analyzing the Visual Attention + of Dogs WACV + + +
+ Dogs have a unique evolutionary relationship with humans and serve many +important roles e.g. search and rescue, blind assistance, emotional support. +However, few datasets exist to categorize visual features and objects available +to dogs, as well as how dogs direct their visual attention within their +environment. We collect and study a dataset with over 11,698 gazes to +categorize the objects available to be gazed at by 11 dogs in everyday outdoor +environments i.e. a walk around a college campus and urban area. We explore the +availability of these object categories and the visual attention of dogs over +these categories using a head mounted eye tracking apparatus. A small portion +(approx. 600 images or < 20% of total dataset) of the collected data is used to +fine tune a MaskRCNN for the novel image domain to segment objects present in +the scene, enabling further statistical analysis on the visual gaze tendencies +of dogs. The MaskRCNN, with eye tracking apparatus, serves as an end to end +model for automatically classifying the visual fixations of dogs. The fine +tuned MaskRCNN performs far better than chance. There are few individual +differences between the 11 dogs and we observe greater visual fixations on +buses, plants, pavement, and construction equipment. This work takes a step +towards understanding visual behavior of dogs and their interaction with the +physical world. + +
+
+ comment: 13 pages, 11 figures, 1 table, WACV CV4Smalls Workshop +
+
+
+
+
+ + ☆ Leveraging Previous Facial Action Units Knowledge for Emotion + Recognition on Faces + + +
+ People naturally understand emotions, so enabling a machine to do the same could open new paths for human-computer interaction. Facial expressions can be very useful for emotion recognition, as they are the strongest transmitters of non-verbal cues that can be correlated with emotions. Several techniques rely on Convolutional Neural Networks (CNNs) to extract this information in a machine learning process. However, simple CNNs are not always sufficient to locate points of interest on the face that can be correlated with emotions. In this work, we aim to expand the capacity of emotion recognition techniques by using Facial Action Unit (AU) recognition to recognize emotions. This recognition is based on the Facial Action Coding System (FACS) and computed by a machine learning system. In particular, our method builds on EmotiRAM, an approach for multi-cue emotion recognition, by improving its facial encoding module.
+
+
+
+
+ + ☆ Evaluating Supervision Levels Trade-Offs for Infrared-Based People + Counting WACV + + +
+ Object detection models are commonly used for people counting (and +localization) in many applications but require a dataset with costly bounding +box annotations for training. Given the importance of privacy in people +counting, these models rely more and more on infrared images, making the task +even harder. In this paper, we explore how weaker levels of supervision can +affect the performance of deep person counting architectures for image +classification and point-level localization. Our experiments indicate that +counting people using a CNN Image-Level model achieves competitive results with +YOLO detectors and point-level models, yet provides a higher frame rate and a +similar amount of model parameters. + +
+
+ comment: Accepted in IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2024 +
+
+
+
+
+ + ☆ LiDAR-HMR: 3D Human Mesh Recovery from LiDAR + + +
+ In recent years, point cloud perception tasks have been garnering increasing attention. This paper presents the first attempt to estimate a 3D human body mesh from sparse LiDAR point clouds. We found that the major challenge in estimating human pose and mesh from point clouds lies in the sparsity, noise, and incompleteness of LiDAR point clouds. Facing these challenges, we propose an effective sparse-to-dense reconstruction scheme to reconstruct the 3D human mesh. This involves estimating a sparse representation of the human (the 3D human pose) and gradually reconstructing the body mesh. To better leverage the 3D structural information of point clouds, we employ a cascaded graph transformer (graphormer) to introduce point cloud features during sparse-to-dense reconstruction. Experimental results on three publicly available databases demonstrate the effectiveness of the proposed approach. Code: https://github.com/soullessrobot/LiDAR-HMR/
+
+ comment: Code is available at: https://github.com/soullessrobot/LiDAR-HMR/ +
+
+
+
+
+ + ☆ SA-Med2D-20M Dataset: Segment Anything in 2D Medical Imaging with 20 + Million masks + + +
+ Segment Anything Model (SAM) has achieved impressive results for natural +image segmentation with input prompts such as points and bounding boxes. Its +success largely owes to massive labeled training data. However, directly +applying SAM to medical image segmentation cannot perform well because SAM +lacks medical knowledge -- it does not use medical images for training. To +incorporate medical knowledge into SAM, we introduce SA-Med2D-20M, a +large-scale segmentation dataset of 2D medical images built upon numerous +public and private datasets. It consists of 4.6 million 2D medical images and +19.7 million corresponding masks, covering almost the whole body and showing +significant diversity. This paper describes all the datasets collected in +SA-Med2D-20M and details how to process these datasets. Furthermore, +comprehensive statistics of SA-Med2D-20M are presented to facilitate the better +use of our dataset, which can help the researchers build medical vision +foundation models or apply their models to downstream medical applications. We +hope that the large scale and diversity of SA-Med2D-20M can be leveraged to +develop medical artificial intelligence for enhancing diagnosis, medical image +analysis, knowledge sharing, and education. The data with the redistribution +license is publicly available at https://github.com/OpenGVLab/SAM-Med2D. + +
+
+
+
+
+ + ☆ What Can AutoML Do For Continual Learning? + + +
+ This position paper outlines the potential of AutoML for incremental +(continual) learning to encourage more research in this direction. Incremental +learning involves incorporating new data from a stream of tasks and +distributions to learn enhanced deep representations and adapt better to new +tasks. However, a significant limitation of incremental learners is that most +current techniques freeze the backbone architecture, hyperparameters, and the +order & structure of the learning tasks throughout the learning and adaptation +process. We strongly believe that AutoML offers promising solutions to address +these limitations, enabling incremental learning to adapt to more diverse +real-world tasks. Therefore, instead of directly proposing a new method, this +paper takes a step back by posing the question: "What can AutoML do for +incremental learning?" We outline three key areas of research that can +contribute to making incremental learners more dynamic, highlighting concrete +opportunities to apply AutoML methods in novel ways as well as entirely new +challenges for AutoML research. + +
+
+
+
+
+ + ☆ NNG-Mix: Improving Semi-supervised Anomaly Detection with Pseudo-anomaly + Generation + + +
+ Anomaly detection (AD) is essential in identifying rare and often critical +events in complex systems, finding applications in fields such as network +intrusion detection, financial fraud detection, and fault detection in +infrastructure and industrial systems. While AD is typically treated as an +unsupervised learning task due to the high cost of label annotation, it is more +practical to assume access to a small set of labeled anomaly samples from +domain experts, as is the case for semi-supervised anomaly detection. +Semi-supervised and supervised approaches can leverage such labeled data, +resulting in improved performance. In this paper, rather than proposing a new +semi-supervised or supervised approach for AD, we introduce a novel algorithm +for generating additional pseudo-anomalies on the basis of the limited labeled +anomalies and a large volume of unlabeled data. This serves as an augmentation +to facilitate the detection of new anomalies. Our proposed algorithm, named +Nearest Neighbor Gaussian Mixup (NNG-Mix), efficiently integrates information +from both labeled and unlabeled data to generate pseudo-anomalies. We compare +the performance of this novel algorithm with commonly applied augmentation +techniques, such as Mixup and Cutout. We evaluate NNG-Mix by training various +existing semi-supervised and supervised anomaly detection algorithms on the +original training data along with the generated pseudo-anomalies. Through +extensive experiments on 57 benchmark datasets in ADBench, reflecting different +data types, we demonstrate that NNG-Mix outperforms other data augmentation +methods. It yields significant performance improvements compared to the +baselines trained exclusively on the original training data. Notably, NNG-Mix +yields up to 16.4%, 8.8%, and 8.0% improvements on Classical, CV, and NLP +datasets in ADBench. Our source code will be available at +https://github.com/donghao51/NNG-Mix. + +
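Based on the description above, a rough sketch of nearest-neighbor mixup with Gaussian noise could look as follows; the hyperparameters and sampling scheme are assumptions rather than the paper's exact NNG-Mix specification.

```python
import numpy as np
from sklearn.neighbors import NearestNeighbors

def nng_mix(labeled_anomalies, unlabeled, k=5, alpha=0.2, sigma=0.1,
            n_generate=100, seed=0):
    """Generate pseudo-anomalies by mixing each labeled anomaly with one of its
    k nearest unlabeled neighbors and adding Gaussian noise."""
    rng = np.random.default_rng(seed)
    nn = NearestNeighbors(n_neighbors=k).fit(unlabeled)
    _, idx = nn.kneighbors(labeled_anomalies)
    pseudo = []
    for _ in range(n_generate):
        i = rng.integers(len(labeled_anomalies))
        j = idx[i][rng.integers(k)]
        lam = rng.beta(alpha, alpha)                     # Mixup coefficient
        x = lam * labeled_anomalies[i] + (1 - lam) * unlabeled[j]
        pseudo.append(x + rng.normal(0.0, sigma, size=x.shape))
    return np.stack(pseudo)

pseudo_anomalies = nng_mix(np.random.randn(10, 8), np.random.randn(500, 8))
```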
+
+
+
+
+ + ☆ An Image is Worth Multiple Words: Multi-attribute Inversion for + Constrained Text-to-Image Synthesis + + +
+ We consider the problem of constraining diffusion model outputs with a user-supplied reference image. Our key objective is to extract multiple attributes (e.g., color, object, layout, style) from this single reference image, and then generate new samples with them. One line of existing work proposes to invert the reference images into a single textual conditioning vector, enabling generation of new samples with this learned token. These methods, however, do not learn the multiple tokens that are necessary to condition model outputs on the multiple attributes noted above. Another line of techniques expands the inversion space to learn multiple embeddings, but only along the layer dimension (e.g., one per layer of the DDPM model) or the timestep dimension (one for a set of timesteps in the denoising process), leading to suboptimal attribute disentanglement. To address the aforementioned gaps, the first contribution of this paper is an extensive analysis that determines which attributes are captured in which dimension of the denoising process. As noted above, we consider both the time-step dimension (in reverse denoising) and the DDPM model layer dimension. We observe that often a subset of these attributes is captured in the same set of model layers and/or across the same denoising timesteps. For instance, color and style are captured across the same U-Net layers, whereas layout and color are captured across the same timestep stages. Consequently, an inversion process designed only for the time-step dimension or the layer dimension is insufficient to disentangle all attributes. This leads to our second contribution, where we design a new multi-attribute inversion algorithm, MATTE, with associated disentanglement-enhancing regularization losses, that operates across both dimensions and explicitly yields four disentangled tokens (color, style, layout, and object).
+
+
+
+
+ + ☆ Generalization of Fitness Exercise Recognition from Doppler Measurements + by Domain-adaption and Few-Shot Learning ICPR + + +
+ In previous works, a mobile application was developed using an unmodified commercial off-the-shelf smartphone to recognize whole-body exercises. The working principle was based on ultrasound Doppler sensing using the device's built-in hardware. Applying such a model, trained in a lab environment, to realistic application variations causes a significant drop in performance and thus limits its applicability. The reasons for the reduced performance can be manifold: it may be induced by user, environment, and device variations in realistic scenarios. Such scenarios are often more complex and diverse, which can be challenging to anticipate in the initial training data. To study and overcome this issue, this paper presents a database with controlled and uncontrolled subsets of fitness exercises. We propose two concepts that utilize small amounts of adaptation data to successfully improve model generalization in an uncontrolled environment, increasing recognition accuracy twofold to sixfold compared to the baseline for different users.
+
+ comment: accepted at International Conference on Pattern Recognition (ICPR) + workshop 2021 +
+
+
+
+
+ + ☆ Continual Learning: Applications and the Road Forward + + +
+ Continual learning is a sub-field of machine learning, which aims to allow +machine learning models to continuously learn on new data, by accumulating +knowledge without forgetting what was learned in the past. In this work, we +take a step back, and ask: "Why should one care about continual learning in the +first place?". We set the stage by surveying recent continual learning papers +published at three major machine learning conferences, and show that +memory-constrained settings dominate the field. Then, we discuss five open +problems in machine learning, and even though they seem unrelated to continual +learning at first sight, we show that continual learning will inevitably be +part of their solution. These problems are model-editing, personalization, +on-device learning, faster (re-)training and reinforcement learning. Finally, +by comparing the desiderata from these unsolved problems and the current +assumptions in continual learning, we highlight and discuss four future +directions for continual learning research. We hope that this work offers an +interesting perspective on the future of continual learning, while displaying +its potential value and the paths we have to pursue in order to make it +successful. This work is the result of the many discussions the authors had at +the Dagstuhl seminar on Deep Continual Learning, in March 2023. + +
+
+
+
+
+ + ☆ LLMs as Visual Explainers: Advancing Image Classification with Evolving + Visual Descriptions + + +
+ Vision-language models (VLMs) offer a promising paradigm for image classification by comparing the similarity between images and class embeddings. A critical challenge lies in crafting precise textual representations for class names. While previous studies have leveraged recent advancements in large language models (LLMs) to enhance these descriptors, their outputs often suffer from ambiguity and inaccuracy. We identify two primary causes: 1) the prevalent reliance on textual interactions with LLMs, leading to a mismatch between the generated text and the visual content in VLMs' latent space -- a phenomenon we term the "explain without seeing" dilemma; 2) the oversight of inter-class relationships, resulting in descriptors that fail to differentiate similar classes effectively. To address these issues, we propose a novel image classification framework combining VLMs with LLMs, named Iterative Optimization with Visual Feedback. In particular, our method develops an LLM-based agent, employing an evolutionary optimization strategy to refine class descriptors. Crucially, we incorporate visual feedback from VLM classification metrics, thereby guiding the optimization process with concrete visual data. Our method improves accuracy on a wide range of image classification benchmarks, with average gains of 3.47\% over state-of-the-art methods. We also show that the resulting descriptions serve as explainable and robust features that consistently improve performance across various backbone models.
+
+
+
+
+ + ☆ Identifying the Defective: Detecting Damaged Grains for Cereal + Appearance Inspection ECAI2023 + + +
+ Cereal grain plays a crucial role in the human diet as a major source of essential nutrients. Grain Appearance Inspection (GAI) serves as an essential process to determine grain quality and facilitate grain circulation and processing. However, GAI is routinely performed manually by inspectors with cumbersome procedures, which poses a significant bottleneck in smart agriculture. In this paper, we endeavor to develop an automated GAI system: AI4GrainInsp. By analyzing the distinctive characteristics of grain kernels, we formulate GAI as a ubiquitous problem: Anomaly Detection (AD), in which healthy and edible kernels are considered normal samples while damaged grains or unknown objects are regarded as anomalies. We further propose an AD model, called AD-GAI, which is trained using only normal samples yet can identify anomalies during inference. Moreover, we customize a prototype device for data acquisition and create a large-scale dataset including 220K high-quality images of wheat and maize kernels. Through extensive experiments, AD-GAI achieves strong performance in comparison with advanced AD methods, and AI4GrainInsp is highly consistent with human experts while achieving an over 20x speedup in inspection efficiency. The dataset, code and models will be released at https://github.com/hellodfan/AI4GrainInsp.
+
+ comment: Accepted by ECAI2023. https://github.com/hellodfan/AI4GrainInsp +
+
+
+
+
+ + ☆ SniffyArt: The Dataset of Smelling Persons + + +
+ Smell gestures play a crucial role in the investigation of past smells in the visual arts, yet their automated recognition poses significant challenges. This paper introduces the SniffyArt dataset, consisting of 1941 individuals represented in 441 historical artworks. Each person is annotated with a tightly fitting bounding box, 17 pose keypoints, and a gesture label. By integrating these annotations, the dataset enables the development of hybrid classification approaches for smell gesture recognition. The dataset's high-quality human pose estimation keypoints are obtained by merging five separate sets of keypoint annotations per person. The paper also presents a baseline analysis, evaluating the performance of representative algorithms for detection, keypoint estimation, and classification tasks, showcasing the potential of combining keypoint estimation with smell gesture classification. The SniffyArt dataset lays a solid foundation for future research and the exploration of multi-task approaches leveraging pose keypoints and person boxes to advance human gesture and olfactory dimension analysis in historical artworks.
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ Multi-Task Faces (MTF) Data Set: A Legally and Ethically Compliant + Collection of Face Images for Various Classification Tasks + + +
+ Human facial data hold tremendous potential to address a variety of +classification problems, including face recognition, age estimation, gender +identification, emotion analysis, and race classification. However, recent +privacy regulations, such as the EU General Data Protection Regulation and +others, have restricted the ways in which human images may be collected and +used for research. As a result, several previously published data sets +containing human faces have been removed from the internet due to inadequate +data collection methods that failed to meet privacy regulations. Data sets +consisting of synthetic data have been proposed as an alternative, but they +fall short of accurately representing the real data distribution. On the other +hand, most available data sets are labeled for just a single task, which limits +their applicability. To address these issues, we present the Multi-Task Faces +(MTF) image data set, a meticulously curated collection of face images designed +for various classification tasks, including face recognition, as well as race, +gender, and age classification. The MTF data set has been ethically gathered by +leveraging publicly available images of celebrities and strictly adhering to +copyright regulations. In this paper, we present this data set and provide +detailed descriptions of the followed data collection and processing +procedures. Furthermore, we evaluate the performance of five deep learning (DL) +models on the MTF data set across the aforementioned classification tasks. +Additionally, we compare the performance of DL models over the processed MTF +data and over raw data crawled from the internet. The reported results +constitute a baseline for further research employing these data. The MTF data +set can be accessed through the following link (please cite the present paper +if you use the data set): https://github.com/RamiHaf/MTF_data_set + +
+
+ comment: 21 pages, 2 figures, 9 Tables, +
+
+
+
+
+ + ☆ VLM-Eval: A General Evaluation on Video Large Language Models + + +
+ Despite the rapid development of video Large Language Models (LLMs), a +comprehensive evaluation is still absent. In this paper, we introduce a unified +evaluation that encompasses multiple video tasks, including captioning, +question and answering, retrieval, and action recognition. In addition to +conventional metrics, we showcase how GPT-based evaluation can match human-like +performance in assessing response quality across multiple aspects. We propose a +simple baseline: Video-LLaVA, which uses a single linear projection and +outperforms existing video LLMs. Finally, we evaluate video LLMs beyond +academic datasets, which show encouraging recognition and reasoning +capabilities in driving scenarios with only hundreds of video-instruction pairs +for fine-tuning. We hope our work can serve as a unified evaluation for video +LLMs, and help expand more practical scenarios. The evaluation code will be +available soon. + +
+
+
+
+
+ + ☆ GP-NeRF: Generalized Perception NeRF for Context-Aware 3D Scene + Understanding + + +
+ Applying NeRF to downstream perception tasks for scene understanding and +representation is becoming increasingly popular. Most existing methods treat +semantic prediction as an additional rendering task, \textit{i.e.}, the "label +rendering" task, to build semantic NeRFs. However, by rendering +semantic/instance labels per pixel without considering the contextual +information of the rendered image, these methods usually suffer from unclear +boundary segmentation and abnormal segmentation of pixels within an object. To +solve this problem, we propose Generalized Perception NeRF (GP-NeRF), a novel +pipeline that makes the widely used segmentation model and NeRF work compatibly +under a unified framework, for facilitating context-aware 3D scene perception. +To accomplish this goal, we introduce transformers to aggregate radiance as +well as semantic embedding fields jointly for novel views and facilitate the +joint volumetric rendering of both fields. In addition, we propose two +self-distillation mechanisms, i.e., the Semantic Distill Loss and the +Depth-Guided Semantic Distill Loss, to enhance the discrimination and quality +of the semantic field and the maintenance of geometric consistency. In +evaluation, we conduct experimental comparisons under two perception tasks +(\textit{i.e.} semantic and instance segmentation) using both synthetic and +real-world datasets. Notably, our method outperforms SOTA approaches by 6.94\%, +11.76\%, and 8.47\% on generalized semantic segmentation, finetuning semantic +segmentation, and instance segmentation, respectively. + +
+
+
+
+
+ + ☆ LION : Empowering Multimodal Large Language Model with Dual-Level Visual + Knowledge + + +
+ Multimodal Large Language Models (MLLMs) have endowed LLMs with the ability +to perceive and understand multi-modal signals. However, most of the existing +MLLMs mainly adopt vision encoders pretrained on coarsely aligned image-text +pairs, leading to insufficient extraction and reasoning of visual knowledge. To +address this issue, we devise a dual-Level vIsual knOwledge eNhanced Multimodal +Large Language Model (LION), which empowers the MLLM by injecting visual +knowledge in two levels. 1) Progressive incorporation of fine-grained +spatial-aware visual knowledge. We design a vision aggregator cooperated with +region-level vision-language (VL) tasks to incorporate fine-grained +spatial-aware visual knowledge into the MLLM. To alleviate the conflict between +image-level and region-level VL tasks during incorporation, we devise a +dedicated stage-wise instruction-tuning strategy with mixture-of-adapters. This +progressive incorporation scheme contributes to the mutual promotion between +these two kinds of VL tasks. 2) Soft prompting of high-level semantic visual +evidence. We facilitate the MLLM with high-level semantic visual evidence by +leveraging diverse image tags. To mitigate the potential influence caused by +imperfect predicted tags, we propose a soft prompting method by embedding a +learnable token into the tailored text instruction. Comprehensive experiments +on several multi-modal benchmarks demonstrate the superiority of our model +(e.g., improvement of 5% accuracy on VSR and 3% CIDEr on TextCaps over +InstructBLIP, 5% accuracy on RefCOCOg over Kosmos-2). + +
+
+ comment: Technical Report. Project page: + https://rshaojimmy.github.io/Projects/JiuTian-LION Code: + https://github.com/rshaojimmy/JiuTian +
+
+
+
+
+ + ☆ FATURA: A Multi-Layout Invoice Image Dataset for Document Analysis and + Understanding + + +
+ Document analysis and understanding models often require extensive annotated +data to be trained. However, various document-related tasks extend beyond mere +text transcription, requiring both textual content and precise bounding-box +annotations to identify different document elements. Collecting such data +becomes particularly challenging, especially in the context of invoices, where +privacy concerns add an additional layer of complexity. In this paper, we +introduce FATURA, a pivotal resource for researchers in the field of document +analysis and understanding. FATURA is a highly diverse dataset featuring +multi-layout, annotated invoice document images. Comprising $10,000$ invoices +with $50$ distinct layouts, it represents the largest openly accessible image +dataset of invoice documents known to date. We also provide comprehensive +benchmarks for various document analysis and understanding tasks and conduct +experiments under diverse training and evaluation scenarios. The dataset is +freely accessible at https://zenodo.org/record/8261508, empowering researchers +to advance the field of document analysis and understanding. + +
+
+
+
+
+ + ☆ Asynchronous Bioplausible Neuron for Spiking Neural Networks for + Event-Based Vision + + +
+ Spiking Neural Networks (SNNs) offer a biologically inspired approach to +computer vision that can lead to more efficient processing of visual data with +reduced energy consumption. However, maintaining homeostasis within these +networks is challenging, as it requires continuous adjustment of neural +responses to preserve equilibrium and optimal processing efficiency amidst +diverse and often unpredictable input signals. In response to these challenges, +we propose the Asynchronous Bioplausible Neuron (ABN), a dynamic spike firing +mechanism to auto-adjust the variations in the input signal. Comprehensive +evaluation across various datasets demonstrates ABN's enhanced performance in +image classification and segmentation, maintenance of neural equilibrium, and +energy efficiency. + +
+
+ comment: 10 pages +
+
+
+
+
+ + ☆ Entangled View-Epipolar Information Aggregation for Generalizable Neural + Radiance Fields + + +
+ Generalizable NeRF can directly synthesize novel views across new scenes, eliminating the need for scene-specific retraining in vanilla NeRF. A critical enabling factor in these approaches is the extraction of a generalizable 3D representation by aggregating source-view features. In this paper, we propose an Entangled View-Epipolar Information Aggregation method dubbed EVE-NeRF. Different from existing methods that consider cross-view and along-epipolar information independently, EVE-NeRF conducts the view-epipolar feature aggregation in an entangled manner by injecting scene-invariant appearance continuity and geometry consistency priors into the aggregation process. Our approach effectively mitigates the potential lack of inherent geometric and appearance constraints resulting from one-dimensional interactions, thus further boosting the generalizability of the 3D representation. EVE-NeRF attains state-of-the-art performance across various evaluation scenarios. Extensive experiments demonstrate that, compared to prevailing single-dimensional aggregation, the entangled network excels in the accuracy of 3D scene geometry and appearance reconstruction. Our project page is https://github.com/tatakai1/EVENeRF.
+
+
+
+
+ + ☆ Kandinsky Conformal Prediction: Efficient Calibration of Image + Segmentation Algorithms + + +
+ Image segmentation algorithms can be understood as a collection of pixel classifiers, for which the outcomes of nearby pixels are correlated. Classifier models can be calibrated using Inductive Conformal Prediction, but this requires holding back a sufficiently large calibration dataset for computing the distribution of non-conformity scores of the model's predictions. If one requires only marginal calibration at the image level, this calibration set consists of all individual pixels in the images available for calibration. However, if the goal is to attain proper calibration for each individual pixel classifier, the calibration set consists of individual images. In a scenario where data are scarce (such as the medical domain), it may not always be possible to set aside sufficiently many images for this pixel-level calibration. The method we propose, dubbed ``Kandinsky calibration'', makes use of the spatial structure present in the distribution of natural images to simultaneously calibrate the classifiers of ``similar'' pixels. This can be seen as an intermediate approach between marginal (imagewise) and conditional (pixelwise) calibration, where non-conformity scores are aggregated over similar image regions, thereby making more efficient use of the images available for calibration. We run experiments on segmentation algorithms trained and calibrated on subsets of the public MS-COCO and Medical Decathlon datasets, demonstrating that the Kandinsky calibration method can significantly improve coverage. When compared to both pixelwise and imagewise calibration on little data, the Kandinsky method achieves much lower coverage errors, indicating the data efficiency of Kandinsky calibration.
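A minimal sketch of the intermediate calibration regime described above: pool non-conformity scores over groups of "similar" pixels and compute one conformal threshold per group. How the groups are formed (the spatial "Kandinsky" structure) is left abstract here, so the grouping below is purely illustrative.

```python
import numpy as np

def groupwise_conformal_thresholds(scores, groups, alpha=0.1):
    """scores: (n_images, n_pixels) non-conformity scores on the calibration set.
    groups: (n_pixels,) integer id assigning each pixel to a region of similar pixels.
    Returns one (1 - alpha) quantile threshold per group -- between imagewise
    (one global group) and pixelwise (one group per pixel) calibration."""
    thresholds = {}
    for g in np.unique(groups):
        pooled = scores[:, groups == g].ravel()
        thresholds[int(g)] = float(np.quantile(pooled, 1 - alpha))
    return thresholds

cal_scores = np.random.rand(20, 1000)           # toy calibration scores
pixel_groups = np.random.randint(0, 4, 1000)    # e.g. 4 spatial regions
print(groupwise_conformal_thresholds(cal_scores, pixel_groups))
```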
+
+ comment: 15 pages, 11 figures +
+
+
+
+
+ + ☆ Few-shot Multispectral Segmentation with Representations Generated by + Reinforcement Learning + + +
+ The task of multispectral image segmentation (segmentation of images with numerous channels/bands, each capturing a specific range of wavelengths of electromagnetic radiation) has previously been explored in contexts with large amounts of labeled data. However, these models tend not to generalize well to datasets of smaller size. In this paper, we propose a novel approach for improving few-shot segmentation performance on multispectral images using reinforcement learning to generate representations. These representations are generated in the form of mathematical expressions between channels and are tailored to the specific class being segmented. Our methodology involves training an agent to identify the most informative expressions, updating the dataset using these expressions, and then using the updated dataset to perform segmentation. Due to the limited length of the expressions, the model receives useful representations without any added risk of overfitting. We evaluate our approach on several multispectral datasets and demonstrate its effectiveness in boosting the performance of segmentation algorithms.
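For concreteness, a derived-band expression can be appended to the channel stack as sketched below; the NDVI-like formula is just an example of the kind of expression the RL agent might discover, not one taken from the paper.

```python
import numpy as np

def apply_band_expression(image, expression):
    """image: (H, W, C) stack of spectral bands; expression: callable mapping the
    stack to a (H, W) derived channel. Returns the image with the channel appended."""
    derived = expression(image)
    return np.concatenate([image, derived[..., None]], axis=-1)

# Example candidate expression: (band3 - band2) / (band3 + band2), NDVI-like.
ndvi_like = lambda img: (img[..., 3] - img[..., 2]) / (img[..., 3] + img[..., 2] + 1e-6)
augmented = apply_band_expression(np.random.rand(64, 64, 6), ndvi_like)  # (64, 64, 7)
```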
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Holistic Inverse Rendering of Complex Facade via Aerial 3D Scanning + + +
+ In this work, we use multi-view aerial images to reconstruct the geometry, lighting, and material of facades using neural signed distance fields (SDFs). Without the requirement of complex equipment, our method only takes simple RGB images captured by a drone as inputs to enable physically based and photorealistic novel-view rendering, relighting, and editing. However, a real-world facade usually has complex appearances ranging from diffuse rocks with subtle details to large-area glass windows with specular reflections, making it hard to attend to everything. As a result, previous methods can preserve the geometry details but fail to reconstruct smooth glass windows, or vice versa. In order to address this challenge, we introduce three spatial- and semantic-adaptive optimization strategies, including a semantic regularization approach based on zero-shot segmentation techniques to improve material consistency, a frequency-aware geometry regularization to balance surface smoothness and details in different surfaces, and a visibility probe-based scheme to enable efficient modeling of the local lighting in large-scale outdoor environments. In addition, we capture a real-world facade aerial 3D scanning image set and corresponding point clouds for training and benchmarking. The experiments demonstrate the superior quality of our method on facade holistic inverse rendering, novel view synthesis, and scene editing compared to state-of-the-art baselines.
+
+
+
+
+ + ☆ Cross-View Graph Consistency Learning for Invariant Graph + Representations + + +
+ Graph representation learning is fundamental for analyzing graph-structured +data. Exploring invariant graph representations remains a challenge for most +existing graph representation learning methods. In this paper, we propose a +cross-view graph consistency learning (CGCL) method that learns invariant graph +representations for link prediction. First, two complementary augmented views +are derived from an incomplete graph structure through a bidirectional graph +structure augmentation scheme. This augmentation scheme mitigates the potential +information loss that is commonly associated with various data augmentation +techniques involving raw graph data, such as edge perturbation, node removal, +and attribute masking. Second, we propose a CGCL model that can learn invariant +graph representations. A cross-view training scheme is proposed to train the +proposed CGCL model. This scheme attempts to maximize the consistency +information between one augmented view and the graph structure reconstructed +from the other augmented view. Furthermore, we offer a comprehensive +theoretical CGCL analysis. This paper empirically and experimentally +demonstrates the effectiveness of the proposed CGCL method, achieving +competitive results on graph datasets in comparisons with several +state-of-the-art algorithms. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Generalized super-resolution 4D Flow MRI -- using ensemble learning to + extend across the cardiovascular system + + +
+ 4D Flow Magnetic Resonance Imaging (4D Flow MRI) is a non-invasive +measurement technique capable of quantifying blood flow across the +cardiovascular system. While practical use is limited by spatial resolution and +image noise, incorporation of trained super-resolution (SR) networks has +potential to enhance image quality post-scan. However, these efforts have +predominantly been restricted to narrowly defined cardiovascular domains, with +limited exploration of how SR performance extends across the cardiovascular +system; a task aggravated by contrasting hemodynamic conditions apparent across +the cardiovasculature. The aim of our study was to explore the generalizability +of SR 4D Flow MRI using a combination of heterogeneous training sets and +dedicated ensemble learning. With synthetic training data generated across +three disparate domains (cardiac, aortic, cerebrovascular), varying +convolutional base and ensemble learners were evaluated as a function of domain +and architecture, quantifying performance on both in-silico and acquired +in-vivo data from the same three domains. Results show that both bagging and +stacking ensembling enhance SR performance across domains, accurately +predicting high-resolution velocities from low-resolution input data in-silico. +Likewise, optimized networks successfully recover native resolution velocities +from downsampled in-vivo data, as well as show qualitative potential in +generating denoised SR-images from clinical level input data. In conclusion, +our work presents a viable approach for generalized SR 4D Flow MRI, with +ensemble learning extending utility across various clinical areas of interest. + +
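A schematic of the two ensembling strategies mentioned above, reduced to combining super-resolved velocity fields from domain-specific base learners; in practice the stacking weights would come from a trained meta-learner rather than being fixed by hand.

```python
import numpy as np

def bagging_ensemble(predictions):
    """Average super-resolved velocity fields from the base learners."""
    return np.mean(predictions, axis=0)

def stacking_ensemble(predictions, weights):
    """Weighted combination; here the meta-learner is reduced to a fixed weight
    vector purely for illustration."""
    weights = np.asarray(weights, dtype=float)
    weights = weights / weights.sum()
    return np.tensordot(weights, np.asarray(predictions), axes=1)

# Three base learners (cardiac, aortic, cerebrovascular) predicting a 3-component field.
preds = [np.random.rand(32, 32, 32, 3) for _ in range(3)]
sr_bagged = bagging_ensemble(preds)
sr_stacked = stacking_ensemble(preds, weights=[0.5, 0.3, 0.2])
```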
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ CrackCLF: Automatic Pavement Crack Detection based on Closed-Loop + Feedback + + +
+ Automatic pavement crack detection is an important task to ensure the functional performance of pavements during their service life. Inspired by deep learning (DL), the encoder-decoder framework is a powerful tool for crack detection. However, these models are usually open-loop (OL) systems that tend to treat thin cracks as background. Meanwhile, these models cannot automatically correct errors in their predictions, nor can they adapt to changes in the environment to automatically extract and detect thin cracks. To tackle this problem, we embed closed-loop feedback (CLF) into the neural network so that the model can learn to correct errors on its own, based on generative adversarial networks (GANs). The resulting model is called CrackCLF and includes a front end and a back end, i.e., a segmentation network and an adversarial network. The front end, with a U-shaped framework, is employed to generate crack maps, and the back end, with a multi-scale loss function, is used to correct higher-order inconsistencies between labels and the crack maps (generated by the front end) to address open-loop system issues. Empirical results show that the proposed CrackCLF outperforms other methods on three public datasets. Moreover, the proposed CLF can be defined as a plug-and-play module, which can be embedded into different neural network models to improve their performance.
+
+
+
+
+ + ☆ DocPedia: Unleashing the Power of Large Multimodal Model in the + Frequency Domain for Versatile Document Understanding + + +
+ This work presents DocPedia, a novel large multimodal model (LMM) for versatile OCR-free document understanding, capable of parsing images up to 2,560$\times$2,560 resolution. Unlike existing work, which either struggles with high-resolution documents or gives up the large language model and is thus constrained in vision or language ability, our DocPedia directly processes visual input in the frequency domain rather than in pixel space. This unique characteristic enables DocPedia to capture a greater amount of visual and textual information using a limited number of visual tokens. To consistently enhance both the perception and comprehension abilities of our model, we develop a dual-stage training strategy and enrich the instructions/annotations of all training tasks, covering multiple document types. Extensive quantitative and qualitative experiments conducted on various publicly available benchmarks confirm the mutual benefits of jointly learning perception and comprehension tasks. The results provide further evidence of the effectiveness and superior performance of our DocPedia over other methods.
+
+
+
+
+ + ☆ Robot Hand-Eye Calibration using Structure-from-Motion + + +
+ In this paper we propose a new flexible method for hand-eye calibration. The +vast majority of existing hand-eye calibration techniques requires a +calibration rig which is used in conjunction with camera pose estimation +methods. Instead, we combine structure-from-motion with known robot motions and +we show that the solution can be obtained in linear form. The latter solves for +both the hand-eye parameters and for the unknown scale factor inherent with +structure-from-motion methods. The algebraic analysis that is made possible +with such a linear formulation allows to investigate not only the well known +case of general screw motions but also such singular motions as pure +translations, pure rotations, and planar motions. In essence, the robot-mounted +camera looks to an unknown rigid layout, tracks points over an image sequence +and estimates the camera-to-robot relationship. Such a self calibration process +is relevant for unmanned vehicles, robots working in remote places, and so +forth. We conduct a large number of experiments which validate the quality of +the method by comparing it with existing ones. + +
+
+
+
+
+ + ☆ Igniting Language Intelligence: The Hitchhiker's Guide From + Chain-of-Thought Reasoning to Language Agents + + +
+ Large language models (LLMs) have dramatically enhanced the field of language +intelligence, as demonstrably evidenced by their formidable empirical +performance across a spectrum of complex reasoning tasks. Additionally, +theoretical proofs have illuminated their emergent reasoning capabilities, +providing a compelling showcase of their advanced cognitive abilities in +linguistic contexts. Critical to their remarkable efficacy in handling complex +reasoning tasks, LLMs leverage the intriguing chain-of-thought (CoT) reasoning +techniques, obliging them to formulate intermediate steps en route to deriving +an answer. The CoT reasoning approach has not only exhibited proficiency in +amplifying reasoning performance but also in enhancing interpretability, +controllability, and flexibility. In light of these merits, recent research +endeavors have extended CoT reasoning methodologies to nurture the development +of autonomous language agents, which adeptly adhere to language instructions +and execute actions within varied environments. This survey paper orchestrates +a thorough discourse, penetrating vital research dimensions, encompassing: (i) +the foundational mechanics of CoT techniques, with a focus on elucidating the +circumstances and justification behind its efficacy; (ii) the paradigm shift in +CoT; and (iii) the burgeoning of language agents fortified by CoT approaches. +Prospective research avenues envelop explorations into generalization, +efficiency, customization, scaling, and safety. This paper caters to a wide +audience, including beginners seeking comprehensive knowledge of CoT reasoning +and language agents, as well as experienced researchers interested in +foundational mechanics and engaging in cutting-edge discussions on these +topics. A repository for the related papers is available at +https://github.com/Zoeyyao27/CoT-Igniting-Agent. + +
+
+
+
+
+ + ☆ Beyond Boundaries: A Comprehensive Survey of Transferable Attacks on AI + Systems + + +
+ Artificial Intelligence (AI) systems such as autonomous vehicles, facial +recognition, and speech recognition systems are increasingly integrated into +our daily lives. However, despite their utility, these AI systems are +vulnerable to a wide range of attacks such as adversarial, backdoor, data +poisoning, membership inference, model inversion, and model stealing attacks. +In particular, numerous attacks are designed to target a particular model or +system, yet their effects can spread to additional targets, referred to as +transferable attacks. Although considerable efforts have been directed toward +developing transferable attacks, a holistic understanding of the advancements +in transferable attacks remains elusive. In this paper, we comprehensively +explore learning-based attacks from the perspective of transferability, +particularly within the context of cyber-physical security. We delve into +different domains -- the image, text, graph, audio, and video domains -- to +highlight the ubiquitous and pervasive nature of transferable attacks. This +paper categorizes and reviews the architecture of existing attacks from various +viewpoints: data, process, model, and system. We further examine the +implications of transferable attacks in practical scenarios such as autonomous +driving, speech recognition, and large language models (LLMs). Additionally, we +outline the potential research directions to encourage efforts in exploring the +landscape of transferable attacks. This survey offers a holistic understanding +of the prevailing transferable attacks and their impacts across different +domains. + +
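A minimal sketch of the transferability setting in the image domain, assuming torchvision is available: an FGSM perturbation is crafted against a surrogate model the attacker owns and then evaluated on a different target model that is never queried during the attack. Model choices, epsilon, and the omission of ImageNet normalization are simplifications for illustration.

```python
import torch
import torch.nn.functional as F
from torchvision import models

surrogate = models.resnet18(weights="IMAGENET1K_V1").eval()   # model the attacker owns
target = models.resnet50(weights="IMAGENET1K_V1").eval()      # unseen victim model

def fgsm_transfer(x, y, eps=4 / 255):
    """Craft an FGSM example on the surrogate, then test it on the target."""
    x = x.clone().requires_grad_(True)
    loss = F.cross_entropy(surrogate(x), y)
    loss.backward()
    x_adv = (x + eps * x.grad.sign()).clamp(0, 1).detach()
    with torch.no_grad():
        fooled = target(x_adv).argmax(1) != y                 # did the attack transfer?
    return x_adv, fooled

x = torch.rand(2, 3, 224, 224)          # stand-in images in [0, 1]
y = torch.tensor([207, 281])            # stand-in labels
_, fooled = fgsm_transfer(x, y)
print("transferred on", fooled.sum().item(), "of", len(y), "samples")
```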
+
+
+
+
+ + ☆ Robust Tumor Segmentation with Hyperspectral Imaging and Graph Neural + Networks + + +
+ Segmenting the boundary between tumor and healthy tissue during surgical +cancer resection poses a significant challenge. In recent years, Hyperspectral +Imaging (HSI) combined with Machine Learning (ML) has emerged as a promising +solution. However, due to the extensive information contained within the +spectral domain, most ML approaches primarily classify individual HSI +(super-)pixels, or tiles, without taking into account their spatial context. In +this paper, we propose an improved methodology that leverages the spatial +context of tiles for more robust and smoother segmentation. To address the +irregular shapes of tiles, we utilize Graph Neural Networks (GNNs) to propagate +context information across neighboring regions. The features for each tile +within the graph are extracted using a Convolutional Neural Network (CNN), +which is trained simultaneously with the subsequent GNN. Moreover, we +incorporate local image quality metrics into the loss function to enhance the +training procedure's robustness against low-quality regions in the training +images. We demonstrate the superiority of our proposed method using a clinical +ex vivo dataset consisting of 51 HSI images from 30 patients. Despite the +limited dataset, the GNN-based model significantly outperforms context-agnostic +approaches, accurately distinguishing between healthy and tumor tissues, even +in images from previously unseen patients. Furthermore, we show that our +carefully designed loss function, accounting for local image quality, results +in additional improvements. Our findings demonstrate that context-aware GNN +algorithms can robustly find tumor demarcations on HSI images, ultimately +contributing to better surgery success and patient outcome. + +
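A minimal sketch (plain PyTorch, not the authors' code) of the pipeline described above: a small CNN embeds each HSI tile, a graph layer averages information over neighbouring tiles, and a head classifies each tile as tumor or healthy. The tile adjacency matrix, band count, and layer sizes are assumed.

```python
import torch
import torch.nn as nn

class TileEncoder(nn.Module):
    """Tiny CNN turning one hyperspectral tile (C spectral bands) into a feature vector."""
    def __init__(self, bands=16, dim=64):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(bands, 32, 3, padding=1), nn.ReLU(),
            nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(32, dim), nn.ReLU())
    def forward(self, tiles):                       # (N, bands, H, W)
        return self.net(tiles)                      # (N, dim)

class GraphLayer(nn.Module):
    """Mean aggregation over neighbouring tiles followed by a linear update."""
    def __init__(self, dim):
        super().__init__()
        self.lin = nn.Linear(2 * dim, dim)
    def forward(self, h, adj):                      # adj: (N, N) 0/1 adjacency
        deg = adj.sum(1, keepdim=True).clamp(min=1)
        neigh = adj @ h / deg                       # average of neighbour features
        return torch.relu(self.lin(torch.cat([h, neigh], dim=1)))

encoder, gnn, head = TileEncoder(), GraphLayer(64), nn.Linear(64, 2)   # 2 = tumor / healthy
tiles = torch.rand(10, 16, 32, 32)                  # 10 tiles from one image
adj = (torch.rand(10, 10) > 0.7).float()
adj = ((adj + adj.T) > 0).float()                   # stand-in symmetric tile adjacency
logits = head(gnn(encoder(tiles), adj))
print(logits.shape)                                 # (10, 2) per-tile predictions
```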
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ Multimodal deep learning for mapping forest dominant height by fusing + GEDI with earth observation data + + +
+ The integration of multisource remote sensing data and deep learning models
+offers new possibilities for accurately mapping high spatial resolution forest
+height. We found that GEDI relative height (RH) metrics exhibited strong
+correlation with the mean of the top 10 highest trees (dominant height)
+measured in situ at the corresponding footprint locations. Consequently, we
+proposed a novel deep learning framework termed the multi-modal attention
+remote sensing network (MARSNet) to estimate forest dominant height by
+extrapolating dominant height derived from GEDI, using Sentinel-1 data, ALOS-2
+PALSAR-2 data, Sentinel-2 optical data and ancillary data. MARSNet comprises
+separate encoders for each remote sensing data modality to extract multi-scale
+features, and a shared decoder to fuse the features and estimate height. Using
+an individual encoder for each type of remote sensing imagery avoids
+interference across modalities and extracts distinct representations. To focus
+on the efficacious information from each dataset, we reduced the prevalent
+spatial and band redundancies in each remote sensing dataset by incorporating
+the extended spatial and band reconstruction convolution modules in the
+encoders. MARSNet achieved commendable performance in estimating dominant
+height, with an R2 of 0.62 and RMSE of 2.82 m, outperforming the widely used
+random forest approach which attained an R2 of 0.55 and RMSE of 3.05 m.
+Finally, we applied the trained MARSNet model to generate wall-to-wall maps at
+10 m resolution for Jilin, China. Through independent validation using field
+measurements, MARSNet demonstrated an R2 of 0.58 and RMSE of 3.76 m, compared
+to 0.41 and 4.37 m for the random forest baseline. Our research demonstrates
+the effectiveness of a multimodal deep learning approach fusing GEDI with SAR
+and passive optical imagery for enhancing the accuracy of high resolution
+dominant height estimation.
+
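A minimal sketch of the multi-branch layout described above: one encoder per data source and one shared decoder that fuses the features and regresses a per-pixel height map. Channel counts, layer sizes, and modality names are illustrative assumptions, not the MARSNet configuration.

```python
import torch
import torch.nn as nn

def encoder(in_ch):
    return nn.Sequential(
        nn.Conv2d(in_ch, 32, 3, padding=1), nn.ReLU(),
        nn.Conv2d(32, 32, 3, padding=1), nn.ReLU())

class MultiModalHeightNet(nn.Module):
    """Separate encoder per modality, shared decoder regressing dominant height."""
    def __init__(self, channels):
        super().__init__()
        self.encoders = nn.ModuleDict({k: encoder(c) for k, c in channels.items()})
        self.decoder = nn.Sequential(
            nn.Conv2d(32 * len(channels), 64, 3, padding=1), nn.ReLU(),
            nn.Conv2d(64, 1, 1))                              # height in metres

    def forward(self, inputs):                                # dict of (B, C_k, H, W) tensors
        feats = [self.encoders[k](x) for k, x in inputs.items()]
        return self.decoder(torch.cat(feats, dim=1)).squeeze(1)

net = MultiModalHeightNet({"s1": 2, "s2": 10, "palsar": 2})
batch = {"s1": torch.rand(2, 2, 64, 64), "s2": torch.rand(2, 10, 64, 64),
         "palsar": torch.rand(2, 2, 64, 64)}
print(net(batch).shape)                                       # (2, 64, 64) height map
```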
+
+
+
+
+ + ☆ Practical cross-sensor color constancy using a dual-mapping strategy + + +
+ Deep Neural Networks (DNNs) have been widely used for illumination
+estimation, which is time-consuming and requires sensor-specific data
+collection. Our proposed method uses a dual-mapping strategy and only requires
+a simple white point from a test sensor under a D65 condition. This allows us
+to derive a mapping matrix, enabling the reconstruction of image data and
+illuminants. In the second mapping phase, we transform the reconstructed image
+data into sparse features, which are then optimized with a lightweight
+multi-layer perceptron (MLP) model using the reconstructed illuminants as
+ground truths. This approach effectively reduces sensor discrepancies and
+delivers performance on par with leading cross-sensor methods. It only
+requires a small amount of memory (~0.003 MB) and takes ~1 hour to train on an
+RTX3070Ti GPU. More importantly, the method runs very fast, taking ~0.3 ms on
+a GPU and ~1 ms on a CPU, and is not sensitive to the input image resolution.
+Therefore, it offers a practical solution to the great challenge of data
+recollection faced by the industry.
+
+
+
+
+
+ + ☆ A Good Feature Extractor Is All You Need for Weakly Supervised Learning + in Histopathology + + +
+ Deep learning is revolutionising pathology, offering novel opportunities in +disease prognosis and personalised treatment. Historically, stain normalisation +has been a crucial preprocessing step in computational pathology pipelines, and +persists into the deep learning era. Yet, with the emergence of feature +extractors trained using self-supervised learning (SSL) on diverse pathology +datasets, we call this practice into question. In an empirical evaluation of +publicly available feature extractors, we find that omitting stain +normalisation and image augmentations does not compromise downstream +performance, while incurring substantial savings in memory and compute. +Further, we show that the top-performing feature extractors are remarkably +robust to variations in stain and augmentations like rotation in their latent +space. Contrary to previous patch-level benchmarking studies, our approach +emphasises clinical relevance by focusing on slide-level prediction tasks in a +weakly supervised setting with external validation cohorts. This work +represents the most comprehensive robustness evaluation of public pathology SSL +feature extractors to date, involving more than 6,000 training runs across nine +tasks, five datasets, three downstream architectures, and various preprocessing +setups. Our findings stand to streamline digital pathology workflows by +minimising preprocessing needs and informing the selection of feature +extractors. + +
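A minimal sketch of the weakly supervised slide-level setup being evaluated: frozen patch features from a pretrained (e.g. SSL) extractor are pooled with attention-based multiple-instance learning into a single slide prediction. This is one common downstream architecture, not necessarily the exact one used in the study; feature dimension and class count are assumed.

```python
import torch
import torch.nn as nn

class ABMIL(nn.Module):
    """Attention-based MIL pooling: weights each patch feature, sums them,
    and classifies the resulting slide-level embedding."""
    def __init__(self, dim=768, hidden=128, classes=2):
        super().__init__()
        self.attn = nn.Sequential(nn.Linear(dim, hidden), nn.Tanh(), nn.Linear(hidden, 1))
        self.head = nn.Linear(dim, classes)

    def forward(self, patch_feats):                           # (num_patches, dim), frozen features
        w = torch.softmax(self.attn(patch_feats), dim=0)      # (num_patches, 1) attention weights
        slide = (w * patch_feats).sum(0)                      # weighted average embedding
        return self.head(slide), w

# Features for one slide, as produced by any frozen SSL feature extractor
# (no stain normalisation or augmentation applied, per the paper's finding).
feats = torch.rand(500, 768)
logits, attention = ABMIL()(feats)
print(logits.shape, attention.shape)                          # (2,), (500, 1)
```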
+
+
+
+
+ + ☆ Non-Contact NIR PPG Sensing through Large Sequence Signal Regression + + +
+ Non-Contact sensing is an emerging technology with applications across many +industries from driver monitoring in vehicles to patient monitoring in +healthcare. Current state-of-the-art implementations focus on RGB video, but +this struggles in varying/noisy light conditions and is almost completely +unfeasible in the dark. Near Infra-Red (NIR) video, however, does not suffer +from these constraints. This paper aims to demonstrate the effectiveness of an +alternative Convolution Attention Network (CAN) architecture, to regress +photoplethysmography (PPG) signal from a sequence of NIR frames. A combination +of two publicly available datasets, which is split into train and test sets, is +used for training the CAN. This combined dataset is augmented to reduce +overfitting to the 'normal' 60 - 80 bpm heart rate range by providing the full +range of heart rates along with corresponding videos for each subject. This +CAN, when implemented over video cropped to the subject's head, achieved a Mean +Average Error (MAE) of just 0.99 bpm, proving its effectiveness on NIR video +and the architecture's feasibility to regress an accurate signal output. + +
+
+ comment: 4 pages, 3 figures, 3 tables, Irish Machine Vision and Image + Processing Conference 2023 +
+
+
+
+
+ + ☆ A Large-Scale Car Parts (LSCP) Dataset for Lightweight Fine-Grained + Detection + + +
+ Automotive-related datasets have previously been used for training autonomous
+driving systems or vehicle classification tasks. However, there is a lack of
+datasets in the field of automotive AI for car parts detection, and most
+available datasets are limited in size and scope, struggling to cover diverse
+scenarios. To address this gap, this paper presents a large-scale and
+fine-grained automotive dataset consisting of 84,162 images for detecting 12
+different types of car parts. This dataset was collected from natural cameras
+and online websites, and covers various car brands, scenarios, and shooting
+angles. To alleviate the burden of manual annotation, we propose a novel
+semi-supervised auto-labeling method that leverages state-of-the-art
+pre-trained detectors. Moreover, we study the limitations of the Grounding
+DINO approach for zero-shot labeling. Finally, we evaluate the effectiveness
+of our proposed dataset through fine-grained car parts detection by training
+several lightweight YOLO-series detectors.
+
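A minimal sketch of the generic auto-labeling idea (confidence-filtered pseudo-annotations from a pretrained detector); the `pretrained_detector` callable, its output format, and the threshold are placeholders, not the paper's method.

```python
def auto_label(images, pretrained_detector, score_thresh=0.6):
    """Turn detector outputs into pseudo-labels, keeping only confident boxes.
    `pretrained_detector(image)` is assumed to return a list of
    (class_name, score, [x1, y1, x2, y2]) tuples."""
    dataset = []
    for idx, image in enumerate(images):
        boxes = [
            {"label": cls, "bbox": bbox, "score": float(score)}
            for cls, score, bbox in pretrained_detector(image)
            if score >= score_thresh       # low-confidence boxes go to manual review instead
        ]
        dataset.append({"image_id": idx, "annotations": boxes})
    return dataset

# Toy usage with a stand-in detector.
fake_detector = lambda img: [("wheel", 0.9, [10, 10, 50, 50]), ("mirror", 0.3, [0, 0, 5, 5])]
print(auto_label([object()], fake_detector))
```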
+
+
+
+
+ + ☆ AdvGen: Physical Adversarial Attack on Face Presentation Attack + Detection Systems + + +
+ Evaluating the risk level of adversarial images is essential for safely
+deploying face authentication models in the real world. Popular approaches for
+physical-world attacks, such as print or replay attacks, suffer from
+limitations, such as introducing physical and geometrical artifacts. Recently,
+adversarial attacks have gained traction; these attacks try to digitally
+deceive the learning strategy of a recognition system using slight
+modifications to the captured image. While most previous research assumes that
+the adversarial image could be digitally fed into the authentication systems,
+this is not always the case for systems deployed in the real world. This paper
+demonstrates the vulnerability of face authentication systems to adversarial
+images in physical-world scenarios. We propose AdvGen, an automated Generative
+Adversarial Network, to simulate print and replay attacks and generate
+adversarial images that can fool state-of-the-art PADs in a physical domain
+attack setting. Using this attack strategy, the attack success rate goes up to
+82.01%. We test AdvGen extensively on four datasets and ten state-of-the-art
+PADs. We also demonstrate the effectiveness of our attack by conducting
+experiments in a realistic, physical environment.
+
+
+ comment: 10 pages, 9 figures, Accepted to the International Joint Conference + on Biometrics (IJCB 2023) +
+
+
+
+
+ + ☆ Fuzzy Information Seeded Region Growing for Automated Lesions After + Stroke Segmentation in MR Brain Images + + +
+ In the realm of medical imaging, precise segmentation of stroke lesions from +brain MRI images stands as a critical challenge with significant implications +for patient diagnosis and treatment. Addressing this, our study introduces an +innovative approach using a Fuzzy Information Seeded Region Growing (FISRG) +algorithm. Designed to effectively delineate the complex and irregular +boundaries of stroke lesions, the FISRG algorithm combines fuzzy logic with +Seeded Region Growing (SRG) techniques, aiming to enhance segmentation +accuracy. + The research involved three experiments to optimize the FISRG algorithm's +performance, each focusing on different parameters to improve the accuracy of +stroke lesion segmentation. The highest Dice score achieved in these +experiments was 94.2\%, indicating a high degree of similarity between the +algorithm's output and the expert-validated ground truth. Notably, the best +average Dice score, amounting to 88.1\%, was recorded in the third experiment, +highlighting the efficacy of the algorithm in consistently segmenting stroke +lesions across various slices. + Our findings reveal the FISRG algorithm's strengths in handling the +heterogeneity of stroke lesions. However, challenges remain in areas of abrupt +lesion topology changes and in distinguishing lesions from similar intensity +brain regions. The results underscore the potential of the FISRG algorithm in +contributing significantly to advancements in medical imaging analysis for +stroke diagnosis and treatment. + +
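A minimal sketch of seeded region growing with a fuzzy (Gaussian) membership criterion, in the spirit of FISRG: starting from a seed, 4-neighbours are added while their membership to the region's running intensity distribution stays above a threshold. The membership function and parameters are illustrative; the actual FISRG rules are in the linked repository.

```python
from collections import deque
import numpy as np

def fuzzy_region_grow(img, seed, membership_thresh=0.5, sigma=10.0):
    """Grow a region from `seed` in a 2D image: a 4-neighbour joins if its
    Gaussian membership to the current region mean exceeds the threshold."""
    mask = np.zeros(img.shape, dtype=bool)
    mask[seed] = True
    total, count = float(img[seed]), 1
    queue = deque([seed])
    while queue:
        y, x = queue.popleft()
        for ny, nx in ((y + 1, x), (y - 1, x), (y, x + 1), (y, x - 1)):
            if 0 <= ny < img.shape[0] and 0 <= nx < img.shape[1] and not mask[ny, nx]:
                mean = total / count
                membership = np.exp(-((img[ny, nx] - mean) ** 2) / (2 * sigma ** 2))
                if membership >= membership_thresh:          # fuzzy acceptance rule
                    mask[ny, nx] = True
                    total += float(img[ny, nx]); count += 1
                    queue.append((ny, nx))
    return mask

img = np.zeros((64, 64)); img[20:40, 20:40] = 100            # bright "lesion" on dark background
print(fuzzy_region_grow(img, (30, 30)).sum())                # ~400 pixels recovered
```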
+
+ comment: 10 pages, 14 figures. Associated code and data available at: + https://github.com/Mawio02/FISRG-for-Automated-Lesion-After-Stroke-Segmentation-in-MRI +
+
+
+
+
+ + ☆ Sparse4D v3: Advancing End-to-End 3D Detection and Tracking + + +
+ In autonomous driving perception systems, 3D detection and tracking are the +two fundamental tasks. This paper delves deeper into this field, building upon +the Sparse4D framework. We introduce two auxiliary training tasks (Temporal +Instance Denoising and Quality Estimation) and propose decoupled attention to +make structural improvements, leading to significant enhancements in detection +performance. Additionally, we extend the detector into a tracker using a +straightforward approach that assigns instance ID during inference, further +highlighting the advantages of query-based algorithms. Extensive experiments +conducted on the nuScenes benchmark validate the effectiveness of the proposed +improvements. With ResNet50 as the backbone, we witnessed enhancements of +3.0\%, 2.2\%, and 7.6\% in mAP, NDS, and AMOTA, achieving 46.9\%, 56.1\%, and +49.0\%, respectively. Our best model achieved 71.9\% NDS and 67.7\% AMOTA on +the nuScenes test set. Code will be released at +\url{https://github.com/linxuewu/Sparse4D}. + +
+
+
+
+
+ + ☆ On the Importance of Large Objects in CNN Based Object Detection + Algorithms + + +
+ Object detection models, a prominent class of machine learning algorithms,
+aim to identify and precisely locate objects in images or videos. However,
+this task can yield uneven performance, sometimes caused by object size and by
+the quality of the images and labels used for training. In this paper, we
+highlight the importance of large objects in learning features that are
+critical for all sizes. Given these findings, we propose to introduce a
+weighting term into the training loss. This term is a function of the object's
+area. We show that giving more weight to large objects leads to improved
+detection scores across all object sizes and thus an overall improvement in
+object detector performance (+2 p.p. of mAP on small objects, +2 p.p. on
+medium objects and +4 p.p. on large objects on COCO val 2017 with
+InternImage-T). Additional experiments and ablation studies with different
+models and on a different dataset further confirm the robustness of our
+findings.
+
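A minimal sketch of this kind of loss re-weighting: each object's loss term is scaled by a function of its box area so large objects contribute more. The abstract does not give the exact weighting function, so a simple normalized-area weight is assumed here.

```python
import torch

def area_weighted_loss(per_object_loss, boxes, alpha=1.0):
    """Scale per-object losses by a weight that grows with box area.
    boxes: (N, 4) as [x1, y1, x2, y2]; per_object_loss: (N,)."""
    areas = (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
    weights = 1.0 + alpha * areas / areas.mean()       # assumed form, not the paper's exact term
    return (weights * per_object_loss).mean()

losses = torch.tensor([0.8, 0.5, 0.3])
boxes = torch.tensor([[0., 0., 10., 10.],      # small object
                      [0., 0., 80., 80.],      # medium object
                      [0., 0., 300., 300.]])   # large object, gets the biggest weight
print(area_weighted_loss(losses, boxes))
```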
+
+
+
+
+ + ☆ GS-SLAM: Dense Visual SLAM with 3D Gaussian Splatting + + +
+ In this paper, we introduce $\textbf{GS-SLAM}$, which is the first to utilize
+a 3D Gaussian representation in a Simultaneous Localization and Mapping (SLAM)
+system. It facilitates a better balance between efficiency and accuracy.
+Compared to recent SLAM methods employing neural implicit representations, our
+method utilizes a real-time differentiable splatting rendering pipeline that
+offers significant speedup to map optimization and RGB-D re-rendering.
+Specifically, we propose an adaptive expansion strategy that adds new or
+deletes noisy 3D Gaussians in order to efficiently reconstruct newly observed
+scene geometry and improve the mapping of previously observed areas. This
+strategy is essential to extend the 3D Gaussian representation to reconstruct
+the whole scene rather than to synthesize a static object as in existing
+methods. Moreover, in the pose tracking process, an effective coarse-to-fine
+technique is designed to select reliable 3D Gaussian representations to
+optimize camera pose, resulting in runtime reduction and robust estimation.
+Our method achieves competitive performance compared with existing
+state-of-the-art real-time methods on the Replica and TUM-RGBD datasets. The
+source code will be released upon acceptance.
+
+
+
+
+
+ + ☆ Cut-and-Paste: Subject-Driven Video Editing with Attention Control + + +
+ This paper presents a novel framework termed Cut-and-Paste for real-world
+semantic video editing under the guidance of a text prompt and an additional
+reference image. While text-driven video editing has demonstrated a remarkable
+ability to generate highly diverse videos following given text prompts,
+fine-grained semantic edits are hard to control with a plain textual prompt
+alone in terms of object details and the edited region, and cumbersome long
+text descriptions are usually needed for the task. We therefore investigate
+subject-driven video editing for more precise control of both edited regions
+and background preservation, and fine-grained semantic generation. We achieve
+this goal by introducing a reference image as supplementary input to
+text-driven video editing, which avoids the need to come up with a cumbersome
+text prompt describing the detailed appearance of the object. To limit the
+editing area, we refer to a method of cross attention control in image editing
+and successfully extend it to video editing by fusing the attention maps of
+adjacent frames, which strikes a balance between maintaining the video
+background and spatio-temporal consistency. Compared with current methods, the
+whole process of our method is like ``cutting'' the source object to be edited
+and then ``pasting'' the target object provided by the reference image. We
+demonstrate that our method performs favorably over prior art for video
+editing under the guidance of a text prompt and an extra reference image, as
+measured by both quantitative and subjective evaluations.
+
+
+
+
+
+ + ☆ Clarity ChatGPT: An Interactive and Adaptive Processing System for Image + Restoration and Enhancement + + +
+ The generalization capability of existing image restoration and enhancement +(IRE) methods is constrained by the limited pre-trained datasets, making it +difficult to handle agnostic inputs such as different degradation levels and +scenarios beyond their design scopes. Moreover, they are not equipped with +interactive mechanisms to consider user preferences or feedback, and their +end-to-end settings cannot provide users with more choices. Faced with the +above-mentioned IRE method's limited performance and insufficient +interactivity, we try to solve it from the engineering and system framework +levels. Specifically, we propose Clarity ChatGPT-a transformative system that +combines the conversational intelligence of ChatGPT with multiple IRE methods. +Clarity ChatGPT can automatically detect image degradation types and select +appropriate IRE methods to restore images, or iteratively generate satisfactory +results based on user feedback. Its innovative features include a CLIP-powered +detector for accurate degradation classification, no-reference image quality +evaluation for performance evaluation, region-specific processing for precise +enhancements, and advanced fusion techniques for optimal restoration results. +Clarity ChatGPT marks a significant advancement in integrating language and +vision, enhancing image-text interactions, and providing a robust, +high-performance IRE solution. Our case studies demonstrate that Clarity +ChatGPT effectively improves the generalization and interaction capabilities in +the IRE, and also fills the gap in the low-level domain of the existing +vision-language model. + +
+
+
+
+
+ + ☆ Segment Together: A Versatile Paradigm for Semi-Supervised Medical Image + Segmentation + + +
+ Annotation scarcity has become a major obstacle for training powerful +deep-learning models for medical image segmentation, restricting their +deployment in clinical scenarios. To address it, semi-supervised learning by +exploiting abundant unlabeled data is highly desirable to boost the model +training. However, most existing works still focus on limited medical tasks and +underestimate the potential of learning across diverse tasks and multiple +datasets. Therefore, in this paper, we introduce a \textbf{Ver}satile +\textbf{Semi}-supervised framework (VerSemi) to point out a new perspective +that integrates various tasks into a unified model with a broad label space, to +exploit more unlabeled data for semi-supervised medical image segmentation. +Specifically, we introduce a dynamic task-prompted design to segment various +targets from different datasets. Next, this unified model is used to identify +the foreground regions from all labeled data, to capture cross-dataset +semantics. Particularly, we create a synthetic task with a cutmix strategy to +augment foreground targets within the expanded label space. To effectively +utilize unlabeled data, we introduce a consistency constraint. This involves +aligning aggregated predictions from various tasks with those from the +synthetic task, further guiding the model in accurately segmenting foreground +regions during training. We evaluated our VerSemi model on four public +benchmarking datasets. Extensive experiments demonstrated that VerSemi can +consistently outperform the second-best method by a large margin (e.g., an +average 2.69\% Dice gain on four datasets), setting new SOTA performance for +semi-supervised medical image segmentation. The code will be released. + +
+
+
+
+
+ + ☆ ViP-Mixer: A Convolutional Mixer for Video Prediction + + +
+ Video prediction aims to predict future frames from a video's previous +content. Existing methods mainly process video data where the time dimension +mingles with the space and channel dimensions from three distinct angles: as a +sequence of individual frames, as a 3D volume in spatiotemporal coordinates, or +as a stacked image where frames are treated as separate channels. Most of them +generally focus on one of these perspectives and may fail to fully exploit the +relationships across different dimensions. To address this issue, this paper +introduces a convolutional mixer for video prediction, termed ViP-Mixer, to +model the spatiotemporal evolution in the latent space of an autoencoder. The +ViP-Mixers are stacked sequentially and interleave feature mixing at three +levels: frames, channels, and locations. Extensive experiments demonstrate that +our proposed method achieves new state-of-the-art prediction performance on +three benchmark video datasets covering both synthetic and real-world +scenarios. + +
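A minimal sketch of the three-way mixing idea (frames, channels, and spatial locations) using pointwise and depthwise convolutions over a latent video tensor. This is an illustration of interleaved mixing under assumed tensor shapes, not the published ViP-Mixer block.

```python
import torch
import torch.nn as nn

class MixerBlock(nn.Module):
    """Interleaves mixing along frames (T), channels (C), and locations (H, W)
    of a latent video tensor shaped (B, T, C, H, W)."""
    def __init__(self, frames, channels):
        super().__init__()
        self.frame_mix = nn.Conv1d(frames, frames, 1)               # mixes across time
        self.channel_mix = nn.Conv2d(channels, channels, 1)         # mixes across channels
        self.location_mix = nn.Conv2d(channels, channels, 3,
                                      padding=1, groups=channels)   # depthwise: mixes space

    def forward(self, x):
        b, t, c, h, w = x.shape
        x = x + self.frame_mix(x.reshape(b, t, c * h * w)).reshape(b, t, c, h, w)
        y = x.reshape(b * t, c, h, w)
        y = y + self.channel_mix(y)
        y = y + self.location_mix(y)
        return y.reshape(b, t, c, h, w)

block = MixerBlock(frames=8, channels=16)
latent = torch.rand(2, 8, 16, 32, 32)       # autoencoder latent of an 8-frame clip
print(block(latent).shape)
```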
+
+ comment: Under review +
+
+
+
+
+ + ☆ PMP-Swin: Multi-Scale Patch Message Passing Swin Transformer for Retinal + Disease Classification + + +
+ Retinal disease is one of the primary causes of visual impairment, and early
+diagnosis is essential for preventing further deterioration. Nowadays, many
+works have explored Transformers for diagnosing diseases due to their strong
+visual representation capabilities. However, retinal diseases exhibit milder
+forms and often present with overlapping signs, which pose great difficulties
+for accurate multi-class classification. Therefore, we propose a new framework
+named Multi-Scale Patch Message Passing Swin Transformer for multi-class
+retinal disease classification. Specifically, we design a Patch Message
+Passing (PMP) module based on the Message Passing mechanism to establish
+global interaction for pathological semantic features and to further exploit
+the subtle differences between different diseases. Moreover, considering the
+varying scales of pathological features, we integrate multiple PMP modules for
+different patch sizes. For evaluation, we have constructed a new dataset,
+named the OPTOS dataset, consisting of 1,033 high-resolution fundus images
+photographed by an Optos camera, and conducted comprehensive experiments to
+validate the efficacy of our proposed method. The results on both the public
+dataset and our dataset demonstrate that our method achieves remarkable
+performance compared to state-of-the-art methods.
+
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ OmniSeg3D: Omniversal 3D Segmentation via Hierarchical Contrastive + Learning + + +
+ Towards holistic understanding of 3D scenes, a general 3D segmentation method
+is needed that can segment diverse objects without restrictions on object
+quantity or categories, while also reflecting the inherent hierarchical
+structure. To achieve this, we propose OmniSeg3D, an omniversal segmentation
+method that aims to segment anything in 3D all at once. The key insight is to
+lift multi-view inconsistent 2D segmentations into a consistent 3D feature
+field through a hierarchical contrastive learning framework, which is
+accomplished in two steps. Firstly, we design a novel hierarchical
+representation based on category-agnostic 2D segmentations to model the
+multi-level relationship among pixels. Secondly, image features rendered from
+the 3D feature field are clustered at different levels, which can be further
+drawn closer or pushed apart according to the hierarchical relationship
+between different levels. In tackling the challenges posed by inconsistent 2D
+segmentations, this framework yields a globally consistent 3D feature field,
+which further enables hierarchical segmentation, multi-object selection, and
+global discretization. Extensive experiments demonstrate the effectiveness of
+our method on high-quality 3D segmentation and accurate hierarchical structure
+understanding. A graphical user interface further facilitates flexible
+interaction for omniversal 3D segmentation.
+
+
+
+
+
+ + ☆ Enhanced Spatio-Temporal Context for Temporally Consistent Robust 3D + Human Motion Recovery from Monocular Videos + + +
+ Recovering temporally consistent 3D human body pose, shape and motion from a
+monocular video is a challenging task due to (self-)occlusions, poor lighting
+conditions, complex articulated body poses, depth ambiguity, and limited
+availability of annotated data. Further, doing a simple per-frame estimation
+is insufficient as it leads to jittery and implausible results. In this paper,
+we propose a novel method for temporally consistent motion estimation from a
+monocular video. Instead of using generic ResNet-like features, our method
+uses a body-aware feature representation and an independent per-frame pose and
+camera initialization over a temporal window, followed by a novel
+spatio-temporal feature aggregation that uses a combination of self-similarity
+and self-attention over the body-aware features and the per-frame
+initialization. Together, they yield enhanced spatio-temporal context for
+every frame by considering the remaining past and future frames. These
+features are used to predict the pose and shape parameters of the human body
+model, which are further refined using an LSTM. Experimental results on the
+publicly available benchmark data show that our method attains significantly
+lower acceleration error and outperforms the existing state-of-the-art methods
+over all key quantitative evaluation metrics, including in complex scenarios
+like partial occlusion, complex poses and even relatively low illumination.
+
+
+
+
+
+ + ☆ MGCT: Mutual-Guided Cross-Modality Transformer for Survival Outcome + Prediction using Integrative Histopathology-Genomic Features + + +
+ The rapidly emerging field of deep learning-based computational pathology has +shown promising results in utilizing whole slide images (WSIs) to objectively +prognosticate cancer patients. However, most prognostic methods are currently +limited to either histopathology or genomics alone, which inevitably reduces +their potential to accurately predict patient prognosis. Whereas integrating +WSIs and genomic features presents three main challenges: (1) the enormous +heterogeneity of gigapixel WSIs which can reach sizes as large as +150,000x150,000 pixels; (2) the absence of a spatially corresponding +relationship between histopathology images and genomic molecular data; and (3) +the existing early, late, and intermediate multimodal feature fusion strategies +struggle to capture the explicit interactions between WSIs and genomics. To +ameliorate these issues, we propose the Mutual-Guided Cross-Modality +Transformer (MGCT), a weakly-supervised, attention-based multimodal learning +framework that can combine histology features and genomic features to model the +genotype-phenotype interactions within the tumor microenvironment. To validate +the effectiveness of MGCT, we conduct experiments using nearly 3,600 gigapixel +WSIs across five different cancer types sourced from The Cancer Genome Atlas +(TCGA). Extensive experimental results consistently emphasize that MGCT +outperforms the state-of-the-art (SOTA) methods. + +
+
+ comment: 7 pages, 4 figures, accepted by 2023 IEEE International Conference on + Bioinformatics and Biomedicine (BIBM 2023) +
+
+
+
+
+ + ☆ Double-Condensing Attention Condenser: Leveraging Attention in Deep + Learning to Detect Skin Cancer from Skin Lesion Images + + +
+ Skin cancer is the most common type of cancer in the United States and is +estimated to affect one in five Americans. Recent advances have demonstrated +strong performance on skin cancer detection, as exemplified by state of the art +performance in the SIIM-ISIC Melanoma Classification Challenge; however these +solutions leverage ensembles of complex deep neural architectures requiring +immense storage and compute costs, and therefore may not be tractable. A recent +movement for TinyML applications is integrating Double-Condensing Attention +Condensers (DC-AC) into a self-attention neural network backbone architecture +to allow for faster and more efficient computation. This paper explores +leveraging an efficient self-attention structure to detect skin cancer in skin +lesion images and introduces a deep neural network design with DC-AC customized +for skin cancer detection from skin lesion images. The final model is publicly +available as a part of a global open-source initiative dedicated to +accelerating advancement in machine learning to aid clinicians in the fight +against cancer. + +
+
+
+
+
+ + ☆ Cancer-Net PCa-Data: An Open-Source Benchmark Dataset for Prostate + Cancer Clinical Decision Support using Synthetic Correlated Diffusion Imaging + Data + + +
+ The recent introduction of synthetic correlated diffusion (CDI$^s$) imaging +has demonstrated significant potential in the realm of clinical decision +support for prostate cancer (PCa). CDI$^s$ is a new form of magnetic resonance +imaging (MRI) designed to characterize tissue characteristics through the joint +correlation of diffusion signal attenuation across different Brownian motion +sensitivities. Despite the performance improvement, the CDI$^s$ data for PCa +has not been previously made publicly available. In our commitment to advance +research efforts for PCa, we introduce Cancer-Net PCa-Data, an open-source +benchmark dataset of volumetric CDI$^s$ imaging data of PCa patients. +Cancer-Net PCa-Data consists of CDI$^s$ volumetric images from a patient cohort +of 200 patient cases, along with full annotations (gland masks, tumor masks, +and PCa diagnosis for each tumor). We also analyze the demographic and label +region diversity of Cancer-Net PCa-Data for potential biases. Cancer-Net +PCa-Data is the first-ever public dataset of CDI$^s$ imaging data for PCa, and +is a part of the global open-source initiative dedicated to advancement in +machine learning and imaging research to aid clinicians in the global fight +against cancer. + +
+
+
+
+
+ + ☆ CastDet: Toward Open Vocabulary Aerial Object Detection with + CLIP-Activated Student-Teacher Learning + + +
+ Object detection in aerial images is a pivotal task for various earth
+observation applications, whereas current algorithms learn to detect only a
+pre-defined set of object categories, demand sufficient bounding-box-annotated
+training samples, and fail to detect novel object categories. In this paper,
+we consider open-vocabulary object detection (OVD) in aerial images, which
+enables the characterization of new objects on the earth's surface beyond the
+training categories, without annotating training images for these new
+categories. The performance of OVD depends on the quality of class-agnostic
+region proposals and pseudo-labels that can generalize well to novel object
+categories. To simultaneously generate high-quality proposals and
+pseudo-labels, we propose CastDet, a CLIP-activated student-teacher
+open-vocabulary object Detection framework. Our end-to-end framework, built on
+the student-teacher mechanism, employs the CLIP model as an extra omniscient
+teacher that injects rich knowledge into the student-teacher self-learning
+process. By doing so, our approach boosts novel object proposals and
+classification. Furthermore, we design a dynamic label queue technique to
+maintain high-quality pseudo labels during batch training and mitigate label
+imbalance. We conduct extensive experiments on multiple existing aerial object
+detection datasets, which are set up for the OVD task. Experimental results
+demonstrate that our CastDet achieves superior open-vocabulary detection
+performance, e.g., reaching 40.0 HM (Harmonic Mean), outperforming the
+previous methods Detic/ViLD by 26.9/21.1 on the VisDroneZSD dataset.
+
+
+
+
+
+ + ☆ Video Face Re-Aging: Toward Temporally Consistent Face Re-Aging + + +
+ Video face re-aging deals with altering the apparent age of a person to the +target age in videos. This problem is challenging due to the lack of paired +video datasets maintaining temporal consistency in identity and age. Most +re-aging methods process each image individually without considering the +temporal consistency of videos. While some existing works address the issue of +temporal coherence through video facial attribute manipulation in latent space, +they often fail to deliver satisfactory performance in age transformation. To +tackle the issues, we propose (1) a novel synthetic video dataset that features +subjects across a diverse range of age groups; (2) a baseline architecture +designed to validate the effectiveness of our proposed dataset, and (3) the +development of three novel metrics tailored explicitly for evaluating the +temporal consistency of video re-aging techniques. Our comprehensive +experiments on public datasets, such as VFHQ and CelebV-HQ, show that our +method outperforms the existing approaches in terms of both age transformation +and temporal consistency. + +
+
+ comment: 8 pages, 6 figures, 4 tables +
+
+
+
+
+ + ☆ Reti-Diff: Illumination Degradation Image Restoration with Retinex-based + Latent Diffusion Model + + +
+ Illumination degradation image restoration (IDIR) techniques aim to improve +the visibility of degraded images and mitigate the adverse effects of +deteriorated illumination. Among these algorithms, diffusion model (DM)-based +methods have shown promising performance but are often burdened by heavy +computational demands and pixel misalignment issues when predicting the +image-level distribution. To tackle these problems, we propose to leverage DM +within a compact latent space to generate concise guidance priors and introduce +a novel solution called Reti-Diff for the IDIR task. Reti-Diff comprises two +key components: the Retinex-based latent DM (RLDM) and the Retinex-guided +transformer (RGformer). To ensure detailed reconstruction and illumination +correction, RLDM is empowered to acquire Retinex knowledge and extract +reflectance and illumination priors. These priors are subsequently utilized by +RGformer to guide the decomposition of image features into their respective +reflectance and illumination components. Following this, RGformer further +enhances and consolidates the decomposed features, resulting in the production +of refined images with consistent content and robustness to handle complex +degradation scenarios. Extensive experiments show that Reti-Diff outperforms +existing methods on three IDIR tasks, as well as downstream applications. Code +will be available at \url{https://github.com/ChunmingHe/Reti-Diff}. + +
+
+ comment: 12 pages, 6 figures, 9 tables +
+
+
+
+
+ + ☆ Generating Realistic Counterfactuals for Retinal Fundus and OCT Images + using Diffusion Models + + +
+ Counterfactual reasoning is often used in a clinical setting to explain
+decisions or weigh alternatives. Therefore, for imaging-based modalities such
+as ophthalmology, it would be beneficial to be able to create counterfactual
+images, illustrating the answer to the question: "If the subject had had
+diabetic retinopathy, how would the fundus image have looked?" Here, we
+demonstrate that using a diffusion model in combination with an adversarially
+robust classifier trained on retinal disease classification tasks enables
+generation of highly realistic counterfactuals of retinal fundus images and
+optical coherence tomography (OCT) B-scans. Ideally, these classifiers encode
+the salient features indicative of each disease class and can steer the
+diffusion model to show realistic disease signs or remove disease-related
+lesions in a realistic way. Importantly, in a user study, domain experts found
+the counterfactuals generated using our method significantly more realistic
+than counterfactuals generated from a previous method, and even
+indistinguishable from real images.
+
+
+
+
+
+ + ☆ Semantic-Preserved Point-based Human Avatar + + +
+ To enable realistic experiences in AR/VR and digital entertainment, we
+present the first point-based human avatar model that embodies the entire
+expressive range of digital humans. We employ two MLPs to model pose-dependent
+deformation and linear blend skinning (LBS) weights. The representation of
+appearance relies on a decoder and the features attached to each point. In
+contrast to alternative implicit approaches, the oriented-point representation
+not only provides a more intuitive way to model human avatar animation but
+also significantly reduces both training and inference time. Moreover, we
+propose a novel method to transfer semantic information from the SMPL-X model
+to the points, which enables a better understanding of human body movements.
+By leveraging the semantic information of points, we can facilitate virtual
+try-on and human avatar composition through exchanging points of the same
+category across different subjects. Experimental results demonstrate the
+efficacy of our presented method.
+
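A minimal sketch of the linear blend skinning step such avatar models build on: each point is deformed by a weighted sum of per-joint rigid transforms, where the weights are what one of the MLPs would predict. This is the standard LBS formula, not the authors' full pipeline; the joint count and random weights are placeholders.

```python
import numpy as np

def linear_blend_skinning(points, bone_transforms, weights):
    """points: (N, 3); bone_transforms: (J, 4, 4) rigid transforms per joint;
    weights: (N, J) skinning weights that sum to 1 per point."""
    homo = np.concatenate([points, np.ones((len(points), 1))], axis=1)       # (N, 4)
    blended = np.einsum("nj,jab->nab", weights, bone_transforms)             # blend 4x4 transforms
    deformed = np.einsum("nab,nb->na", blended, homo)                        # apply per point
    return deformed[:, :3]

rng = np.random.default_rng(0)
points = rng.normal(size=(1000, 3))                    # surface points of the avatar
bones = np.stack([np.eye(4) for _ in range(24)])       # identity pose, SMPL-like 24 joints
weights = rng.random((1000, 24)); weights /= weights.sum(1, keepdims=True)
print(np.allclose(linear_blend_skinning(points, bones, weights), points))    # True at rest pose
```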
+
+
+
+
+ + ☆ CurriculumLoc: Enhancing Cross-Domain Geolocalization through + Multi-Stage Refinement + + +
+ Visual geolocalization is a cost-effective and scalable task that involves
+matching one or more query images, taken at some unknown location, to a set of
+geo-tagged reference images. Existing methods, devoted to semantic feature
+representation, have evolved towards robustness to a wide variety of
+differences between query and reference, including illumination and viewpoint
+changes, as well as scale and seasonal variations. However, practical visual
+geolocalization approaches need to be robust under appearance changes and
+extreme viewpoint variations, while providing accurate global location
+estimates. Therefore, inspired by curriculum design, where humans learn
+general knowledge first and then delve into professional expertise, we first
+recognize the semantic scene and then measure the geometric structure. Our
+approach, termed CurriculumLoc, involves a delicately designed multi-stage
+refinement pipeline and novel keypoint detection and description with global
+semantic awareness and local geometric verification. We rerank candidates and
+solve a particular cross-domain perspective-n-point (PnP) problem based on
+these keypoints and corresponding descriptors, so that position refinement
+occurs incrementally. Extensive experimental results on our collected dataset,
+TerraTrack, and a benchmark dataset, ALTO, demonstrate that our approach
+exhibits the aforementioned desirable characteristics of a practical visual
+geolocalization solution. Additionally, we achieve new high recall@1 scores of
+62.6% and 94.5% on ALTO, with two different distance metrics, respectively.
+Dataset, code and trained models are publicly available on
+https://github.com/npupilab/CurriculumLoc.
+
+
+ comment: 14 pages, 15 figures +
+
+
+
+
+ + ☆ A Multi-In-Single-Out Network for Video Frame Interpolation without + Optical Flow + + +
+ In general, deep learning-based video frame interpolation (VFI) methods have +predominantly focused on estimating motion vectors between two input frames and +warping them to the target time. While this approach has shown impressive +performance for linear motion between two input frames, it exhibits limitations +when dealing with occlusions and nonlinear movements. Recently, generative +models have been applied to VFI to address these issues. However, as VFI is not +a task focused on generating plausible images, but rather on predicting +accurate intermediate frames between two given frames, performance limitations +still persist. In this paper, we propose a multi-in-single-out (MISO) based VFI +method that does not rely on motion vector estimation, allowing it to +effectively model occlusions and nonlinear motion. Additionally, we introduce a +novel motion perceptual loss that enables MISO-VFI to better capture the +spatio-temporal correlations within the video frames. Our MISO-VFI method +achieves state-of-the-art results on VFI benchmarks Vimeo90K, Middlebury, and +UCF101, with a significant performance gap compared to existing approaches. + +
+
+
+
+
+ + ☆ Deep Equilibrium Diffusion Restoration with Parallel Sampling + + +
+ Diffusion-based image restoration (IR) methods aim to use diffusion models to +recover high-quality (HQ) images from degraded images and achieve promising +performance. Due to the inherent property of diffusion models, most of these +methods need long serial sampling chains to restore HQ images step-by-step. As +a result, it leads to expensive sampling time and high computation costs. +Moreover, such long sampling chains hinder understanding the relationship +between the restoration results and the inputs since it is hard to compute the +gradients in the whole chains. In this work, we aim to rethink the +diffusion-based IR models through a different perspective, i.e., a deep +equilibrium (DEQ) fixed point system. Specifically, we derive an analytical +solution by modeling the entire sampling chain in diffusion-based IR models as +a joint multivariate fixed point system. With the help of the analytical +solution, we are able to conduct single-image sampling in a parallel way and +restore HQ images without training. Furthermore, we compute fast gradients in +DEQ and found that initialization optimization can boost performance and +control the generation direction. Extensive experiments on benchmarks +demonstrate the effectiveness of our proposed method on typical IR tasks and +real-world settings. The code and models will be made publicly available. + +
+
+
+
+
+ + ☆ Predicting urban tree cover from incomplete point labels and limited + background information + + +
+ Trees inside cities are important for the urban microclimate, contributing +positively to the physical and mental health of the urban dwellers. Despite +their importance, often only limited information about city trees is available. +Therefore in this paper, we propose a method for mapping urban trees in +high-resolution aerial imagery using limited datasets and deep learning. Deep +learning has become best-practice for this task, however, existing approaches +rely on large and accurately labelled training datasets, which can be difficult +and expensive to obtain. However, often noisy and incomplete data may be +available that can be combined and utilized to solve more difficult tasks than +those datasets were intended for. This paper studies how to combine accurate +point labels of urban trees along streets with crowd-sourced annotations from +an open geographic database to delineate city trees in remote sensing images, a +task which is challenging even for humans. To that end, we perform semantic +segmentation of very high resolution aerial imagery using a fully convolutional +neural network. The main challenge is that our segmentation maps are sparsely +annotated and incomplete. Small areas around the point labels of the street +trees coming from official and crowd-sourced data are marked as foreground +class. Crowd-sourced annotations of streets, buildings, etc. define the +background class. Since the tree data is incomplete, we introduce a masking to +avoid class confusion. Our experiments in Hamburg, Germany, showed that the +system is able to produce tree cover maps, not limited to trees along streets, +without providing tree delineations. We evaluated the method on manually +labelled trees and show that performance drastically deteriorates if the open +geographic database is not used. + +
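A minimal sketch of the sparse-supervision setup described above: small disks around known street-tree points are marked foreground, crowd-sourced street/building polygons provide background, and everything else is masked out of the loss via an ignore index. Class IDs and the ignore value are assumptions, not the paper's settings.

```python
import torch
import torch.nn.functional as F

IGNORE = 255          # pixels with no label (neither tree point nor background annotation)
TREE, BACKGROUND = 1, 0

def sparse_segmentation_loss(logits, label_map):
    """logits: (B, 2, H, W); label_map: (B, H, W) with values in {0, 1, 255}.
    Unlabelled pixels are excluded so the incomplete tree data cannot
    penalise correct predictions outside the annotated disks."""
    return F.cross_entropy(logits, label_map, ignore_index=IGNORE)

logits = torch.randn(1, 2, 128, 128)
labels = torch.full((1, 128, 128), IGNORE, dtype=torch.long)
labels[0, 60:68, 60:68] = TREE              # small disk around a street-tree point label
labels[0, :20, :] = BACKGROUND              # crowd-sourced road polygon
print(sparse_segmentation_loss(logits, labels))
```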
+
+
+
+
+ + ☆ Advancing Urban Renewal: An Automated Approach to Generating Historical + Arcade Facades with Stable Diffusion Models + + +
+ Urban renewal and transformation processes necessitate the preservation of +the historical urban fabric, particularly in districts known for their +architectural and historical significance. These regions, with their diverse +architectural styles, have traditionally required extensive preliminary +research, often leading to subjective results. However, the advent of machine +learning models has opened up new avenues for generating building facade +images. Despite this, creating high-quality images for historical district +renovations remains challenging, due to the complexity and diversity inherent +in such districts. In response to these challenges, our study introduces a new +methodology for automatically generating images of historical arcade facades, +utilizing Stable Diffusion models conditioned on textual descriptions. By +classifying and tagging a variety of arcade styles, we have constructed several +realistic arcade facade image datasets. We trained multiple low-rank adaptation +(LoRA) models to control the stylistic aspects of the generated images, +supplemented by ControlNet models for improved precision and authenticity. Our +approach has demonstrated high levels of precision, authenticity, and diversity +in the generated images, showing promising potential for real-world urban +renewal projects. This new methodology offers a more efficient and accurate +alternative to conventional design processes in urban renewal, bypassing issues +of unconvincing image details, lack of precision, and limited stylistic +variety. Future research could focus on integrating this two-dimensional image +generation with three-dimensional modeling techniques, providing a more +comprehensive solution for renovating architectural facades in historical +districts. + +
+
+ comment: HABITS OF THE ANTHROPOCENE - Proceedings of the 43rd ACADIA + Conference - Volume II: Proceedings book one, University of Colorado Denver, + Denver, Colorado, USA, 26-28 October 2023, pp. 616-625, CUMINCAD, 2023 +
+
+
+
+
+ + ☆ AKConv: Convolutional Kernel with Arbitrary Sampled Shapes and Arbitrary + Number of Parameters + + +
+ Neural networks based on convolutional operations have achieved remarkable
+results in the field of deep learning, but there are two inherent flaws in
+standard convolutional operations. On the one hand, the convolution operation
+is confined to a local window, cannot capture information from other
+locations, and its sampled shape is fixed. On the other hand, the size of the
+convolutional kernel is fixed to k $\times$ k, which is a fixed square shape,
+and the number of parameters tends to grow quadratically with size. It is
+obvious that the shape and size of targets vary across datasets and locations.
+Convolutional kernels with fixed sampled shapes and square layouts do not
+adapt well to changing targets. In response to these issues, the Alterable
+Kernel Convolution (AKConv) is explored in this work, which gives the
+convolution kernel an arbitrary number of parameters and arbitrary sampled
+shapes to provide richer options for the trade-off between network overhead
+and performance. In AKConv, we define initial positions for convolutional
+kernels of arbitrary size by means of a new coordinate generation algorithm.
+To adapt to changes in targets, we introduce offsets to adjust the shape of
+the samples at each position. Moreover, we explore the effect on the neural
+network of using AKConv with the same size but different initial sampled
+shapes. AKConv completes the process of efficient feature extraction by
+irregular convolutional operations and brings more exploration options for
+convolutional sampling shapes. Object detection experiments on the
+representative datasets COCO2017, VOC 7+12 and VisDrone-DET2021 fully
+demonstrate the advantages of AKConv. AKConv can be used as a plug-and-play
+convolutional operation to replace standard convolutional operations and
+improve network performance. The code for the relevant tasks can be found at
+https://github.com/CV-ZhangXin/AKConv.
+
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ SeaDSC: A video-based unsupervised method for dynamic scene change + detection in unmanned surface vehicles WACV 2024 + + +
+ Recently, there has been an upsurge in research on maritime vision, with many
+works driven by the application of computer vision to Unmanned Surface
+Vehicles (USVs). Various sensor modalities such as camera, radar, and lidar
+have been used to perform tasks such as object detection, segmentation, object
+tracking, and motion planning. A large subset of this research is focused on
+video analysis, since most current vessel fleets carry cameras onboard for
+various surveillance tasks. Due to the vast abundance of video data, video
+scene change detection is an initial and crucial stage for scene understanding
+of USVs. This paper outlines our approach to detect dynamic scene changes in
+USVs. To the best of our knowledge, this work represents the first
+investigation of scene change detection in the maritime vision application.
+Our objective is to identify significant changes in the dynamic scenes of
+maritime video data, particularly those scenes that exhibit a high degree of
+resemblance. In our system for dynamic scene change detection, we propose a
+completely unsupervised learning method. In contrast to earlier studies, we
+utilize a modified state-of-the-art generative image model, VQ-VAE-2, trained
+on multiple marine datasets, aiming to enhance feature extraction. Next, we
+introduce our innovative similarity scoring technique for directly calculating
+the level of similarity in a sequence of consecutive frames by utilizing grid
+calculation on the retrieved features. The experiments were conducted using a
+nautical video dataset called RoboWhaler to showcase the efficient performance
+of our technique.
+
+
+ comment: WACV 2024 conference +
+
+
+
+
+ + ☆ A 3D Multi-Style Cross-Modality Segmentation Framework for Segmenting + Vestibular Schwannoma and Cochlea + + +
+ The crossMoDA2023 challenge aims to segment the vestibular schwannoma +(sub-divided into intra- and extra-meatal components) and cochlea regions of +unlabeled hrT2 scans by leveraging labeled ceT1 scans. In this work, we +proposed a 3D multi-style cross-modality segmentation framework for the +crossMoDA2023 challenge, including the multi-style translation and +self-training segmentation phases. Considering heterogeneous distributions and +various image sizes in multi-institutional scans, we first utilize the min-max +normalization, voxel size resampling, and center cropping to obtain fixed-size +sub-volumes from ceT1 and hrT2 scans for training. Then, we perform the +multi-style image translation phase to overcome the intensity distribution +discrepancy between unpaired multi-modal scans. Specifically, we design three +different translation networks with 2D or 2.5D inputs to generate multi-style +and realistic target-like volumes from labeled ceT1 volumes. Finally, we +perform the self-training volumetric segmentation phase in the target domain, +which employs the nnU-Net framework and iterative self-training method using +pseudo-labels for training accurate segmentation models in the unlabeled target +domain. On the crossMoDA2023 validation dataset, our method produces promising +results and achieves the mean DSC values of 72.78% and 80.64% and ASSD values +of 5.85 mm and 0.25 mm for VS tumor and cochlea regions, respectively. +Moreover, for intra- and extra-meatal regions, our method achieves the DSC +values of 59.77% and 77.14%, respectively. + +
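A minimal sketch of the preprocessing stage described (min-max normalization, voxel-size resampling, and center cropping to fixed-size sub-volumes). Target spacing and crop size are illustrative values, not the challenge submission's settings.

```python
import numpy as np
from scipy.ndimage import zoom

def preprocess(volume, spacing, target_spacing=(1.0, 1.0, 1.0), crop=(128, 128, 64)):
    """Min-max normalise, resample to a common voxel size, and center-crop."""
    v = (volume - volume.min()) / (volume.max() - volume.min() + 1e-8)      # min-max to [0, 1]
    factors = [s / t for s, t in zip(spacing, target_spacing)]
    v = zoom(v, factors, order=1)                                           # trilinear resampling
    pads = [max(c - d, 0) for d, c in zip(v.shape, crop)]
    v = np.pad(v, [(p // 2, p - p // 2) for p in pads])                     # pad if too small
    starts = [(d - c) // 2 for d, c in zip(v.shape, crop)]
    return v[tuple(slice(s, s + c) for s, c in zip(starts, crop))]

ceT1 = np.random.rand(200, 200, 80).astype(np.float32)      # stand-in scan
print(preprocess(ceT1, spacing=(0.8, 0.8, 1.5)).shape)       # (128, 128, 64)
```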
+
+ comment: Technical report of cmda2023 challenge +
+
+
+
+
+ + ☆ Decoupled DETR For Few-shot Object Detection + + +
+ Few-shot object detection (FSOD), an efficient method for addressing the +severe data-hungry problem, has been extensively discussed. Current works have +significantly advanced the problem in terms of model and data. However, the +overall performance of most FSOD methods still does not fulfill the desired +accuracy. In this paper we improve the FSOD model to address the severe issue +of sample imbalance and weak feature propagation. To alleviate modeling bias +from data-sufficient base classes, we examine the effect of decoupling the +parameters for classes with sufficient data and classes with few samples in +various ways. We design a base-novel categories decoupled DETR (DeDETR) for +FSOD. We also explore various types of skip connection between the encoder and +decoder for DETR. Besides, we notice that the best outputs could come from the +intermediate layer of the decoder instead of the last layer; therefore, we +build a unified decoder module that could dynamically fuse the decoder layers +as the output feature. We evaluate our model on commonly used datasets such as +PASCAL VOC and MSCOCO. Our results indicate that our proposed module could +achieve stable improvements of 5% to 10% in both fine-tuning and meta-learning +paradigms and has outperformed the highest score in recent works. + +
+
+
+
+
+ + ☆ CORE-MM: Complex Open-Ended Reasoning Evaluation For Multi-Modal Large + Language Models + + +
+ Multi-modal Large Language Models (MLLMs) are increasingly prominent in the +field of artificial intelligence. These models not only excel in traditional +vision-language tasks but also demonstrate impressive performance in +contemporary multi-modal benchmarks. Although many of these benchmarks attempt +to holistically evaluate MLLMs, they typically concentrate on basic reasoning +tasks, often yielding only simple yes/no or multi-choice responses. These +methods naturally lead to confusion and difficulties in conclusively +determining the reasoning capabilities of MLLMs. To mitigate this issue, we +manually curate a benchmark dataset specifically designed for MLLMs, with a +focus on complex reasoning tasks. Our benchmark comprises three key reasoning +categories: deductive, abductive, and analogical reasoning. The queries in our +dataset are intentionally constructed to engage the reasoning capabilities of +MLLMs in the process of generating answers. For a fair comparison across +various MLLMs, we incorporate intermediate reasoning steps into our evaluation +criteria. In instances where an MLLM is unable to produce a definitive answer, +its reasoning ability is evaluated by requesting intermediate reasoning steps. +If these steps align with our manual annotations, appropriate scores are +assigned. This evaluation scheme resembles methods commonly used in human +assessments, such as exams or assignments, and represents what we consider a +more effective assessment technique compared with existing benchmarks. We +evaluate a selection of representative MLLMs using this rigorously developed +open-ended multi-step elaborate reasoning benchmark, designed to challenge and +accurately measure their reasoning capabilities. The code and data will be +released at https://core-mm.github.io/ + +
+
+
+
+
+ + ☆ Does complimentary information from multispectral imaging improve face + presentation attack detection? SC + + +
+ Presentation Attack Detection (PAD) has been extensively studied,
+particularly in the visible spectrum. With the advancement of sensing
+technology beyond the visible range, multispectral imaging has gained
+significant attention in this direction. We present PAD based on multispectral
+images constructed for eight different presentation artifacts resulting from
+three artifact species. In this work, we introduce the Face Presentation
+Attack Multispectral (FPAMS) database to demonstrate the significance of
+employing multispectral imaging. The goal of this work is to study the
+complementary information from multispectral imaging, which can be combined in
+two different ways (image fusion and score fusion) to improve face PAD. The
+experimental evaluation presents an extensive qualitative analysis of 61,650
+multispectral sample images collected for bona fide presentations and
+artifacts. PAD based on the score-fusion and image-fusion methods achieves
+superior performance, demonstrating the significance of employing
+multispectral imaging to detect presentation artifacts.
+
+&#x0D;
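A minimal sketch of the two fusion strategies compared above; the equal default weights and simple mean image fusion are illustrative assumptions rather than the paper's tuned configuration:

    import numpy as np

    def score_fusion(band_scores, weights=None):
        # band_scores: [num_bands, num_samples] PAD scores, one row per spectral band.
        band_scores = np.asarray(band_scores, dtype=float)
        if weights is None:
            weights = np.ones(band_scores.shape[0]) / band_scores.shape[0]
        return np.average(band_scores, axis=0, weights=weights)

    def image_fusion(band_images):
        # band_images: [num_bands, H, W] co-registered multispectral images; mean fusion.
        return np.mean(np.asarray(band_images, dtype=float), axis=0)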
+
+ comment: Accepted in International IEEE Applied Sensing Conference (IEEE + APSCON) 2024 +
+
+
+
+
+ + ☆ NePF: Neural Photon Field for Single-Stage Inverse Rendering + + +
+ We present a novel single-stage framework, Neural Photon Field (NePF), to
+address ill-posed inverse rendering from multi-view images. Contrary to
+previous methods that recover the geometry, material, and illumination in
+multiple stages and extract the properties from various multi-layer
+perceptrons across different neural fields, we question such complexities and
+introduce our method - a single-stage framework that uniformly recovers all
+properties. NePF achieves this unification by fully utilizing the physical
+implication behind the weight function of neural implicit surfaces and the
+view-dependent radiance. Moreover, we introduce an innovative coordinate-based
+illumination model for rapid physically-based volume rendering. To regularize
+this illumination, we implement a subsurface scattering model for diffuse
+estimation. We evaluate our method on both real and synthetic datasets. The
+results demonstrate the superiority of our approach in recovering
+high-fidelity geometry and visually plausible material attributes.
+
+&#x0D;
+
+
+
+
+ + ☆ Unearthing Common Inconsistency for Generalisable Deepfake Detection + + +
+ Deepfakes have existed for several years, yet efficient detection techniques
+that generalize across different manipulation methods still require further
+research. While current image-level detection methods fail to generalize to
+unseen domains, owing to the domain-shift phenomenon brought by CNNs' strong
+inductive bias towards Deepfake texture, video-level methods show the
+potential for both generalization across multiple domains and robustness to
+compression. We argue that although distinct face manipulation tools have
+different inherent biases, they all disrupt the consistency between frames, a
+natural characteristic shared by authentic videos. Inspired by this, we
+propose a detection approach that captures the frame inconsistency broadly
+present across different forgery techniques, termed
+unearthing-common-inconsistency (UCI). Concretely, the UCI network, based on
+self-supervised contrastive learning, can better distinguish the temporal
+consistency of real and fake videos from multiple domains. We introduce a
+temporally-preserved module to inject spatial noise perturbations, directing
+the model's attention towards temporal information. Subsequently, leveraging a
+multi-view cross-correlation learning module, we extensively learn the
+disparities in temporal representations between genuine and fake samples.
+Extensive experiments demonstrate the generalization ability of our method on
+unseen Deepfake domains.
+
+&#x0D;
+
+ comment: 9 pages, 2 figures and 5 tables +
+
+
+
+
+ + ☆ Event Camera Data Dense Pre-training + + +
+ This paper introduces a self-supervised learning framework designed for +pre-training neural networks tailored to dense prediction tasks using event +camera data. Our approach utilizes solely event data for training. + Transferring achievements from dense RGB pre-training directly to event +camera data yields subpar performance. This is attributed to the spatial +sparsity inherent in an event image (converted from event data), where many +pixels do not contain information. To mitigate this sparsity issue, we encode +an event image into event patch features, automatically mine contextual +similarity relationships among patches, group the patch features into +distinctive contexts, and enforce context-to-context similarities to learn +discriminative event features. + For training our framework, we curate a synthetic event camera dataset +featuring diverse scene and motion patterns. Transfer learning performance on +downstream dense prediction tasks illustrates the superiority of our method +over state-of-the-art approaches. Notably, our single model secured the top +position in the challenging DSEC-Flow benchmark. + +
+
+
+
+
+ + ☆ Generalized Category Discovery in Semantic Segmentation + + +
+ This paper explores a novel setting called Generalized Category Discovery in +Semantic Segmentation (GCDSS), aiming to segment unlabeled images given prior +knowledge from a labeled set of base classes. The unlabeled images contain +pixels of the base class or novel class. In contrast to Novel Category +Discovery in Semantic Segmentation (NCDSS), there is no prerequisite for prior +knowledge mandating the existence of at least one novel class in each unlabeled +image. Besides, we broaden the segmentation scope beyond foreground objects to +include the entire image. Existing NCDSS methods rely on the aforementioned +priors, making them challenging to truly apply in real-world situations. We +propose a straightforward yet effective framework that reinterprets the GCDSS +challenge as a task of mask classification. Additionally, we construct a +baseline method and introduce the Neighborhood Relations-Guided Mask Clustering +Algorithm (NeRG-MaskCA) for mask categorization to address the fragmentation in +semantic representation. A benchmark dataset, Cityscapes-GCD, derived from the +Cityscapes dataset, is established to evaluate the GCDSS framework. Our method +demonstrates the feasibility of the GCDSS problem and the potential for +discovering and segmenting novel object classes in unlabeled images. We employ +the generated pseudo-labels from our approach as ground truth to supervise the +training of other models, thereby enabling them with the ability to segment +novel classes. It paves the way for further research in generalized category +discovery, broadening the horizons of semantic segmentation and its +applications. For details, please visit https://github.com/JethroPeng/GCDSS + +
+
+
+
+
+ + ☆ Liver Tumor Prediction with Advanced Attention Mechanisms Integrated + into a Depth-Based Variant Search Algorithm + + +
+ In recent years, Deep Learning (DL) techniques have driven an emerging
+transformation in machine learning, artificial intelligence, computer vision,
+and related fields. Consequently, they have been widely adopted in the medical
+field for predicting and monitoring diverse diseases at specific intervals.
+Liver tumor prediction is a vital task in analyzing and treating liver
+diseases. This paper proposes a novel approach for predicting liver tumors
+using Convolutional Neural Networks (CNN) and a depth-based variant search
+algorithm with advanced attention mechanisms (CNN-DS-AM). The proposed work
+aims to improve accuracy and robustness in diagnosing and treating liver
+diseases. The model is assessed on a Computed Tomography (CT) scan dataset
+containing both benign and malignant liver tumors. Additionally, advanced
+attention mechanisms are incorporated into the CNN model to identify and
+highlight the regions of the CT scans most relevant to predicting liver
+tumors. The results suggest that incorporating attention mechanisms and a
+depth-based variant search algorithm into the CNN model is a promising
+approach for improving the accuracy and robustness of liver tumor prediction,
+and it can assist radiologists in diagnosis and treatment planning. The
+proposed system achieves a high accuracy of 95.5% in predicting liver tumors,
+outperforming other state-of-the-art methods.
+
+&#x0D;
+
+
+
+
+ + ☆ Seeing through the Mask: Multi-task Generative Mask Decoupling Face + Recognition + + +
+ The outbreak of the COVID-19 pandemic has made people wear masks more
+frequently than ever. Current general face recognition systems suffer from
+serious performance degradation when encountering occluded scenes. The
+potential reason is that face features are corrupted by occlusions on key
+facial regions. To tackle this problem, previous works either extract
+identity-related embeddings at the feature level via additional mask
+prediction, or restore the occluded facial parts with generative models.
+However, the former lacks visual results for model interpretation, while the
+latter suffers from artifacts that may affect downstream recognition.
+Therefore, this paper proposes a Multi-task gEnerative mask dEcoupling face
+Recognition (MEER) network to jointly handle these two tasks, which can learn
+occlusion-irrelevant and identity-related representations while achieving
+unmasked face synthesis. We first present a novel mask decoupling module to
+disentangle mask and identity information, which enables the network to obtain
+purer identity features from visible facial components. Then, an unmasked face
+is restored by a joint-training strategy, which is further used to refine the
+recognition network with an id-preserving loss. Experiments on masked face
+recognition under realistic and synthetic occlusion benchmarks demonstrate
+that MEER outperforms state-of-the-art methods.
+
+&#x0D;
+
+
+
+
+ + ☆ What's left can't be right -- The remaining positional incompetence of + contrastive vision-language models + + +
+ Contrastive vision-language models like CLIP have been found to lack spatial +understanding capabilities. In this paper we discuss the possible causes of +this phenomenon by analysing both datasets and embedding space. By focusing on +simple left-right positional relations, we show that this behaviour is entirely +predictable, even with large-scale datasets, demonstrate that these relations +can be taught using synthetic data and show that this approach can generalise +well to natural images - improving the performance on left-right relations on +Visual Genome Relations. + +
+
+
+
+
+ + ☆ HandSight: DeCAF & Improved Fisher Vectors to Classify Clothing Color + and Texture with a Finger-Mounted Camera + + +
+ We demonstrate the use of DeCAF and Improved Fisher Vector image features to
+classify clothing texture. Choosing clothes is a daily challenge for blind
+people. This work attempts to address the issue with a finger-mounted camera
+and state-of-the-art classification algorithms. To evaluate our solution, we
+collected 520 close-up images across 29 pieces of clothing. We contribute (1)
+the HCTD, an image dataset taken with a NanEyeGS camera, a camera small enough
+to be mounted on the finger, and (2) evaluations of state-of-the-art
+recognition algorithms applied to our dataset, achieving an accuracy above
+95%. We discuss previous work, evaluate the current approach, and suggest the
+project's future directions.
+
+&#x0D;
+
+ comment: 10 pages, 15 figures +
+
+
+
+
+ + ☆ Nepotistically Trained Generative-AI Models Collapse + + +
+ Trained on massive amounts of human-generated content, AI (artificial +intelligence) image synthesis is capable of reproducing semantically coherent +images that match the visual appearance of its training data. We show that when +retrained on even small amounts of their own creation, these generative-AI +models produce highly distorted images. We also show that this distortion +extends beyond the text prompts used in retraining, and that once poisoned, the +models struggle to fully heal even after retraining on only real images. + +
+
+
+
+
+ + ☆ PhysGaussian: Physics-Integrated 3D Gaussians for Generative Dynamics + + +
+ We introduce PhysGaussian, a new method that seamlessly integrates physically +grounded Newtonian dynamics within 3D Gaussians to achieve high-quality novel +motion synthesis. Employing a custom Material Point Method (MPM), our approach +enriches 3D Gaussian kernels with physically meaningful kinematic deformation +and mechanical stress attributes, all evolved in line with continuum mechanics +principles. A defining characteristic of our method is the seamless integration +between physical simulation and visual rendering: both components utilize the +same 3D Gaussian kernels as their discrete representations. This negates the +necessity for triangle/tetrahedron meshing, marching cubes, "cage meshes," or +any other geometry embedding, highlighting the principle of "what you see is +what you simulate (WS$^2$)." Our method demonstrates exceptional versatility +across a wide variety of materials--including elastic entities, metals, +non-Newtonian fluids, and granular materials--showcasing its strong +capabilities in creating diverse visual content with novel viewpoints and +movements. Our project page is at: https://xpandora.github.io/PhysGaussian/ + +
+
+
+
+
+ + ☆ DiffAvatar: Simulation-Ready Garment Optimization with Differentiable + Simulation + + +
+ The realism of digital avatars is crucial in enabling telepresence +applications with self-expression and customization. A key aspect of this +realism originates from the physical accuracy of both a true-to-life body shape +and clothing. While physical simulations can produce high-quality, realistic +motions for clothed humans, they require precise estimation of body shape and +high-quality garment assets with associated physical parameters for cloth +simulations. However, manually creating these assets and calibrating their +parameters is labor-intensive and requires specialized expertise. To address +this gap, we propose DiffAvatar, a novel approach that performs body and +garment co-optimization using differentiable simulation. By integrating +physical simulation into the optimization loop and accounting for the complex +nonlinear behavior of cloth and its intricate interaction with the body, our +framework recovers body and garment geometry and extracts important material +parameters in a physically plausible way. Our experiments demonstrate that our +approach generates realistic clothing and body shape that can be easily used in +downstream applications. + +
+
+
+
+
+ + ☆ Disentangling Structure and Appearance in ViT Feature Space + + +
+ We present a method for semantically transferring the visual appearance of +one natural image to another. Specifically, our goal is to generate an image in +which objects in a source structure image are "painted" with the visual +appearance of their semantically related objects in a target appearance image. +To integrate semantic information into our framework, our key idea is to +leverage a pre-trained and fixed Vision Transformer (ViT) model. Specifically, +we derive novel disentangled representations of structure and appearance +extracted from deep ViT features. We then establish an objective function that +splices the desired structure and appearance representations, interweaving them +together in the space of ViT features. Based on our objective function, we +propose two frameworks of semantic appearance transfer -- "Splice", which works +by training a generator on a single and arbitrary pair of structure-appearance +images, and "SpliceNet", a feed-forward real-time appearance transfer model +trained on a dataset of images from a specific domain. Our frameworks do not +involve adversarial training, nor do they require any additional input +information such as semantic segmentation or correspondences. We demonstrate +high-resolution results on a variety of in-the-wild image pairs, under +significant variations in the number of objects, pose, and appearance. Code and +supplementary material are available in our project page: splice-vit.github.io. + +
+
+ comment: Accepted to ACM Transactions on Graphics. arXiv admin note: + substantial text overlap with arXiv:2201.00424 +
+
+
+
+
+ + ☆ LABELMAKER: Automatic Semantic Label Generation from RGB-D Trajectories + + +
+ Semantic annotations are indispensable to train or evaluate perception +models, yet very costly to acquire. This work introduces a fully automated +2D/3D labeling framework that, without any human intervention, can generate +labels for RGB-D scans at equal (or better) level of accuracy than comparable +manually annotated datasets such as ScanNet. Our approach is based on an +ensemble of state-of-the-art segmentation models and 3D lifting through neural +rendering. We demonstrate the effectiveness of our LabelMaker pipeline by +generating significantly better labels for the ScanNet datasets and +automatically labelling the previously unlabeled ARKitScenes dataset. Code and +models are available at https://labelmaker.org + +
+
+
+
+
+ + ☆ ChemScraper: Graphics Extraction, Molecular Diagram Parsing, and + Annotated Data Generation for PDF Images + + +
+ Existing visual parsers for molecule diagrams translate pixel-based raster
+images such as PNGs to chemical structure representations (e.g., SMILES).
+However, PDFs created by word processors including \LaTeX{} and Word provide
+explicit locations and shapes for characters, lines, and polygons. We
+introduce a method to extract symbols from born-digital PDF molecule images
+and then apply simple graph transformations to capture both visual and
+chemical structure in editable ChemDraw files (CDXML). Our fast (PDF
+$\rightarrow$ visual graph $\rightarrow$ chemical graph) pipeline does not
+require GPUs, Optical Character Recognition (OCR), or vectorization. We
+evaluate on standard benchmarks using SMILES strings, along with a novel
+evaluation that provides graph-based metrics and error compilation using
+LgEval. The geometric information in born-digital PDFs produces a highly
+accurate parser, motivating the generation of training data for visual parsers
+that recognize molecules from raster images, with extracted graphics, visual
+structure, and chemical structure as annotations. To do this, we render SMILES
+strings in Indigo, parse the molecule structure, and then validate the
+recognized structure to select correct files.
+
+&#x0D;
+
+ comment: 20 pages without references, 10 figures, 3 Tables, submitted to + International Journal on Document Analysis and Recognition (IJDAR) +
+
+
+
+
+ + ☆ Conditional Modeling Based Automatic Video Summarization + + +
+ The aim of video summarization is to shorten videos automatically while +retaining the key information necessary to convey the overall story. Video +summarization methods mainly rely on visual factors, such as visual +consecutiveness and diversity, which may not be sufficient to fully understand +the content of the video. There are other non-visual factors, such as +interestingness, representativeness, and storyline consistency that should also +be considered for generating high-quality video summaries. Current methods do +not adequately take into account these non-visual factors, resulting in +suboptimal performance. In this work, a new approach to video summarization is +proposed based on insights gained from how humans create ground truth video +summaries. The method utilizes a conditional modeling perspective and +introduces multiple meaningful random variables and joint distributions to +characterize the key components of video summarization. Helper distributions +are employed to improve the training of the model. A conditional attention +module is designed to mitigate potential performance degradation in the +presence of multi-modal input. The proposed video summarization method +incorporates the above innovative design choices that aim to narrow the gap +between human-generated and machine-generated video summaries. Extensive +experiments show that the proposed approach outperforms existing methods and +achieves state-of-the-art performance on commonly used video summarization +datasets. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + arXiv admin note: substantial text overlap with arXiv:2305.00455 +
+
+
+
+
+ + ☆ Model-aware 3D Eye Gaze from Weak and Few-shot Supervisions + + +
+ The task of predicting 3D eye gaze from eye images can be performed either by +(a) end-to-end learning for image-to-gaze mapping or by (b) fitting a 3D eye +model onto images. The former case requires 3D gaze labels, while the latter +requires eye semantics or landmarks to facilitate the model fitting. Although +obtaining eye semantics and landmarks is relatively easy, fitting an accurate +3D eye model on them remains to be very challenging due to its ill-posed nature +in general. On the other hand, obtaining large-scale 3D gaze data is cumbersome +due to the required hardware setups and computational demands. In this work, we +propose to predict 3D eye gaze from weak supervision of eye semantic +segmentation masks and direct supervision of a few 3D gaze vectors. The +proposed method combines the best of both worlds by leveraging large amounts of +weak annotations--which are easy to obtain, and only a few 3D gaze +vectors--which alleviate the difficulty of fitting 3D eye models on the +semantic segmentation of eye images. Thus, the eye gaze vectors, used in the +model fitting, are directly supervised using the few-shot gaze labels. +Additionally, we propose a transformer-based network architecture, that serves +as a solid baseline for our improvements. Our experiments in diverse settings +illustrate the significant benefits of the proposed method, achieving about 5 +degrees lower angular gaze error over the baseline, when only 0.05% 3D +annotations of the training images are used. The source code is available at +https://github.com/dimitris-christodoulou57/Model-aware_3D_Eye_Gaze. + +
+
+ comment: Accepted to ISMAR2023 as a poster paper +
+
+
+
+
+ + ☆ Uncertainty Estimation in Contrast-Enhanced MR Image Translation with + Multi-Axis Fusion + + +
+ In recent years, deep learning has been applied to a wide range of medical
+imaging and image processing tasks. In this work, we focus on the estimation
+of epistemic uncertainty for 3D medical image-to-image translation. We propose
+a novel model uncertainty quantification method, Multi-Axis Fusion (MAF),
+which relies on the integration of complementary information derived from
+multiple views on volumetric image data. The proposed approach is applied to
+the task of synthesizing contrast-enhanced T1-weighted images based on native
+T1, T2 and T2-FLAIR scans. The quantitative findings indicate a strong
+correlation ($\rho_{\text{healthy}} = 0.89$) between the mean absolute image
+synthesis error and the mean uncertainty score for our MAF method. Hence, we
+consider MAF a promising approach to the highly relevant task of detecting
+synthesis failures at inference time.
+
+&#x0D;
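One way to read the multi-axis idea is to run a slice-wise synthesis model along the three anatomical axes and treat voxel-wise disagreement as the uncertainty score. The sketch below assumes a hypothetical predict_slices model wrapper and a simple mean/variance fusion rule, which may differ from the paper's exact formulation:

    import numpy as np

    def multi_axis_fusion(predict_slices, volume):
        """Fuse slice-wise predictions from three axes; their variance flags uncertain voxels.

        predict_slices: hypothetical callable mapping a stack of 2D slices to predicted slices.
        volume:         [D, H, W] input volume.
        """
        preds = []
        for axis in range(3):
            moved = np.moveaxis(volume, axis, 0)        # slice along this axis
            pred = predict_slices(moved)                # model runs slice-wise
            preds.append(np.moveaxis(pred, 0, axis))    # back to the original layout
        preds = np.stack(preds, axis=0)                 # [3, D, H, W]
        fused = preds.mean(axis=0)                      # fused synthesis
        uncertainty = preds.var(axis=0)                 # disagreement across axes
        return fused, uncertainty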
+
+
+
+
+ + ☆ Teaching Robots to Build Simulations of Themselves + + +
+ Simulation enables robots to plan and estimate the outcomes of prospective
+actions without the need to physically execute them. We introduce a
+self-supervised learning framework to enable robots to model and predict their
+morphology, kinematics and motor control using only brief raw video data,
+eliminating the need for extensive real-world data collection and kinematic
+priors. By observing their own movements, akin to humans watching their
+reflection in a mirror, robots learn the ability to simulate themselves and
+predict their spatial motion for various tasks. Our results demonstrate that
+this self-learned simulation not only enables accurate motion planning but
+also allows the robot to detect abnormalities and recover from damage.
+
+&#x0D;
+
+
+
+
+ + ☆ Applications of Large Scale Foundation Models for Autonomous Driving + + +
+ Since the DARPA Grand Challenges (rural) in 2004/05 and the Urban Challenge
+in 2007, autonomous driving has been the most active field of AI applications.
+Recently, powered by large language models (LLMs), chat systems such as
+ChatGPT and PaLM have emerged and rapidly become a promising direction toward
+artificial general intelligence (AGI) in natural language processing (NLP). It
+is therefore natural to ask whether these abilities can be employed to
+reformulate autonomous driving. By combining LLMs with foundation models, it
+is possible to utilize human knowledge, commonsense and reasoning to rebuild
+autonomous driving systems and move beyond the current long-tailed AI dilemma.
+In this paper, we investigate the techniques of foundation models and LLMs
+applied to autonomous driving, categorized into simulation, world models, data
+annotation, and planning or end-to-end (E2E) solutions.
+
+&#x0D;
+
+ comment: 42 pages +
+
+
+
+
+ + ☆ Fingerspelling PoseNet: Enhancing Fingerspelling Translation with + Pose-Based Transformer Models WACV 2024 + + +
+ We address the task of American Sign Language fingerspelling translation +using videos in the wild. We exploit advances in more accurate hand pose +estimation and propose a novel architecture that leverages the transformer +based encoder-decoder model enabling seamless contextual word translation. The +translation model is augmented by a novel loss term that accurately predicts +the length of the finger-spelled word, benefiting both training and inference. +We also propose a novel two-stage inference approach that re-ranks the +hypotheses using the language model capabilities of the decoder. Through +extensive experiments, we demonstrate that our proposed method outperforms the +state-of-the-art models on ChicagoFSWild and ChicagoFSWild+ achieving more than +10% relative improvement in performance. Our findings highlight the +effectiveness of our approach and its potential to advance fingerspelling +recognition in sign language translation. Code is also available at +https://github.com/pooyafayyaz/Fingerspelling-PoseNet. + +
+
+ comment: WACV 2024 +
+
+
+
+
+ + ☆ Mixing-Denoising Generalizable Occupancy Networks 3DV 2024 + + +
+ While current state-of-the-art generalizable implicit neural shape models
+rely on the inductive bias of convolutions, it is still not entirely clear how
+properties emerging from such biases are compatible with the task of 3D
+reconstruction from point clouds. We explore an alternative approach to
+generalizability in this context. We relax the intrinsic model bias (i.e.,
+using MLPs to encode local features as opposed to convolutions) and constrain
+the hypothesis space instead with an auxiliary regularization related to the
+reconstruction task, i.e., denoising. The resulting model is the first
+MLP-only, locally conditioned implicit shape reconstruction network for point
+clouds with fast feed-forward inference. Point-cloud-borne features and
+denoising offsets are predicted from an exclusively MLP-based network in a
+single forward pass. A decoder predicts occupancy probabilities for queries
+anywhere in space by pooling nearby features from the point cloud
+order-invariantly, guided by denoised relative positional encoding. We
+outperform the state-of-the-art convolutional method while using half the
+number of model parameters.
+
+&#x0D;
+
+ comment: 3DV 2024 +
+
+
+
+
+ + ☆ Concept Sliders: LoRA Adaptors for Precise Control in Diffusion Models + + +
+ We present a method to create interpretable concept sliders that enable +precise control over attributes in image generations from diffusion models. Our +approach identifies a low-rank parameter direction corresponding to one concept +while minimizing interference with other attributes. A slider is created using +a small set of prompts or sample images; thus slider directions can be created +for either textual or visual concepts. Concept Sliders are plug-and-play: they +can be composed efficiently and continuously modulated, enabling precise +control over image generation. In quantitative experiments comparing to +previous editing techniques, our sliders exhibit stronger targeted edits with +lower interference. We showcase sliders for weather, age, styles, and +expressions, as well as slider compositions. We show how sliders can transfer +latents from StyleGAN for intuitive editing of visual concepts for which +textual description is difficult. We also find that our method can help address +persistent quality issues in Stable Diffusion XL including repair of object +deformations and fixing distorted hands. Our code, data, and trained sliders +are available at https://sliders.baulab.info/ + +
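Conceptually, a slider is a low-rank direction added to a weight matrix and scaled by a user-controlled value. The minimal PyTorch sketch below illustrates this plug-and-play modulation for a single linear layer; it is an illustration of the mechanism, not the authors' training procedure:

    import torch
    import torch.nn as nn

    class ConceptSliderLinear(nn.Module):
        """Linear layer with a low-rank concept direction modulated by a scalar slider."""
        def __init__(self, base: nn.Linear, rank=4):
            super().__init__()
            self.base = base
            self.down = nn.Linear(base.in_features, rank, bias=False)
            self.up = nn.Linear(rank, base.out_features, bias=False)
            nn.init.zeros_(self.up.weight)   # start as an identity edit

        def forward(self, x, scale=0.0):
            # scale is the slider value: 0 leaves the model unchanged,
            # positive/negative values push the attribute in either direction.
            return self.base(x) + scale * self.up(self.down(x))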
+
+
+
+
+ + ☆ DAS: A Deformable Attention to Capture Salient Information in CNNs + + +
+ Convolutional Neural Networks (CNNs) excel in local spatial pattern
+recognition. For many vision tasks, such as object recognition and
+segmentation, salient information is also present outside a CNN's kernel
+boundaries. However, CNNs struggle to capture such relevant information due to
+their confined receptive fields. Self-attention can improve a model's access
+to global information but increases computational overhead. We present a fast
+and simple fully convolutional method called DAS that helps focus attention on
+relevant information. It uses deformable convolutions to locate pertinent
+image regions and separable convolutions for efficiency. DAS plugs into
+existing CNNs and propagates relevant information using a gating mechanism.
+Compared to the O(n^2) computational complexity of transformer-style
+attention, DAS is O(n). Our claim is that DAS's ability to pay increased
+attention to relevant features results in performance improvements when added
+to popular CNNs for image classification and object detection. For example,
+DAS yields improvements on Stanford Dogs (4.47%), ImageNet (1.91%), and COCO
+AP (3.3%) with a base ResNet50 backbone. This outperforms other CNN attention
+mechanisms while using similar or fewer FLOPs. Our code will be publicly
+available.
+
+&#x0D;
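A hedged sketch of a DAS-style gate, combining a deformable convolution (to reach salient regions outside the fixed kernel footprint) with a depthwise-separable convolution and sigmoid gating of the input features; the exact layer arrangement here is an assumption, not the paper's released module:

    import torch
    import torch.nn as nn
    from torchvision.ops import DeformConv2d

    class DASGate(nn.Module):
        """Illustrative DAS-style attention gate built from deformable + separable convs."""
        def __init__(self, channels, kernel_size=3):
            super().__init__()
            pad = kernel_size // 2
            self.offset = nn.Conv2d(channels, 2 * kernel_size * kernel_size, kernel_size, padding=pad)
            self.deform = DeformConv2d(channels, channels, kernel_size, padding=pad)
            self.depthwise = nn.Conv2d(channels, channels, kernel_size, padding=pad, groups=channels)
            self.pointwise = nn.Conv2d(channels, channels, 1)

        def forward(self, x):
            offsets = self.offset(x)                 # where to sample for each kernel tap
            feats = self.deform(x, offsets)          # gather information from salient locations
            attn = torch.sigmoid(self.pointwise(self.depthwise(feats)))
            return x * attn                          # gate the original features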
+
+
+
+
+ + ♻ ☆ A Dual-Stream Neural Network Explains the Functional Segregation of + Dorsal and Ventral Visual Pathways in Human Brains + + +
+ The human visual system uses two parallel pathways for spatial processing and +object recognition. In contrast, computer vision systems tend to use a single +feedforward pathway, rendering them less robust, adaptive, or efficient than +human vision. To bridge this gap, we developed a dual-stream vision model +inspired by the human eyes and brain. At the input level, the model samples two +complementary visual patterns to mimic how the human eyes use magnocellular and +parvocellular retinal ganglion cells to separate retinal inputs to the brain. +At the backend, the model processes the separate input patterns through two +branches of convolutional neural networks (CNN) to mimic how the human brain +uses the dorsal and ventral cortical pathways for parallel visual processing. +The first branch (WhereCNN) samples a global view to learn spatial attention +and control eye movements. The second branch (WhatCNN) samples a local view to +represent the object around the fixation. Over time, the two branches interact +recurrently to build a scene representation from moving fixations. We compared +this model with the human brains processing the same movie and evaluated their +functional alignment by linear transformation. The WhereCNN and WhatCNN +branches were found to differentially match the dorsal and ventral pathways of +the visual cortex, respectively, primarily due to their different learning +objectives. These model-based results lead us to speculate that the distinct +responses and representations of the ventral and dorsal streams are more +influenced by their distinct goals in visual attention and object recognition +than by their specific bias or selectivity in retinal inputs. This dual-stream +model takes a further step in brain-inspired computer vision, enabling parallel +neural networks to actively explore and understand the visual surroundings. + +
+
+
+
+
+ + ♻ ☆ SWAT: Spatial Structure Within and Among Tokens IJCAI23 + + +
+ Modeling visual data as tokens (i.e., image patches) using attention
+mechanisms, feed-forward networks or convolutions has been highly effective in
+recent years. Such methods usually have a common pipeline: a tokenization
+method, followed by a set of layers/blocks for information mixing, both within
+and among tokens. When image patches are converted into tokens, they are often
+flattened, discarding the spatial structure within each patch. As a result,
+any processing that follows (e.g., multi-head self-attention) may fail to
+recover and/or benefit from such information. In this paper, we argue that
+models can have significant gains when spatial structure is preserved during
+tokenization, and is explicitly used during the mixing stage. We propose two
+key contributions: (1) Structure-aware Tokenization and (2) Structure-aware
+Mixing, both of which can be combined with existing models with minimal
+effort. We introduce a family of models (SWAT), showing improvements over the
+likes of DeiT, MLP-Mixer and Swin Transformer, across multiple benchmarks
+including ImageNet classification and ADE20K segmentation. Our code is
+available at https://github.com/kkahatapitiya/SWAT.
+
+&#x0D;
+
+ comment: Accepted to be published at IJCAI23 +
+
+
+
+
+ + ♻ ☆ Balancing stability and plasticity in continual learning: the + readout-decomposition of activation change (RDAC) framework + + +
+ Continual learning (CL) algorithms strive to acquire new knowledge while +preserving prior information. However, this stability-plasticity trade-off +remains a central challenge. This paper introduces a framework that dissects +this trade-off, offering valuable insights into CL algorithms. The +Readout-Decomposition of Activation Change (RDAC) framework first addresses the +stability-plasticity dilemma and its relation to catastrophic forgetting. It +relates learning-induced activation changes in the range of prior readouts to +the degree of stability and changes in the null space to the degree of +plasticity. In deep non-linear networks tackling split-CIFAR-110 tasks, the +framework clarifies the stability-plasticity trade-offs of the popular +regularization algorithms Synaptic intelligence (SI), Elastic-weight +consolidation (EWC), and learning without Forgetting (LwF), and replay-based +algorithms Gradient episodic memory (GEM), and data replay. GEM and data replay +preserved stability and plasticity, while SI, EWC, and LwF traded off +plasticity for stability. The inability of the regularization algorithms to +maintain plasticity was linked to them restricting the change of activations in +the null space of the prior readout. Additionally, for one-hidden-layer linear +neural networks, we derived a gradient decomposition algorithm to restrict +activation change only in the range of the prior readouts, to maintain high +stability while not further sacrificing plasticity. Results demonstrate that +the algorithm maintained stability without significant plasticity loss. The +RDAC framework informs the behavior of existing CL algorithms and paves the way +for novel CL approaches. Finally, it sheds light on the connection between +learning-induced activation/representation changes and the stability-plasticity +dilemma, also offering insights into representational drift in biological +systems. + +
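The readout decomposition itself is straightforward linear algebra for a linear readout: project an activation change onto the row space of a prior readout (the part visible to old outputs, relevant to stability) and onto its orthogonal complement (the null-space part, relevant to plasticity). A NumPy sketch under that linear-readout assumption:

    import numpy as np

    def decompose_activation_change(delta_h, W_readout):
        """Split an activation change into range- and null-space parts of a prior readout.

        delta_h:   [hidden_dim] change in hidden activations after new learning.
        W_readout: [num_outputs, hidden_dim] readout weights of a previously learned task.
        """
        # Orthogonal projector onto the row space of the readout.
        P = W_readout.T @ np.linalg.pinv(W_readout.T)
        range_part = P @ delta_h            # affects old outputs -> stability-relevant
        null_part = delta_h - range_part    # invisible to old readout -> plasticity-relevant
        return range_part, null_part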
+
+ comment: 15 pages, 5 figures, Revision +
+
+
+
+
+ + ♻ ☆ Preserving Patient Privacy in MRI Scans: A Comprehensive Approach with + 3D Masked Autoencoders + + +
+ MRI scans provide valuable medical information; however, they also contain
+sensitive and personally identifiable information (PII) that needs to be
+protected. Whereas MRI metadata is easily sanitized, MRI image data is a
+privacy risk because it contains information to render highly realistic 3D
+visualizations of a patient's head, enabling malicious actors to possibly
+identify the subject by cross-referencing a database. Data anonymization and
+de-identification are concerned with ensuring the privacy and confidentiality
+of individuals' personal information. Traditional MRI de-identification
+methods remove privacy-sensitive parts (e.g., eyes, nose, etc.) from a given
+scan. This comes at the expense of introducing a domain shift that can throw
+off downstream analyses. Recently, a GAN-based approach was proposed to
+de-identify a patient's scan by remodeling it (e.g., changing the face) rather
+than by removing parts. In this work, we propose CP-MAE, a model that
+de-identifies the face using masked autoencoders and that outperforms all
+previous approaches in terms of both downstream task performance and
+de-identification. With our method we are able to synthesize scans of
+resolution up to $256^3$ (previously $128^3$), which constitutes an eight-fold
+increase in the number of voxels. Using our construction we were able to
+design a system that exhibits a highly robust training stage, making it easy
+to fit the network to novel data.
+
+&#x0D;
+
+
+
+
+ + ♻ ☆ SynthEnsemble: A Fusion of CNN, Vision Transformer, and Hybrid Models + for Multi-Label Chest X-Ray Classification + + +
+ Chest X-rays are widely used to diagnose thoracic diseases, but the lack of
+detailed information about these abnormalities makes it challenging to develop
+accurate automated diagnosis systems, which is crucial for early detection and
+effective treatment. To address this challenge, we employed deep learning
+techniques to identify patterns in chest X-rays that correspond to different
+diseases. We conducted experiments on the "ChestX-ray14" dataset using various
+pre-trained CNNs, transformers, hybrid (CNN+Transformer) models and classical
+models. The best individual model was CoAtNet, which achieved an area under
+the receiver operating characteristic curve (AUROC) of 84.2%. By combining the
+predictions of all trained models using a weighted average ensemble, where the
+weight of each model was determined using differential evolution, we further
+improved the AUROC to 85.4%, outperforming other state-of-the-art methods in
+this field. Our findings demonstrate the potential of deep learning
+techniques, particularly ensemble deep learning, for improving the accuracy of
+automatic diagnosis of thoracic diseases from chest X-rays.
+
+&#x0D;
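The weighted-average ensemble with weights chosen by differential evolution can be sketched directly with SciPy and scikit-learn; the bounds, seed, and weight normalization below are illustrative choices rather than the paper's exact settings:

    import numpy as np
    from scipy.optimize import differential_evolution
    from sklearn.metrics import roc_auc_score

    def fit_ensemble_weights(model_probs, y_true):
        """Find non-negative weights for a weighted-average ensemble by maximizing AUROC.

        model_probs: [num_models, num_samples, num_classes] validation probabilities.
        y_true:      [num_samples, num_classes] multi-label ground truth.
        """
        model_probs = np.asarray(model_probs)

        def neg_auroc(w):
            w = np.abs(w) / (np.abs(w).sum() + 1e-12)
            fused = np.tensordot(w, model_probs, axes=1)   # weighted average over models
            return -roc_auc_score(y_true, fused, average="macro")

        result = differential_evolution(neg_auroc, bounds=[(0, 1)] * len(model_probs), seed=0)
        return np.abs(result.x) / np.abs(result.x).sum()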
+
+ comment: Accepted in International Conference on Computer and Information + Technology (ICCIT) 2023 +
+
+
+
+
+ + ♻ ☆ Human Motion Tracking by Registering an Articulated Surface to 3-D + Points and Normals + + +
+ We address the problem of human motion tracking by registering a surface to +3-D data. We propose a method that iteratively computes two things: Maximum +likelihood estimates for both the kinematic and free-motion parameters of a +kinematic human-body representation, as well as probabilities that the data are +assigned either to a body part, or to an outlier cluster. We introduce a new +metric between observed points and normals on one side, and a parameterized +surface on the other side, the latter being defined as a blending over a set of +ellipsoids. We claim that this metric is well suited when one deals with either +visual-hull or visual-shape observations. We illustrate the method by tracking +human motions using sparse visual-shape data (3-D surface points and normals) +gathered from imperfect silhouettes. + +
+
+
+
+
+ + ♻ ☆ Image Matching with Scale Adjustment + + +
+ In this paper we address the problem of matching two images with two +different resolutions: a high-resolution image and a low-resolution one. The +difference in resolution between the two images is not known and without loss +of generality one of the images is assumed to be the high-resolution one. On +the premise that changes in resolution act as a smoothing equivalent to changes +in scale, a scale-space representation of the high-resolution image is +produced. Hence the one-to-one classical image matching paradigm becomes +one-to-many because the low-resolution image is compared with all the +scale-space representations of the high-resolution one. Key to the success of +such a process is the proper representation of the features to be matched in +scale-space. We show how to represent and extract interest points at variable +scales and we devise a method allowing the comparison of two images at two +different resolutions. The method comprises the use of photometric- and +rotation-invariant descriptors, a geometric model mapping the high-resolution +image onto a low-resolution image region, and an image matching strategy based +on local constraints and on the robust estimation of this geometric model. +Extensive experiments show that our matching method can be used for scale +changes up to a factor of 6. + +
+
+
+
+
+ + ♻ ☆ 3D SA-UNet: 3D Spatial Attention UNet with 3D ASPP for White Matter + Hyperintensities Segmentation + + +
+ White Matter Hyperintensity (WMH) is an imaging feature related to various
+diseases such as dementia and stroke. Accurately segmenting WMH using computer
+technology is crucial for early disease diagnosis. However, this task remains
+challenging due to the small lesions with low contrast and high discontinuity
+in the images, which contain limited contextual and spatial information. To
+address this challenge, we propose a deep learning model called 3D Spatial
+Attention U-Net (3D SA-UNet) for automatic WMH segmentation using only
+Fluid-Attenuated Inversion Recovery (FLAIR) scans. The 3D SA-UNet introduces a
+3D Spatial Attention Module that highlights important lesion features, such as
+WMH, while suppressing unimportant regions. Additionally, to capture features
+at different scales, we extend the Atrous Spatial Pyramid Pooling (ASPP)
+module to a 3D version, enhancing the segmentation performance of the network.
+We evaluate our method on a publicly available dataset and demonstrate the
+effectiveness of the 3D spatial attention module and 3D ASPP in WMH
+segmentation. Experimental results demonstrate that the proposed 3D SA-UNet
+achieves higher accuracy than other state-of-the-art 3D convolutional neural
+networks.
+
+&#x0D;
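A 3D spatial attention module in the CBAM style (channel-wise mean/max pooling, a 3D convolution, and sigmoid gating) conveys the idea; the kernel size and exact design here are assumptions rather than the paper's implementation:

    import torch
    import torch.nn as nn

    class SpatialAttention3D(nn.Module):
        """CBAM-style 3D spatial attention: pool over channels, convolve, gate."""
        def __init__(self, kernel_size=7):
            super().__init__()
            self.conv = nn.Conv3d(2, 1, kernel_size, padding=kernel_size // 2)

        def forward(self, x):
            # x: [B, C, D, H, W]
            avg_pool = x.mean(dim=1, keepdim=True)
            max_pool = x.max(dim=1, keepdim=True).values
            attn = torch.sigmoid(self.conv(torch.cat([avg_pool, max_pool], dim=1)))
            return x * attn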
+
+
+
+
+ + ♻ ☆ Scattering Vision Transformer: Spectral Mixing Matters NeurIPS 2023 + + +
+ Vision transformers have gained significant attention and achieved
+state-of-the-art performance in various computer vision tasks, including image
+classification, instance segmentation, and object detection. However,
+challenges remain in addressing attention complexity and effectively capturing
+fine-grained information within images. Existing solutions often resort to
+down-sampling operations, such as pooling, to reduce computational cost.
+Unfortunately, such operations are non-invertible and can result in
+information loss. In this paper, we present a novel approach called Scattering
+Vision Transformer (SVT) to tackle these challenges. SVT incorporates a
+spectrally scattering network that enables the capture of intricate image
+details. SVT overcomes the invertibility issue associated with down-sampling
+operations by separating low-frequency and high-frequency components.
+Furthermore, SVT introduces a unique spectral gating network utilizing
+Einstein multiplication for token and channel mixing, effectively reducing
+complexity. We show that SVT achieves state-of-the-art performance on the
+ImageNet dataset with a significant reduction in the number of parameters and
+FLOPs. SVT shows a 2% improvement over LiTv2 and iFormer. SVT-H-S reaches
+84.2% top-1 accuracy, while SVT-H-B reaches 85.2% (state-of-the-art for base
+versions) and SVT-H-L reaches 85.7% (again state-of-the-art for large
+versions). SVT also shows comparable results in other vision tasks such as
+instance segmentation. SVT also outperforms other transformers in transfer
+learning on standard datasets such as CIFAR10, CIFAR100, Oxford Flowers, and
+Stanford Cars. The project page is available at
+https://badripatro.github.io/svt/.
+
+&#x0D;
+
+ comment: Accepted @NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ TRACE: 5D Temporal Regression of Avatars with Dynamic Cameras in 3D + Environments + + +
+ Although the estimation of 3D human pose and shape (HPS) is rapidly +progressing, current methods still cannot reliably estimate moving humans in +global coordinates, which is critical for many applications. This is +particularly challenging when the camera is also moving, entangling human and +camera motion. To address these issues, we adopt a novel 5D representation +(space, time, and identity) that enables end-to-end reasoning about people in +scenes. Our method, called TRACE, introduces several novel architectural +components. Most importantly, it uses two new "maps" to reason about the 3D +trajectory of people over time in camera, and world, coordinates. An additional +memory unit enables persistent tracking of people even during long occlusions. +TRACE is the first one-stage method to jointly recover and track 3D humans in +global coordinates from dynamic cameras. By training it end-to-end, and using +full image information, TRACE achieves state-of-the-art performance on tracking +and HPS benchmarks. The code and dataset are released for research purposes. + +
+
+ comment: Project page: https://www.yusun.work/TRACE/TRACE.html +
+
+
+
+
+ + ♻ ☆ Adversarial Examples Are Not Real Features NeurIPS 2023 + + +
+ The existence of adversarial examples has been a mystery for years and has
+attracted much interest. A well-known theory by Ilyas et al. (2019) explains
+adversarial vulnerability from a data perspective by showing that one can
+extract non-robust features from adversarial examples and that these features
+alone are useful for classification. However, the explanation remains quite
+counter-intuitive since non-robust features are mostly noise features to
+humans. In this paper, we re-examine the theory from a larger context by
+incorporating multiple learning paradigms. Notably, we find that contrary to
+their good usefulness under supervised learning, non-robust features attain
+poor usefulness when transferred to other self-supervised learning paradigms,
+such as contrastive learning, masked image modeling, and diffusion models. It
+reveals that non-robust features are not really as useful as robust or natural
+features that enjoy good transferability between these paradigms. Meanwhile,
+for robustness, we also show that naturally trained encoders from robust
+features are largely non-robust under AutoAttack. Our cross-paradigm
+examination suggests that non-robust features are not really useful but are
+more like paradigm-wise shortcuts, and robust features alone might be
+insufficient to attain reliable model robustness. Code is available at
+https://github.com/PKU-ML/AdvNotRealFeatures.
+
+&#x0D;
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Women Wearing Lipstick: Measuring the Bias Between an Object and Its + Related Gender EMNLP + + +
+ In this paper, we investigate the impact of objects on gender bias in image
+captioning systems. Our results show that only gender-specific objects have a
+strong gender bias (e.g., women-lipstick). In addition, we propose a visual
+semantic-based gender score that measures the degree of bias and can be used
+as a plug-in for any image captioning system. Our experiments demonstrate the
+utility of the gender score, since we observe that our score can measure the
+bias relation between a caption and its related gender; therefore, our score
+can be used as an additional metric to the existing Object Gender Co-Occ
+approach. Code and data are publicly available at
+https://github.com/ahmedssabir/GenderScore.
+
+&#x0D;
+
+ comment: EMNLP Findings 2023 +
+
+
+
+
+ + ♻ ☆ TokenFlow: Consistent Diffusion Features for Consistent Video Editing + + +
+ The generative AI revolution has recently expanded to videos. Nevertheless, +current state-of-the-art video models are still lagging behind image models in +terms of visual quality and user control over the generated content. In this +work, we present a framework that harnesses the power of a text-to-image +diffusion model for the task of text-driven video editing. Specifically, given +a source video and a target text-prompt, our method generates a high-quality +video that adheres to the target text, while preserving the spatial layout and +motion of the input video. Our method is based on a key observation that +consistency in the edited video can be obtained by enforcing consistency in the +diffusion feature space. We achieve this by explicitly propagating diffusion +features based on inter-frame correspondences, readily available in the model. +Thus, our framework does not require any training or fine-tuning, and can work +in conjunction with any off-the-shelf text-to-image editing method. We +demonstrate state-of-the-art editing results on a variety of real-world videos. +Webpage: https://diffusion-tokenflow.github.io/ + +
+
+
+
+
+ + ♻ ☆ Towards Hierarchical Regional Transformer-based Multiple Instance + Learning + + +
+ The classification of gigapixel histopathology images with deep multiple +instance learning models has become a critical task in digital pathology and +precision medicine. In this work, we propose a Transformer-based multiple +instance learning approach that replaces the traditional learned attention +mechanism with a regional, Vision Transformer inspired self-attention +mechanism. We present a method that fuses regional patch information to derive +slide-level predictions and show how this regional aggregation can be stacked +to hierarchically process features on different distance levels. To increase +predictive accuracy, especially for datasets with small, local morphological +features, we introduce a method to focus the image processing on high attention +regions during inference. Our approach is able to significantly improve +performance over the baseline on two histopathology datasets and points towards +promising directions for further research. + +
+
+ comment: 8 pages, LaTeX; header update after published, fixed typos +
+
+
+
+
+ + ♻ ☆ Joint covariance property under geometric image transformations for + spatio-temporal receptive fields according to the generalized Gaussian + derivative model for visual receptive fields + + +
+ The influence of natural image transformations on receptive field responses +is crucial for modelling visual operations in computer vision and biological +vision. In this regard, covariance properties with respect to geometric image +transformations in the earliest layers of the visual hierarchy are essential +for expressing robust image operations and for formulating invariant visual +operations at higher levels. This paper defines and proves a joint covariance +property under compositions of spatial scaling transformations, spatial affine +transformations, Galilean transformations and temporal scaling transformations, +which makes it possible to characterize how different types of image +transformations interact with each other. Specifically, the derived relations +show how the receptive field parameters need to be transformed, in order to +match the output from spatio-temporal receptive fields with the underlying +spatio-temporal image transformations. + +
+
+ comment: 7 pages +
+
+
+
+
+ + ♻ ☆ Learning Environment-Aware Affordance for 3D Articulated Object + Manipulation under Occlusions NeurIPS + 2023 + + +
+ Perceiving and manipulating 3D articulated objects in diverse environments is +essential for home-assistant robots. Recent studies have shown that point-level +affordance provides actionable priors for downstream manipulation tasks. +However, existing works primarily focus on single-object scenarios with +homogeneous agents, overlooking the realistic constraints imposed by the +environment and the agent's morphology, e.g., occlusions and physical +limitations. In this paper, we propose an environment-aware affordance +framework that incorporates both object-level actionable priors and environment +constraints. Unlike object-centric affordance approaches, learning +environment-aware affordance faces the challenge of combinatorial explosion due +to the complexity of various occlusions, characterized by their quantities, +geometries, positions and poses. To address this and enhance data efficiency, +we introduce a novel contrastive affordance learning framework capable of +training on scenes containing a single occluder and generalizing to scenes with +complex occluder combinations. Experiments demonstrate the effectiveness of our +proposed approach in learning affordance considering environment constraints. +Project page at https://chengkaiacademycity.github.io/EnvAwareAfford/ + +
+
+ comment: In 37th Conference on Neural Information Processing Systems (NeurIPS + 2023). Website at https://chengkaiacademycity.github.io/EnvAwareAfford/ +
+
+
+
+
+ + ♻ ☆ DragonDiffusion: Enabling Drag-style Manipulation on Diffusion Models + + +
+ Despite the ability of existing large-scale text-to-image (T2I) models to +generate high-quality images from detailed textual descriptions, they often +lack the ability to precisely edit the generated or real images. In this paper, +we propose a novel image editing method, DragonDiffusion, enabling Drag-style +manipulation on Diffusion models. Specifically, we construct classifier +guidance based on the strong correspondence of intermediate features in the +diffusion model. It can transform the editing signals into gradients via +feature correspondence loss to modify the intermediate representation of the +diffusion model. Based on this guidance strategy, we also build a multi-scale +guidance to consider both semantic and geometric alignment. Moreover, a +cross-branch self-attention is added to maintain the consistency between the +original image and the editing result. Our method, through an efficient design, +achieves various editing modes for the generated or real images, such as object +moving, object resizing, object appearance replacement, and content dragging. +It is worth noting that all editing and content preservation signals come from +the image itself, and the model does not require fine-tuning or additional +modules. Our source code will be available at +https://github.com/MC-E/DragonDiffusion. + +
+
+
+
+
+ + ♻ ☆ A Unified Framework for 3D Point Cloud Visual Grounding + + +
+ Thanks to its precise spatial referencing, 3D point cloud visual grounding
+is essential for deep understanding and dynamic interaction in 3D
+environments, encompassing 3D Referring Expression Comprehension (3DREC) and
+Segmentation (3DRES). We argue that 3DREC and 3DRES should be unified in one
+framework, which is also a natural progression in the community. To explain,
+3DREC helps 3DRES locate the referent, while 3DRES also facilitates 3DREC via
+more fine-grained language-visual alignment. To achieve this, this paper takes
+an initial step to integrate 3DREC and 3DRES into a unified framework, termed
+3D Referring Transformer (3DRefTR). Its key idea is to build upon a mature
+3DREC model and leverage ready query embeddings and visual tokens from the
+3DREC model to construct a dedicated mask branch. Specifically, we propose a
+Superpoint Mask Branch, which serves a dual purpose: i) by harnessing the
+inherent association between the superpoints and the point cloud, it
+eliminates the heavy computational overhead of upsampling high-resolution
+visual features; ii) by leveraging heterogeneous CPU-GPU parallelism, while
+the GPU is occupied generating visual and language tokens, the CPU
+concurrently produces superpoints, equivalently accomplishing the upsampling
+computation. This elaborate design enables 3DRefTR to achieve both
+well-performing 3DRES and 3DREC capacities with only a 6% additional latency
+compared to the original 3DREC model. Empirical evaluations affirm the
+superiority of 3DRefTR. Specifically, on the ScanRefer dataset, 3DRefTR
+surpasses the state-of-the-art 3DRES method by 12.43% in mIoU and improves
+upon the SOTA 3DREC method by 0.6% Acc@0.25IoU. The codes and models will be
+released soon.
+
+&#x0D;
+
+
+
+
+ + ♻ ☆ Estimating the Generalization in Deep Neural Networks via Sparsity + + +
+ Generalization is the key capability of deep neural networks (DNNs).
+However, it is challenging to give a reliable measure of the generalization
+ability of a DNN from its intrinsic properties alone. In this paper, we
+propose a novel method for estimating the generalization gap based on network
+sparsity. In our method, we first propose two key quantities that have a close
+relationship with generalization ability and can be calculated directly from
+the training results alone. A simple linear model involving these two key
+quantities is then constructed to give an accurate estimate of the
+generalization gap. By training DNNs with a wide range of generalization gaps
+on popular datasets, we show that our key quantities and linear model can be
+efficient tools for estimating the generalization gap of DNNs.
+
+&#x0D;
+
+
+
+
+ + ♻ ☆ Role Taxonomy of Units in Deep Neural Networks + + +
+ Identifying the roles of network units in deep neural networks (DNNs) is
+ critical in many respects, including understanding the mechanisms of DNNs and
+ building basic connections between deep learning and neuroscience. However, it
+ remains unclear which roles units play in DNNs with different generalization
+ abilities. To this end, we give a role taxonomy of units in DNNs by introducing
+ the retrieval-of-function test, where units are categorized into four types in
+ terms of their functional preference on the training set and the testing set
+ separately. We show that the ratios of the four categories are highly
+ associated with the generalization ability of DNNs from two distinct
+ perspectives, based on which we give indicators of DNNs that generalize well.
+
+
+
+
+
+ + ♻ ☆ Rethinking the Backward Propagation for Adversarial Transferability NeurIPS 2023 + + +
+ Transfer-based attacks generate adversarial examples on the surrogate model, +which can mislead other black-box models without access, making it promising to +attack real-world applications. Recently, several works have been proposed to +boost adversarial transferability, in which the surrogate model is usually +overlooked. In this work, we identify that non-linear layers (e.g., ReLU, +max-pooling, etc.) truncate the gradient during backward propagation, making +the gradient w.r.t. input image imprecise to the loss function. We hypothesize +and empirically validate that such truncation undermines the transferability of +adversarial examples. Based on these findings, we propose a novel method called +Backward Propagation Attack (BPA) to increase the relevance between the +gradient w.r.t. input image and loss function so as to generate adversarial +examples with higher transferability. Specifically, BPA adopts a non-monotonic +function as the derivative of ReLU and incorporates softmax with temperature to +smooth the derivative of max-pooling, thereby mitigating the information loss +during the backward propagation of gradients. Empirical results on the ImageNet +dataset demonstrate that not only does our method substantially boost the +adversarial transferability, but it is also general to existing transfer-based +attacks. Code is available at https://github.com/Trustworthy-AI-Group/RPA. + +
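+ To illustrate the kind of smoothing described for max-pooling, the sketch below
+ distributes the pooling gradient over a window with a temperature softmax instead of
+ routing it solely to the argmax. The window size, temperature, and exact functional
+ form are assumptions for illustration, not the authors' implementation.
+ import numpy as np
+
+ def softmax_pool_backward(window, upstream_grad, temperature=10.0):
+     """Distribute the gradient of a max-pooling window with a temperature softmax
+     instead of routing it only to the argmax (a smoothed backward pass in the
+     spirit of the abstract above; the exact form is an assumption)."""
+     w = window.reshape(-1)
+     weights = np.exp(temperature * (w - w.max()))
+     weights /= weights.sum()
+     return (upstream_grad * weights).reshape(window.shape)
+
+ window = np.array([[0.9, 1.0], [0.2, 0.95]])
+ print(softmax_pool_backward(window, upstream_grad=1.0))
+ # The true maximum still receives the largest share, but near-maxima keep a
+ # non-zero gradient, which is the information that hard max-pooling discards.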
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Explicit3D: Graph Network with Spatial Inference for Single Image 3D + Object Detection + + +
+ Indoor 3D object detection is an essential task in single-image scene
+ understanding and is fundamental to spatial cognition in visual reasoning.
+ Existing works on 3D object detection from a single image either pursue this
+ goal through independent predictions of each object or implicitly reason over
+ all possible objects, failing to harness relational geometric information
+ between objects. To address this problem, we propose a dynamic sparse graph
+ pipeline named Explicit3D based on object geometry and semantic features.
+ Taking efficiency into consideration, we further define a relatedness score and
+ design a novel dynamic pruning algorithm followed by a cluster sampling method
+ for sparse scene graph generation and updating. Furthermore, our Explicit3D
+ introduces homogeneous matrices and defines a new relative loss and corner loss
+ to model the spatial difference between target pairs explicitly. Instead of
+ using ground-truth labels as direct supervision, our relative and corner losses
+ are derived from the homogeneous transformation, which encourages the model to
+ learn the geometric consistency between objects. The experimental results on
+ the SUN RGB-D dataset demonstrate that our Explicit3D achieves a better
+ performance balance than the state of the art.
+
+
+
+
+
+ + ♻ ☆ Domain Transfer in Latent Space (DTLS) Wins on Image Super-Resolution -- + a Non-Denoising Model + + +
+ Large-scale image super-resolution is a challenging computer vision task, since
+ vast amounts of information are missing in a highly degraded image, for example
+ in scale x16 super-resolution. Diffusion models have been used successfully in
+ recent years for extreme super-resolution applications, in which Gaussian noise
+ is used as a means to form a latent photo-realistic space, and acts as a link
+ between the space of latent vectors and the latent photo-realistic space. Much
+ of the success of diffusion models rests on sophisticated mathematical
+ derivations for mapping the statistics of Gaussian noise. In this paper we
+ propose a simple approach which dispenses with Gaussian noise but adopts some
+ basic structures of diffusion models for efficient image super-resolution.
+ Essentially, we propose a DNN to perform domain transfer between neighboring
+ domains, which can learn the differences in statistical properties to
+ facilitate gradual interpolation with results of reasonable quality. Further
+ quality improvement is achieved by conditioning the domain transfer with
+ reference to the input LR image. Experimental results show that our method
+ outperforms not only state-of-the-art large-scale super-resolution models, but
+ also the current diffusion models for image super-resolution. The approach can
+ readily be extended to other image-to-image tasks, such as image enlightening,
+ inpainting, denoising, etc.
+
+
+
+
+
+ + ♻ ☆ Online Arbitrary Shaped Clustering through Correlated Gaussian Functions + + +
+ There is no convincing evidence that backpropagation is a biologically +plausible mechanism, and further studies of alternative learning methods are +needed. A novel online clustering algorithm is presented that can produce +arbitrary shaped clusters from inputs in an unsupervised manner, and requires +no prior knowledge of the number of clusters in the input data. This is +achieved by finding correlated outputs from functions that capture commonly +occurring input patterns. The algorithm can be deemed more biologically +plausible than model optimization through backpropagation, although practical +applicability may require additional research. However, the method yields +satisfactory results on several toy datasets on a noteworthy range of +hyperparameters. + +
+
+ comment: Corrected uniform distribution range; removed "average" from last + sentence in section 4 +
+
+
+
+
+ + ♻ ☆ Gaze Estimation on Spresense + + +
+ Gaze estimation is a valuable technology with numerous applications in fields +such as human-computer interaction, virtual reality, and medicine. This report +presents the implementation of a gaze estimation system using the Sony +Spresense microcontroller board and explores its performance in latency, +MAC/cycle, and power consumption. The report also provides insights into the +system's architecture, including the gaze estimation model used. Additionally, +a demonstration of the system is presented, showcasing its functionality and +performance. Our lightweight model TinyTrackerS is a mere 169Kb in size, using +85.8k parameters and runs on the Spresense platform at 3 FPS. + +
+
+
+
+
+ + ♻ ☆ TinyTracker: Ultra-Fast and Ultra-Low-Power Edge Vision In-Sensor for + Gaze Estimation + + +
+ Intelligent edge vision tasks encounter the critical challenge of ensuring
+ power and latency efficiency due to the typically heavy computational load they
+ impose on edge platforms. This work leverages one of the first "AI in sensor"
+ vision platforms, IMX500 by Sony, to achieve ultra-fast and ultra-low-power
+ end-to-end edge vision applications. We evaluate the IMX500 and compare it to
+ other edge platforms, such as the Google Coral Dev Micro and Sony Spresense, by
+ exploring gaze estimation as a case study. We propose TinyTracker, a highly
+ efficient, fully quantized model for 2D gaze estimation designed to maximize
+ the performance of the edge vision systems considered in this study.
+ TinyTracker achieves a 41x size reduction (600Kb) compared to iTracker [1]
+ without significant loss in gaze estimation accuracy (maximum of 0.16 cm when
+ fully quantized). TinyTracker's deployment on the Sony IMX500 vision sensor
+ results in end-to-end latency of around 19ms. The camera takes around 17.9ms to
+ read, process and transmit the pixels to the accelerator. The inference time of
+ the network is 0.86ms with an additional 0.24 ms for retrieving the results
+ from the sensor. The overall energy consumption of the end-to-end system is 4.9
+ mJ, including 0.06 mJ for inference. The end-to-end study shows that IMX500 is
+ 1.7x faster than CoralMicro (19ms vs 34.4ms) and 7x more power efficient
+ (4.9 mJ vs 34.2 mJ).
+
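+ The quoted latency and energy figures can be cross-checked with simple arithmetic;
+ the snippet below only re-adds the numbers reported in the abstract.
+ stage_ms = {"sensor read/processing/transmission": 17.9,
+             "network inference": 0.86,
+             "result readout": 0.24}
+ print(f"end-to-end latency ~ {sum(stage_ms.values()):.1f} ms")     # ~19.0 ms
+ print(f"inference share of the 4.9 mJ budget: {0.06 / 4.9:.1%}")   # ~1.2%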
+
+
+
+
+ + ♻ ☆ SparseTrack: Multi-Object Tracking by Performing Scene Decomposition + based on Pseudo-Depth + + +
+ Exploring robust and efficient association methods has always been an +important issue in multiple-object tracking (MOT). Although existing tracking +methods have achieved impressive performance, congestion and frequent +occlusions still pose challenging problems in multi-object tracking. We reveal +that performing sparse decomposition on dense scenes is a crucial step to +enhance the performance of associating occluded targets. To this end, we +propose a pseudo-depth estimation method for obtaining the relative depth of +targets from 2D images. Secondly, we design a depth cascading matching (DCM) +algorithm, which can use the obtained depth information to convert a dense +target set into multiple sparse target subsets and perform data association on +these sparse target subsets in order from near to far. By integrating the +pseudo-depth method and the DCM strategy into the data association process, we +propose a new tracker, called SparseTrack. SparseTrack provides a new +perspective for solving the challenging crowded scene MOT problem. Only using +IoU matching, SparseTrack achieves comparable performance with the +state-of-the-art (SOTA) methods on the MOT17 and MOT20 benchmarks. Code and +models are publicly available at \url{https://github.com/hustvl/SparseTrack}. + +
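+ A toy sketch of the depth-cascaded matching idea: detections are sorted by
+ pseudo-depth, split into sparse subsets, and associated with tracks by greedy IoU
+ matching from near to far. The number of levels, the IoU threshold, and the greedy
+ matcher are simplifying assumptions; SparseTrack's released code is the
+ authoritative reference.
+ import numpy as np
+
+ def iou(a, b):
+     """IoU of two boxes in (x1, y1, x2, y2) format."""
+     x1, y1 = max(a[0], b[0]), max(a[1], b[1])
+     x2, y2 = min(a[2], b[2]), min(a[3], b[3])
+     inter = max(0.0, x2 - x1) * max(0.0, y2 - y1)
+     area = lambda box: (box[2] - box[0]) * (box[3] - box[1])
+     return inter / (area(a) + area(b) - inter + 1e-9)
+
+ def depth_cascaded_match(tracks, dets, det_depths, n_levels=3, iou_thr=0.3):
+     """Greedy IoU association performed level by level from near to far.
+     tracks/dets: lists of boxes; det_depths: pseudo-depth per detection (assumed given)."""
+     order = np.argsort(det_depths)               # near to far
+     levels = np.array_split(order, n_levels)     # sparse detection subsets
+     matches, free_tracks = [], list(range(len(tracks)))
+     for level in levels:
+         for d in level:
+             if not free_tracks:
+                 return matches
+             ious = [iou(tracks[t], dets[d]) for t in free_tracks]
+             best = int(np.argmax(ious))
+             if ious[best] >= iou_thr:
+                 matches.append((free_tracks.pop(best), int(d)))
+     return matches
+
+ tracks = [[0, 0, 10, 10], [20, 20, 30, 30]]
+ dets = [[1, 1, 11, 11], [21, 19, 31, 29]]
+ print(depth_cascaded_match(tracks, dets, det_depths=np.array([5.0, 12.0])))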
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Avatar Knowledge Distillation: Self-ensemble Teacher Paradigm with + Uncertainty ACM MM 2023 + + +
+ Knowledge distillation is an effective paradigm for boosting the performance of
+ pocket-size models; when multiple teacher models are available, the student can
+ push its upper limit even further. However, it is not economical to train
+ diverse teacher models for a single, disposable distillation run. In this
+ paper, we introduce a new concept dubbed Avatars for distillation: inference
+ ensemble models derived from the teacher. Concretely, (1) for each iteration of
+ distillation training, various Avatars are generated by a perturbation
+ transformation. We validate that Avatars have a higher upper limit of working
+ capacity and teaching ability, aiding the student model in learning diverse and
+ receptive knowledge perspectives from the teacher model. (2) During
+ distillation, we propose an uncertainty-aware factor, derived from the variance
+ of statistical differences between the vanilla teacher and the Avatars, to
+ adaptively adjust the Avatars' contribution to knowledge transfer. Avatar
+ Knowledge Distillation (AKD) is fundamentally different from existing methods
+ and refines them with the innovative view of unequal training. Comprehensive
+ experiments demonstrate the effectiveness of our Avatars mechanism, which
+ improves state-of-the-art distillation methods for dense prediction without
+ extra computational cost. AKD brings gains of up to 0.7 AP on COCO 2017 for
+ object detection and 1.83 mIoU on Cityscapes for semantic segmentation.
+
+
+ comment: Accepted by ACM MM 2023 +
+
+
+
+
+ + ♻ ☆ Accurate and Efficient Stereo Matching via Attention Concatenation + Volume + + +
+ Stereo matching is a fundamental building block for many vision and robotics
+ applications. An informative and concise cost volume representation is vital
+ for stereo matching of high accuracy and efficiency. In this paper, we present
+ a novel cost volume construction method, named attention concatenation volume
+ (ACV), which generates attention weights from correlation clues to suppress
+ redundant information and enhance matching-related information in the
+ concatenation volume. The ACV can be seamlessly embedded into most stereo
+ matching networks, and the resulting networks can use a more lightweight
+ aggregation network while achieving higher accuracy. We further design a fast
+ version of ACV to enable real-time performance, named Fast-ACV, which generates
+ high-likelihood disparity hypotheses and the corresponding attention weights
+ from low-resolution correlation clues to significantly reduce computational and
+ memory cost while maintaining satisfactory accuracy. The core idea of our
+ Fast-ACV is volume attention propagation (VAP), which can automatically select
+ accurate correlation values from an upsampled correlation volume and propagate
+ these accurate values to the surrounding pixels with ambiguous correlation
+ clues. Furthermore, we design a highly accurate network ACVNet and a real-time
+ network Fast-ACVNet based on our ACV and Fast-ACV respectively, which achieve
+ state-of-the-art performance on several benchmarks (i.e., our ACVNet ranks 2nd
+ on KITTI 2015 and Scene Flow, and 3rd on KITTI 2012 and ETH3D among all
+ published methods; our Fast-ACVNet outperforms almost all state-of-the-art
+ real-time methods on Scene Flow, KITTI 2012 and 2015 while also having better
+ generalization ability).
+
+
+ comment: Accepted to TPAMI 2023. arXiv admin note: substantial text overlap + with arXiv:2203.02146 +
+
+
+
+
+ + ♻ ☆ Enhancing Robust Representation in Adversarial Training: Alignment and + Exclusion Criteria + + +
+ Deep neural networks are vulnerable to adversarial noise. Adversarial Training
+ (AT) has been demonstrated to be the most effective defense strategy to protect
+ neural networks from being fooled. However, we find that AT fails to learn
+ robust features, resulting in poor adversarial robustness. To address this
+ issue, we highlight two criteria of robust representation: (1) Exclusion:
+ \emph{the features of examples stay away from those of other classes};
+ (2) Alignment: \emph{the features of natural examples and their corresponding
+ adversarial examples are close to each other}. These motivate us to propose a
+ generic AT framework for gaining robust representations, based on an asymmetric
+ negative contrast and reverse attention. Specifically, we design an asymmetric
+ negative contrast based on predicted probabilities to push apart examples of
+ different classes in the feature space. Moreover, we propose to weight features
+ by the parameters of the linear classifier as reverse attention, to obtain
+ class-aware features and pull together features of the same class. Empirical
+ evaluations on three benchmark datasets show that our methods greatly advance
+ the robustness of AT and achieve state-of-the-art performance.
+
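+ A small sketch of the reverse-attention idea as described above: features are
+ reweighted by the linear classifier's parameters for each sample's class to obtain
+ class-aware features. The absolute-value weighting and normalisation are assumptions
+ made for illustration, not the paper's exact formulation.
+ import numpy as np
+
+ def reverse_attention(features, classifier_weights, labels):
+     """Weight each feature dimension by the classifier parameters of the sample's
+     class (illustrative version of the reverse-attention idea).
+
+     features: (N, D), classifier_weights: (C, D), labels: (N,)
+     """
+     w = np.abs(classifier_weights[labels])         # (N, D) importance per dimension
+     w = w / (w.sum(axis=1, keepdims=True) + 1e-9)  # normalise to a per-sample attention
+     return features * w
+
+ feats = np.random.randn(5, 16)
+ W = np.random.randn(3, 16)
+ labels = np.array([0, 2, 1, 0, 2])
+ print(reverse_attention(feats, W, labels).shape)   # (5, 16)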
+
+ comment: 10 pages, 9 figures, Submitted to TIFS +
+
+
+
+
+ + ♻ ☆ UIT-Saviors at MEDVQA-GI 2023: Improving Multimodal Learning with Image + Enhancement for Gastrointestinal Visual Question Answering + + +
+ In recent years, artificial intelligence has played an important role in
+ medicine and disease diagnosis, with many applications, one of which is Medical
+ Visual Question Answering (MedVQA). By combining computer vision and natural
+ language processing, MedVQA systems can assist experts in extracting relevant
+ information from medical images based on a given question and providing precise
+ diagnostic answers. The ImageCLEFmed-MEDVQA-GI-2023 challenge carried out a
+ visual question answering task in the gastrointestinal domain, which includes
+ gastroscopy and colonoscopy images. Our team approached Task 1 of the challenge
+ by proposing a multimodal learning method with image enhancement to improve the
+ VQA performance on gastrointestinal images. The multimodal architecture is set
+ up with a BERT encoder and different pre-trained vision models based on
+ convolutional neural network (CNN) and Transformer architectures for feature
+ extraction from the question and the endoscopy image. The result of this study
+ highlights the dominance of Transformer-based vision models over the CNNs and
+ demonstrates the effectiveness of the image enhancement process, with six out
+ of the eight vision models achieving a better F1-score. Our best method, which
+ takes advantage of BERT+BEiT fusion and image enhancement, achieves up to
+ 87.25% accuracy and a 91.85% F1-score on the development test set, while also
+ producing good results on the private test set with an accuracy of 82.01%.
+
+
+ comment: ImageCLEF2023 published version: + https://ceur-ws.org/Vol-3497/paper-129.pdf +
+
+
+
+
+ + ♻ ☆ Battle of the Backbones: A Large-Scale Comparison of Pretrained Models + across Computer Vision Tasks NeurIPS 2023 + + +
+ Neural network based computer vision systems are typically built on a +backbone, a pretrained or randomly initialized feature extractor. Several years +ago, the default option was an ImageNet-trained convolutional neural network. +However, the recent past has seen the emergence of countless backbones +pretrained using various algorithms and datasets. While this abundance of +choice has led to performance increases for a range of systems, it is difficult +for practitioners to make informed decisions about which backbone to choose. +Battle of the Backbones (BoB) makes this choice easier by benchmarking a +diverse suite of pretrained models, including vision-language models, those +trained via self-supervised learning, and the Stable Diffusion backbone, across +a diverse set of computer vision tasks ranging from classification to object +detection to OOD generalization and more. Furthermore, BoB sheds light on +promising directions for the research community to advance computer vision by +illuminating strengths and weakness of existing approaches through a +comprehensive analysis conducted on more than 1500 training runs. While vision +transformers (ViTs) and self-supervised learning (SSL) are increasingly +popular, we find that convolutional neural networks pretrained in a supervised +fashion on large training sets still perform best on most tasks among the +models we consider. Moreover, in apples-to-apples comparisons on the same +architectures and similarly sized pretraining datasets, we find that SSL +backbones are highly competitive, indicating that future works should perform +SSL pretraining with advanced architectures and larger pretraining datasets. We +release the raw results of our experiments along with code that allows +researchers to put their own backbones through the gauntlet here: +https://github.com/hsouri/Battle-of-the-Backbones + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Transcript to Video: Efficient Clip Sequencing from Texts + + +
+ Among numerous videos shared on the web, well-edited ones always attract more +attention. However, it is difficult for inexperienced users to make well-edited +videos because it requires professional expertise and immense manual labor. To +meet the demands for non-experts, we present Transcript-to-Video -- a +weakly-supervised framework that uses texts as input to automatically create +video sequences from an extensive collection of shots. Specifically, we propose +a Content Retrieval Module and a Temporal Coherent Module to learn +visual-language representations and model shot sequencing styles, respectively. +For fast inference, we introduce an efficient search strategy for real-time +video clip sequencing. Quantitative results and user studies demonstrate +empirically that the proposed learning framework can retrieve content-relevant +shots while creating plausible video sequences in terms of style. Besides, the +run-time performance analysis shows that our framework can support real-world +applications. + +
+
+ comment: Tech Report; Demo and project page at + http://www.xiongyu.me/projects/transcript2video/ +
+
+
+
+
+ + ♻ ☆ LymphoML: An interpretable artificial intelligence-based method + identifies morphologic features that correlate with lymphoma subtype + + +
+ The accurate classification of lymphoma subtypes using hematoxylin and eosin +(H&E)-stained tissue is complicated by the wide range of morphological features +these cancers can exhibit. We present LymphoML - an interpretable machine +learning method that identifies morphologic features that correlate with +lymphoma subtypes. Our method applies steps to process H&E-stained tissue +microarray cores, segment nuclei and cells, compute features encompassing +morphology, texture, and architecture, and train gradient-boosted models to +make diagnostic predictions. LymphoML's interpretable models, developed on a +limited volume of H&E-stained tissue, achieve non-inferior diagnostic accuracy +to pathologists using whole-slide images and outperform black box deep-learning +on a dataset of 670 cases from Guatemala spanning 8 lymphoma subtypes. Using +SHapley Additive exPlanation (SHAP) analysis, we assess the impact of each +feature on model prediction and find that nuclear shape features are most +discriminative for DLBCL (F1-score: 78.7%) and classical Hodgkin lymphoma +(F1-score: 74.5%). Finally, we provide the first demonstration that a model +combining features from H&E-stained tissue with features from a standardized +panel of 6 immunostains results in a similar diagnostic accuracy (85.3%) to a +46-stain panel (86.1%). + +
+
+ comment: To be published in Proceedings of the 3rd Machine Learning for Health + symposium, Proceedings of Machine Learning Research (PMLR) +
+
+
+
+
+ + ♻ ☆ Manifold-Aware Self-Training for Unsupervised Domain Adaptation on + Regressing 6D Object Pose IJCAI 2023 + + +
+ Domain gap between synthetic and real data in visual regression (e.g. 6D pose +estimation) is bridged in this paper via global feature alignment and local +refinement on the coarse classification of discretized anchor classes in target +space, which imposes a piece-wise target manifold regularization into +domain-invariant representation learning. Specifically, our method incorporates +an explicit self-supervised manifold regularization, revealing consistent +cumulative target dependency across domains, to a self-training scheme (e.g. +the popular Self-Paced Self-Training) to encourage more discriminative +transferable representations of regression tasks. Moreover, learning unified +implicit neural functions to estimate relative direction and distance of +targets to their nearest class bins aims to refine target classification +predictions, which can gain robust performance against inconsistent feature +scaling sensitive to UDA regressors. Experiment results on three public +benchmarks of the challenging 6D pose estimation task can verify the +effectiveness of our method, consistently achieving superior performance to the +state-of-the-art for UDA on 6D pose estimation. + +
+
+ comment: Accepted by IJCAI 2023 +
+
+
+
+
+ + ♻ ☆ UniMOS: A Universal Framework For Multi-Organ Segmentation Over + Label-Constrained Datasets + + +
+ Machine learning models for medical images can help physicians diagnose and
+ manage diseases. However, because medical image annotation requires a great
+ deal of manpower and expertise, and because clinical departments annotate
+ images according to their own task orientation, annotated medical images are
+ scarce relative to the large amount of unlabeled data, and many datasets
+ annotate only a single organ. In this paper, we present UniMOS, the first
+ universal framework for utilizing fully labeled, partially labeled, and
+ unlabeled images. Specifically, we construct a Multi-Organ Segmentation (MOS)
+ module over fully/partially labeled data as the base network and design a new
+ target-adaptive loss. Furthermore, we incorporate a semi-supervised training
+ module that combines consistency regularization and pseudo-labeling techniques
+ on unlabeled data, which significantly improves the segmentation of unlabeled
+ data. Experiments show that the framework exhibits excellent performance in
+ several medical image segmentation tasks compared to other advanced methods,
+ while significantly improving data utilization and reducing annotation cost.
+ Code and models are available at: https://github.com/lw8807001/UniMOS.
+
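+ The semi-supervised module combines two standard ingredients, consistency
+ regularisation and pseudo-labelling; the toy sketch below shows how they are
+ typically combined on unlabeled predictions. The confidence threshold and loss
+ weighting are assumptions, not UniMOS's exact formulation.
+ import numpy as np
+
+ def semi_supervised_loss(prob_weak, prob_strong, conf_thr=0.9):
+     """Toy combination of pseudo-labelling and consistency regularisation.
+
+     prob_weak / prob_strong: (N, C) softmax outputs for weakly and strongly
+     augmented views of the same unlabeled voxels (shapes are assumptions).
+     """
+     pseudo = prob_weak.argmax(axis=1)
+     mask = prob_weak.max(axis=1) >= conf_thr             # keep confident voxels only
+     ce = -np.log(prob_strong[np.arange(len(pseudo)), pseudo] + 1e-9)
+     pseudo_loss = (ce * mask).sum() / max(mask.sum(), 1)
+     consistency = ((prob_weak - prob_strong) ** 2).mean()
+     return pseudo_loss + consistency
+
+ pw = np.array([[0.95, 0.05], [0.6, 0.4]])
+ ps = np.array([[0.90, 0.10], [0.55, 0.45]])
+ print(semi_supervised_loss(pw, ps))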
+
+ comment: Accepted by BIBM2023 +
+
+
+
+
+ + ♻ ☆ SAM-CLIP: Merging Vision Foundation Models towards Semantic and Spatial + Understanding + + +
+ The landscape of publicly available vision foundation models (VFMs), such as +CLIP and Segment Anything Model (SAM), is expanding rapidly. VFMs are endowed +with distinct capabilities stemming from their pre-training objectives. For +instance, CLIP excels in semantic understanding, while SAM specializes in +spatial understanding for segmentation. In this work, we introduce a simple +recipe to efficiently merge VFMs into a unified model that absorbs their +expertise. Our method integrates techniques of multi-task learning, continual +learning, and distillation. Further, it demands significantly less +computational cost compared to traditional multi-task training from scratch, +and it only needs a small fraction of the pre-training datasets that were +initially used to train individual models. By applying our method to SAM and +CLIP, we obtain SAM-CLIP: a unified model that combines the capabilities of SAM +and CLIP into a single vision transformer. Compared with deploying SAM and CLIP +independently, our merged model, SAM-CLIP, reduces storage and compute costs +for inference, making it well-suited for edge device applications. We show that +SAM-CLIP not only retains the foundational strengths of SAM and CLIP, but also +introduces synergistic functionalities, notably in zero-shot semantic +segmentation, where SAM-CLIP establishes new state-of-the-art results on 5 +benchmarks. It outperforms previous models that are specifically designed for +this task by a large margin, including +6.8% and +5.9% mean IoU improvement on +Pascal-VOC and COCO-Stuff datasets, respectively. + +
+
+
+
+
+ + ♻ ☆ Diagonal Hierarchical Consistency Learning for Semi-supervised Medical + Image Segmentation + + +
+ Medical image segmentation, which is essential for many clinical +applications, has achieved almost human-level performance via data-driven deep +learning techniques. Nevertheless, its performance is predicated upon the +costly process of manually annotating a vast amount of medical images. To this +end, we propose a novel framework for robust semi-supervised medical image +segmentation using diagonal hierarchical consistency learning (DiHC-Net). +First, it is composed of multiple sub-models with identical multi-scale +architecture but with distinct sub-layers, such as up-sampling and +normalisation layers. Second, along with mutual consistency, a novel diagonal +hierarchical consistency is enforced between one model's intermediate and final +prediction and other models' soft pseudo labels in a diagonal hierarchical +fashion. Experimental results verify the efficacy of our simple framework, +outperforming all previous approaches on public Left Atrium (LA) dataset. + +
+
+ comment: 5 pages, 2 figures, and 2 tables. Corrected typos and errors +
+
+
+
+
+ + ♻ ☆ Finding AI-Generated Faces in the Wild + + +
+ AI-based image generation has continued to rapidly improve, producing +increasingly more realistic images with fewer obvious visual flaws. +AI-generated images are being used to create fake online profiles which in turn +are being used for spam, fraud, and disinformation campaigns. As the general +problem of detecting any type of manipulated or synthesized content is +receiving increasing attention, here we focus on a more narrow task of +distinguishing a real face from an AI-generated face. This is particularly +applicable when tackling inauthentic online accounts with a fake user profile +photo. We show that by focusing on only faces, a more resilient and +general-purpose artifact can be detected that allows for the detection of +AI-generated faces from a variety of GAN- and diffusion-based synthesis +engines, and across image resolutions (as low as 128 x 128 pixels) and +qualities. + +
+
+ comment: Removed anonymization of the LinkedIn platform +
+
+
+
+
+ + ♻ ☆ Learn the Time to Learn: Replay Scheduling in Continual Learning + + +
+ Replay methods are known to be successful at mitigating catastrophic +forgetting in continual learning scenarios despite having limited access to +historical data. However, storing historical data is cheap in many real-world +settings, yet replaying all historical data is often prohibited due to +processing time constraints. In such settings, we propose that continual +learning systems should learn the time to learn and schedule which tasks to +replay at different time steps. We first demonstrate the benefits of our +proposal by using Monte Carlo tree search to find a proper replay schedule, and +show that the found replay schedules can outperform fixed scheduling policies +when combined with various replay methods in different continual learning +settings. Additionally, we propose a framework for learning replay scheduling +policies with reinforcement learning. We show that the learned policies can +generalize better in new continual learning scenarios compared to equally +replaying all seen tasks, without added computational cost. Our study reveals +the importance of learning the time to learn in continual learning, which +brings current research closer to real-world needs. + +
+
+ comment: Published in TMLR (2023) +
+
+
+
+
+ + ♻ ☆ Multimodal Machine Learning in Image-Based and Clinical Biomedicine: + Survey and Prospects + + +
+ Machine learning (ML) applications in medical artificial intelligence (AI) +systems have shifted from traditional and statistical methods to increasing +application of deep learning models. This survey navigates the current +landscape of multimodal ML, focusing on its profound impact on medical image +analysis and clinical decision support systems. Emphasizing challenges and +innovations in addressing multimodal representation, fusion, translation, +alignment, and co-learning, the paper explores the transformative potential of +multimodal models for clinical predictions. It also questions practical +implementation of such models, bringing attention to the dynamics between +decision support systems and healthcare providers. Despite advancements, +challenges such as data biases and the scarcity of "big data" in many +biomedical domains persist. We conclude with a discussion on effective +innovation and collaborative efforts to further the miss + +
+
+
+
+
+ + ♻ ☆ Real-Time Helmet Violation Detection in AI City Challenge 2023 with + Genetic Algorithm-Enhanced YOLOv5 + + +
+ This research focuses on real-time surveillance systems as a means for tackling
+ the issue of non-compliance with helmet regulations, a practice that
+ considerably amplifies the risk for motorcycle drivers or riders. Despite the
+ well-established advantages of helmet usage, achieving widespread compliance
+ remains challenging due to diverse contributing factors. To effectively address
+ this concern, real-time monitoring and enforcement of helmet laws have been
+ proposed as a plausible solution. However, previous attempts at real-time
+ helmet violation detection have been hindered by their limited ability to
+ operate in real-time. To overcome this limitation, the current paper introduces
+ a novel real-time helmet violation detection system that utilizes the YOLOv5
+ single-stage object detection model. This model is trained on the 2023 NVIDIA
+ AI City Challenge Track 5 dataset. The optimal hyperparameters for training the
+ model are determined using genetic algorithms. Additionally, data augmentation
+ and various sampling techniques are implemented to enhance the model's
+ performance. The efficacy of the models is evaluated using precision, recall,
+ and mean Average Precision (mAP) metrics. The results demonstrate impressive
+ precision, recall, and mAP scores of 0.848, 0.599, and 0.641, respectively, for
+ the training data. Furthermore, the model achieves a notable mAP score of
+ 0.6667 on the test dataset, leading to a commendable 4th-place rank on the
+ public leaderboard. This innovative approach represents a notable breakthrough
+ in the field and holds immense potential to substantially enhance motorcycle
+ safety. By enabling real-time monitoring and enforcement capabilities, this
+ system has the capacity to contribute towards increased compliance with helmet
+ laws, thereby effectively reducing the risks faced by motorcycle riders and
+ passengers.
+
+
+
+
+
+ + ♻ ☆ CartiMorph: a framework for automated knee articular cartilage + morphometrics + + +
+ We introduce CartiMorph, a framework for automated knee articular cartilage +morphometrics. It takes an image as input and generates quantitative metrics +for cartilage subregions, including the percentage of full-thickness cartilage +loss (FCL), mean thickness, surface area, and volume. CartiMorph leverages the +power of deep learning models for hierarchical image feature representation. +Deep learning models were trained and validated for tissue segmentation, +template construction, and template-to-image registration. We established +methods for surface-normal-based cartilage thickness mapping, FCL estimation, +and rule-based cartilage parcellation. Our cartilage thickness map showed less +error in thin and peripheral regions. We evaluated the effectiveness of the +adopted segmentation model by comparing the quantitative metrics obtained from +model segmentation and those from manual segmentation. The root-mean-squared +deviation of the FCL measurements was less than 8%, and strong correlations +were observed for the mean thickness (Pearson's correlation coefficient $\rho +\in [0.82,0.97]$), surface area ($\rho \in [0.82,0.98]$) and volume ($\rho \in +[0.89,0.98]$) measurements. We compared our FCL measurements with those from a +previous study and found that our measurements deviated less from the ground +truths. We observed superior performance of the proposed rule-based cartilage +parcellation method compared with the atlas-based approach. CartiMorph has the +potential to promote imaging biomarkers discovery for knee osteoarthritis. + +
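+ As a simplified picture of the FCL metric mentioned above, the percentage of
+ full-thickness cartilage loss can be read off a thickness map restricted to a
+ subregion mask. The zero-thickness criterion and the toy arrays below are
+ assumptions for illustration only, not CartiMorph's estimation procedure.
+ import numpy as np
+
+ def full_thickness_loss_percentage(thickness_map, region_mask, thr_mm=0.0):
+     """Percentage of a cartilage subregion whose thickness is at (or below) the
+     full-thickness-loss threshold. A simplified stand-in for the FCL metric."""
+     region = thickness_map[region_mask]
+     return 100.0 * np.mean(region <= thr_mm)
+
+ thickness = np.array([[2.1, 1.8, 0.0],
+                       [0.0, 1.5, 2.4]])   # thickness in mm (toy values)
+ mask = np.array([[True, True, True],
+                  [True, True, False]])    # voxels belonging to one subregion
+ print(f"FCL = {full_thickness_loss_percentage(thickness, mask):.1f}%")  # 40.0%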
+
+ comment: This preprint is a proofread version of a paper published in Medical
+ Image Analysis (2023), which can be found at
+ https://doi.org/10.1016/j.media.2023.103035
+
+
+
+
+
+
+
+ + Information Retrieval 7 + +
+
+
+ + ☆ Graph Variational Embedding Collaborative Filtering PAKDD2024 + + +
+ The customization of recommended content to users holds significant importance
+ in enhancing user experiences across a wide spectrum of applications such as
+ e-commerce, music, and shopping. Graph-based methods have achieved considerable
+ performance by capturing user-item interactions. However, these methods tend to
+ utilize randomly initialized embeddings for training the recommender, which
+ lack any user preference information. Here, we propose the concept of
+ variational embeddings as a means of pre-training the recommender system to
+ improve feature propagation through the layers of graph convolutional networks
+ (GCNs). Graph variational embedding collaborative filtering (GVECF) is
+ introduced as a novel framework that incorporates representations learned by a
+ variational graph auto-encoder into GCN-based collaborative filtering. This
+ approach effectively transforms latent high-order user-item interactions into
+ more trainable vectors, ultimately resulting in better performance in terms of
+ recall and normalized discounted cumulative gain (NDCG) metrics. The
+ experiments conducted on benchmark datasets demonstrate that our proposed
+ method achieves up to a 13.78% improvement in recall on the test data.
+
+
+ comment: Submitted for PAKDD2024 conference,12 pages +
+
+
+
+
+ + ☆ Control in Hybrid Chatbots + + +
+ Customer data is typically held in database systems, which can be seen as a
+ rule-based knowledge base, whereas businesses increasingly want to benefit from
+ the capabilities of large, pre-trained language models.
+ In this technical report, we describe a case study of how a commercial rule
+ engine and a neural chatbot may be integrated, and what level of control that
+ particular integration mode leads to. We also discuss alternative ways
+ (including past approaches realized in other systems) in which researchers
+ strive to maintain control and avoid what has recently been called model
+ "hallucination".
+
+
+ comment: 12 pages, 3 figures +
+
+
+
+
+ + ☆ Towards Robust Text Retrieval with Progressive Learning + + +
+ Retrieval augmentation has become an effective solution for empowering large
+ language models (LLMs) with external and verified knowledge sources from
+ databases, which overcomes the limitations and hallucinations of LLMs in
+ handling up-to-date and domain-specific information. However, existing
+ embedding models for text retrieval usually have three non-negligible
+ limitations. First, the number and diversity of samples in a batch are too
+ restricted to supervise the modeling of textual nuances at scale. Second, the
+ high proportion of noise is detrimental to the semantic correctness and
+ consistency of embeddings. Third, treating easy and difficult samples equally
+ causes sub-optimal convergence of embeddings with poorer generalization. In
+ this paper, we propose PEG, progressively learned embeddings for robust text
+ retrieval. Specifically, we increase the number of in-batch negative samples to
+ 80,000, and for each query we extract five hard negatives. Concurrently, we
+ incorporate a progressive learning mechanism, enabling the model to dynamically
+ modulate its attention to the samples throughout the entire training process.
+ Additionally, PEG is trained on more than 100 million examples, encompassing a
+ wide range of domains (e.g., finance, medicine, and tourism) and covering
+ various tasks (e.g., question answering, machine reading comprehension, and
+ similarity matching). Extensive experiments conducted on C-MTEB and DuReader
+ demonstrate that PEG surpasses state-of-the-art embeddings in retrieving true
+ positives, highlighting its significant potential for applications in LLMs. Our
+ model is publicly available at https://huggingface.co/TownsWu/PEG.
+
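+ Training recipes of this kind are usually built on an InfoNCE-style objective with
+ in-batch negatives plus mined hard negatives. The sketch below shows that generic
+ objective only; the temperature, shapes, and plain dot-product similarity are
+ assumptions, and PEG's exact recipe may differ.
+ import numpy as np
+
+ def in_batch_contrastive_loss(q, p, hard_neg, temperature=0.05):
+     """InfoNCE with in-batch negatives and per-query hard negatives.
+
+     q, p: (B, D) L2-normalised query / positive-passage embeddings.
+     hard_neg: (B, K, D) L2-normalised hard-negative embeddings per query.
+     """
+     B = q.shape[0]
+     pos_and_inbatch = q @ p.T                        # (B, B): diagonal = positives
+     hard = np.einsum("bd,bkd->bk", q, hard_neg)      # (B, K) hard-negative scores
+     logits = np.concatenate([pos_and_inbatch, hard], axis=1) / temperature
+     logits -= logits.max(axis=1, keepdims=True)
+     log_prob = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
+     return -log_prob[np.arange(B), np.arange(B)].mean()
+
+ def normalize(x):
+     return x / np.linalg.norm(x, axis=-1, keepdims=True)
+
+ q = normalize(np.random.randn(4, 8))
+ p = normalize(q + 0.1 * np.random.randn(4, 8))
+ hn = normalize(np.random.randn(4, 5, 8))
+ print(in_batch_contrastive_loss(q, p, hn))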
+
+
+
+
+ + ☆ Conditional Modeling Based Automatic Video Summarization + + +
+ The aim of video summarization is to shorten videos automatically while +retaining the key information necessary to convey the overall story. Video +summarization methods mainly rely on visual factors, such as visual +consecutiveness and diversity, which may not be sufficient to fully understand +the content of the video. There are other non-visual factors, such as +interestingness, representativeness, and storyline consistency that should also +be considered for generating high-quality video summaries. Current methods do +not adequately take into account these non-visual factors, resulting in +suboptimal performance. In this work, a new approach to video summarization is +proposed based on insights gained from how humans create ground truth video +summaries. The method utilizes a conditional modeling perspective and +introduces multiple meaningful random variables and joint distributions to +characterize the key components of video summarization. Helper distributions +are employed to improve the training of the model. A conditional attention +module is designed to mitigate potential performance degradation in the +presence of multi-modal input. The proposed video summarization method +incorporates the above innovative design choices that aim to narrow the gap +between human-generated and machine-generated video summaries. Extensive +experiments show that the proposed approach outperforms existing methods and +achieves state-of-the-art performance on commonly used video summarization +datasets. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + arXiv admin note: substantial text overlap with arXiv:2305.00455 +
+
+
+
+
+ + ☆ Multi-view Graph Convolution for Participant Recommendation + + +
+ Social networks have become essential for people's lives. The proliferation +of web services further expands social networks at an unprecedented scale, +leading to immeasurable commercial value for online platforms. Recently, the +group buying (GB) business mode is prevalent and also becoming more popular in +E-commerce. GB explicitly forms groups of users with similar interests to +secure better discounts from the merchants, often operating within social +networks. It is a novel way to further unlock the commercial value by +explicitly utilizing the online social network in E-commerce. Participant +recommendation, a fundamental problem emerging together with GB, aims to find +the participants for a launched group buying process with an initiator and a +target item to increase the GB success rate. This paper proposes Multi-View +Graph Convolution for Participant Recommendation (MVPRec) to tackle this +problem. To differentiate the roles of users (Initiator/Participant) within the +GB process, we explicitly reconstruct historical GB data into initiator-view +and participant-view graphs. Together with the social graph, we obtain a +multi-view user representation with graph encoders. Then MVPRec fuses the GB +and social representation with an attention module to obtain the user +representation and learns a matching score with the initiator's social friends +via a multi-head attention mechanism. Social friends with the Top-k matching +score are recommended for the corresponding GB process. Experiments on three +datasets justify the effectiveness of MVPRec in the emerging participant +recommendation problem. + +
+
+ comment: 10 pages, 5 figures, 2023 IEEE International Conference on Big Data +
+
+
+
+
+ + ☆ App for Resume-Based Job Matching with Speech Interviews and Grammar + Analysis: A Review + + +
+ Through the advancement in natural language processing (NLP), specifically in +speech recognition, fully automated complex systems functioning on voice input +have started proliferating in areas such as home automation. These systems have +been termed Automatic Speech Recognition Systems (ASR). In this review paper, +we explore the feasibility of an end-to-end system providing speech and text +based natural language processing for job interview preparation as well as +recommendation of relevant job postings. We also explore existing +recommender-based systems and note their limitations. This literature review +would help us identify the approaches and limitations of the various similar +use-cases of NLP technology for our upcoming project. + +
+
+ comment: 4 pages, 2 figures, literature review +
+
+
+
+
+ + ♻ ☆ Inverse Learning with Extremely Sparse Feedback for Recommendation WSDM 2024 + + +
+ Modern personalized recommendation services often rely on user feedback, either
+ explicit or implicit, to improve the quality of services. Explicit feedback
+ refers to behaviors like ratings, while implicit feedback refers to behaviors
+ like user clicks. However, in the scenario of full-screen video viewing
+ experiences such as TikTok and Reels, the click action is absent, resulting in
+ unclear feedback from users and hence introducing noise into model training.
+ Existing approaches to de-noising recommendation mainly focus on positive
+ instances while ignoring the noise in the large amount of sampled negative
+ feedback. In this paper, we propose a meta-learning method to annotate the
+ unlabeled data from loss and gradient perspectives, which considers the noise
+ in both positive and negative instances. Specifically, we first propose an
+ Inverse Dual Loss (IDL) to boost learning from true labels and prevent learning
+ from false labels. We then further propose an Inverse Gradient (IG) method to
+ explore the correct updating gradient and adjust the update based on
+ meta-learning. Finally, we conduct extensive experiments on both benchmark and
+ industrial datasets, where our proposed method significantly improves AUC by
+ 9.25% against state-of-the-art methods. Further analysis verifies that the
+ proposed inverse learning framework is model-agnostic and can improve a variety
+ of recommendation backbones. The source code, along with the best
+ hyper-parameter settings, is available at this link:
+ https://github.com/Guanyu-Lin/InverseLearning.
+
+
+ comment: WSDM 2024 +
+
+
+
+
+
+
+
+ + Machine Learning 122 + +
+
+
+ + ☆ Hourglass Tokenizer for Efficient Transformer-Based 3D Human Pose + Estimation + + +
+ Transformers have been successfully applied in the field of video-based 3D +human pose estimation. However, the high computational costs of these video +pose transformers (VPTs) make them impractical on resource-constrained devices. +In this paper, we present a plug-and-play pruning-and-recovering framework, +called Hourglass Tokenizer (HoT), for efficient transformer-based 3D human pose +estimation from videos. Our HoT begins with pruning pose tokens of redundant +frames and ends with recovering full-length tokens, resulting in a few pose +tokens in the intermediate transformer blocks and thus improving the model +efficiency. To effectively achieve this, we propose a token pruning cluster +(TPC) that dynamically selects a few representative tokens with high semantic +diversity while eliminating the redundancy of video frames. In addition, we +develop a token recovering attention (TRA) to restore the detailed +spatio-temporal information based on the selected tokens, thereby expanding the +network output to the original full-length temporal resolution for fast +inference. Extensive experiments on two benchmark datasets (i.e., Human3.6M and +MPI-INF-3DHP) demonstrate that our method can achieve both high efficiency and +estimation accuracy compared to the original VPT models. For instance, applying +to MotionBERT and MixSTE on Human3.6M, our HoT can save nearly 50% FLOPs +without sacrificing accuracy and nearly 40% FLOPs with only 0.2% accuracy drop, +respectively. Our source code will be open-sourced. + +
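+ A toy sketch of the prune-then-recover idea described above: a handful of
+ representative pose tokens is kept in the middle of the network (here picked with a
+ tiny k-means, standing in for the token pruning cluster) and the full temporal length
+ is restored at the end with a single cross-attention (standing in for the token
+ recovering attention). Shapes and the selection procedure are illustrative
+ assumptions, not the HoT implementation.
+ import numpy as np
+
+ def prune_tokens(tokens, k, iters=10):
+     """Pick k representative tokens with a tiny k-means over token features."""
+     T = tokens.shape[0]
+     centers = tokens[np.linspace(0, T - 1, k).astype(int)].copy()
+     for _ in range(iters):
+         d = ((tokens[:, None, :] - centers[None]) ** 2).sum(-1)   # (T, k)
+         assign = d.argmin(1)
+         for j in range(k):
+             if np.any(assign == j):
+                 centers[j] = tokens[assign == j].mean(0)
+     # for each cluster, return the real token closest to its center
+     idx = np.array([np.argmin(((tokens - c) ** 2).sum(-1)) for c in centers])
+     return tokens[idx]
+
+ def recover_tokens(full_queries, kept_tokens, scale):
+     """Token-recovering attention: expand the few kept tokens back to full length."""
+     attn = full_queries @ kept_tokens.T / scale
+     attn = np.exp(attn - attn.max(1, keepdims=True))
+     attn /= attn.sum(1, keepdims=True)
+     return attn @ kept_tokens
+
+ T, C, k = 81, 32, 8
+ tokens = np.random.randn(T, C)
+ kept = prune_tokens(tokens, k)                    # (8, 32) tokens kept mid-network
+ recovered = recover_tokens(tokens, kept, np.sqrt(C))
+ print(kept.shape, recovered.shape)                # (8, 32) (81, 32)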
+
+
+
+
+ + ☆ LQ-LoRA: Low-rank Plus Quantized Matrix Decomposition for Efficient + Language Model Finetuning + + +
+ We propose a simple approach for memory-efficient adaptation of pretrained +language models. Our approach uses an iterative algorithm to decompose each +pretrained matrix into a high-precision low-rank component and a +memory-efficient quantized component. During finetuning, the quantized +component remains fixed and only the low-rank component is updated. We present +an integer linear programming formulation of the quantization component which +enables dynamic configuration of quantization parameters (e.g., bit-width, +block size) for each matrix given an overall target memory budget. We further +explore a data-aware version of the algorithm which uses an approximation of +the Fisher information matrix to weight the reconstruction objective during +matrix decomposition. Experiments on adapting RoBERTa and LLaMA-2 (7B and 70B) +demonstrate that our low-rank plus quantized matrix decomposition approach +(LQ-LoRA) outperforms strong QLoRA and GPTQ-LoRA baselines and moreover enables +more aggressive quantization. For example, on the OpenAssistant benchmark +LQ-LoRA is able to learn a 2.5-bit LLaMA-2 model that is competitive with a +model finetuned with 4-bit QLoRA. When finetuned on a language modeling +calibration dataset, LQ-LoRA can also be used for model compression; in this +setting our 2.75-bit LLaMA-2-70B model (which has 2.85 bits on average when +including the low-rank components and requires 27GB of GPU memory) is +competitive with the original model in full precision. + +
+
+
+
+
+ + ☆ Risk-averse Batch Active Inverse Reward Design + + +
+ Designing a perfect reward function that depicts all the aspects of the +intended behavior is almost impossible, especially generalizing it outside of +the training environments. Active Inverse Reward Design (AIRD) proposed the use +of a series of queries, comparing possible reward functions in a single +training environment. This allows the human to give information to the agent +about suboptimal behaviors, in order to compute a probability distribution over +the intended reward function. However, it ignores the possibility of unknown +features appearing in real-world environments, and the safety measures needed +until the agent completely learns the reward function. I improved this method +and created Risk-averse Batch Active Inverse Reward Design (RBAIRD), which +constructs batches, sets of environments the agent encounters when being used +in the real world, processes them sequentially, and, for a predetermined number +of iterations, asks queries that the human needs to answer for each environment +of the batch. After this process is completed in one batch, the probabilities +have been improved and are transferred to the next batch. This makes it capable +of adapting to real-world scenarios and learning how to treat unknown features +it encounters for the first time. I also integrated a risk-averse planner, +similar to that of Inverse Reward Design (IRD), which samples a set of reward +functions from the probability distribution and computes a trajectory that +takes the most certain rewards possible. This ensures safety while the agent is +still learning the reward function, and enables the use of this approach in +situations where cautiousness is vital. RBAIRD outperformed the previous +approaches in terms of efficiency, accuracy, and action certainty, demonstrated +quick adaptability to new, unknown features, and can be more widely used for +the alignment of crucial, powerful AI models. + +
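+ The risk-averse planning step can be pictured as scoring each candidate trajectory
+ under many reward functions sampled from the posterior and preferring the one with
+ the best low-quantile return. The toy posterior, feature counts, and 5th-percentile
+ criterion below are assumptions for illustration, not the RBAIRD planner.
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+
+ # Posterior over reward weights (toy 3-feature example) and candidate trajectories
+ # described by their feature counts; all numbers are illustrative.
+ weight_samples = rng.normal(loc=[1.0, 0.5, 0.4], scale=[0.1, 0.1, 1.0], size=(200, 3))
+ trajectories = {
+     "safe":  np.array([3.0, 1.0, 0.0]),   # avoids the feature with uncertain reward
+     "risky": np.array([3.0, 1.0, 2.0]),   # exploits the uncertain feature
+ }
+
+ for name, feats in trajectories.items():
+     returns = weight_samples @ feats
+     low_quantile = np.quantile(returns, 0.05)     # risk-averse score (5th percentile)
+     print(f"{name}: mean={returns.mean():.2f}, 5th percentile={low_quantile:.2f}")
+ # A risk-averse planner picks the trajectory with the highest low-quantile return,
+ # which here is the one avoiding the poorly understood feature, even though the
+ # risky trajectory has the higher mean return.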
+
+ comment: 14 pages, 12 figures +
+
+
+
+
+ + ☆ BrainWash: A Poisoning Attack to Forget in Continual Learning + + +
+ Continual learning has gained substantial attention within the deep learning +community, offering promising solutions to the challenging problem of +sequential learning. Yet, a largely unexplored facet of this paradigm is its +susceptibility to adversarial attacks, especially with the aim of inducing +forgetting. In this paper, we introduce "BrainWash," a novel data poisoning +method tailored to impose forgetting on a continual learner. By adding the +BrainWash noise to a variety of baselines, we demonstrate how a trained +continual learner can be induced to forget its previously learned tasks +catastrophically, even when using these continual learning baselines. An +important feature of our approach is that the attacker requires no access to +previous tasks' data and is armed merely with the model's current parameters +and the data belonging to the most recent task. Our extensive experiments +highlight the efficacy of BrainWash, showcasing degradation in performance +across various regularization-based continual learning methods. + +
+
+
+
+
+ + ☆ Exploring Lip Segmentation Techniques in Computer Vision: A Comparative + Analysis + + +
+ Lip segmentation is crucial in computer vision, especially for lip reading.
+ Despite extensive face segmentation research, lip segmentation has received
+ limited attention. The aim of this study is to compare state-of-the-art lip
+ segmentation models using a standardized setting and a publicly available
+ dataset. Five techniques, namely EHANet, Mask2Former, BiSeNet V2, PIDNet, and
+ STDC1, are qualitatively selected based on their reported performance,
+ inference time, code availability, recency, and popularity. The CelebAMask-HQ
+ dataset, comprising manually annotated face images, is used to fairly assess
+ the lip segmentation performance of the selected models. Inference experiments
+ are conducted on a Raspberry Pi 4 to emulate limited computational resources.
+ The results show that Mask2Former and EHANet have the best performance in terms
+ of mIoU score. BiSeNet V2 demonstrates competitive performance, while PIDNet
+ excels in recall but has lower precision. Most models have inference times
+ ranging from 1000 to around 3000 milliseconds on a Raspberry Pi 4, with PIDNet
+ having the lowest mean inference time. This study provides a comprehensive
+ evaluation of lip segmentation models, highlighting their performance and
+ inference times. The findings contribute to the development of lightweight
+ techniques and establish benchmarks for future advances in lip segmentation,
+ especially in IoT and edge computing scenarios.
+
+
+
+
+
+ + ☆ Machine-Learned Atomic Cluster Expansion Potentials for Fast and + Quantum-Accurate Thermal Simulations of Wurtzite AlN + + +
+ Using the atomic cluster expansion (ACE) framework, we develop a machine +learning interatomic potential for fast and accurately modelling the phonon +transport properties of wurtzite aluminum nitride. The predictive power of the +ACE potential against density functional theory (DFT) is demonstrated across a +broad range of properties of w-AlN, including ground-state lattice parameters, +specific heat capacity, coefficients of thermal expansion, bulk modulus, and +harmonic phonon dispersions. Validation of lattice thermal conductivity is +further carried out by comparing the ACE-predicted values to the DFT +calculations and experiments, exhibiting the overall capability of our ACE +potential in sufficiently describing anharmonic phonon interactions. As a +practical application, we perform a lattice dynamics analysis using the +potential to unravel the effects of biaxial strains on thermal conductivity and +phonon properties of w-AlN, which is identified as a significant tuning factor +for near-junction thermal design of w-AlN-based electronics. + +
+
+
+
+
+ + ☆ Leveraging Previous Facial Action Units Knowledge for Emotion + Recognition on Faces + + +
+ People naturally understand emotions, so enabling a machine to do the same
+ could open new paths for human-computer interaction. Facial expressions can be
+ very useful for emotion recognition techniques, as they are the biggest
+ transmitters of non-verbal cues that can be correlated with emotions. Several
+ techniques use Convolutional Neural Networks (CNNs) to extract information in a
+ machine learning process. However, simple CNNs are not always sufficient to
+ locate points of interest on the face that can be correlated with emotions. In
+ this work, we aim to expand the capacity of emotion recognition techniques by
+ proposing the use of Facial Action Unit (AU) recognition to recognize emotions.
+ This recognition is based on the Facial Action Coding System (FACS) and
+ computed by a machine learning system. In particular, our method builds on
+ EmotiRAM, an approach for multi-cue emotion recognition, improving its facial
+ encoding module.
+
+
+
+
+
+ + ☆ Evaluating Supervision Levels Trade-Offs for Infrared-Based People + Counting WACV + + +
+ Object detection models are commonly used for people counting (and +localization) in many applications but require a dataset with costly bounding +box annotations for training. Given the importance of privacy in people +counting, these models rely more and more on infrared images, making the task +even harder. In this paper, we explore how weaker levels of supervision can +affect the performance of deep person counting architectures for image +classification and point-level localization. Our experiments indicate that +counting people using a CNN Image-Level model achieves competitive results with +YOLO detectors and point-level models, yet provides a higher frame rate and a +similar amount of model parameters. + +
+
+ comment: Accepted in IEEE/CVF Winter Conference on Applications of Computer + Vision (WACV) 2024 +
+
+
+
+
+ + ☆ Adaptive Training Distributions with Scalable Online Bilevel + Optimization + + +
+ Large neural networks pretrained on web-scale corpora are central to modern +machine learning. In this paradigm, the distribution of the large, +heterogeneous pretraining data rarely matches that of the application domain. +This work considers modifying the pretraining distribution in the case where +one has a small sample of data reflecting the targeted test conditions. We +propose an algorithm motivated by a recent formulation of this setting as an +online, bilevel optimization problem. With scalability in mind, our algorithm +prioritizes computing gradients at training points which are likely to most +improve the loss on the targeted distribution. Empirically, we show that in +some cases this approach is beneficial over existing strategies from the domain +adaptation literature but may not succeed in other cases. We propose a simple +test to evaluate when our approach can be expected to work well and point +towards further research to address current limitations. + +
+
+
+
+
+ + ☆ Provably Efficient CVaR RL in Low-rank MDPs + + +
+ We study risk-sensitive Reinforcement Learning (RL), where we aim to maximize +the Conditional Value at Risk (CVaR) with a fixed risk tolerance $\tau$. Prior +theoretical work studying risk-sensitive RL focuses on the tabular Markov +Decision Processes (MDPs) setting. To extend CVaR RL to settings where the state +space is large, function approximation must be deployed. We study CVaR RL in +low-rank MDPs with nonlinear function approximation. Low-rank MDPs assume the +underlying transition kernel admits a low-rank decomposition, but unlike prior +linear models, low-rank MDPs do not assume the feature or state-action +representation is known. We propose a novel Upper Confidence Bound (UCB) +bonus-driven algorithm to carefully balance the interplay between exploration, +exploitation, and representation learning in CVaR RL. We prove that our +algorithm achieves a sample complexity of $\tilde{O}\left(\frac{H^7 A^2 +d^4}{\tau^2 \epsilon^2}\right)$ to yield an $\epsilon$-optimal CVaR, where $H$ +is the length of each episode, $A$ is the capacity of the action space, and $d$ is +the dimension of representations. Computation-wise, we design a novel +discretized Least-Squares Value Iteration (LSVI) algorithm for the CVaR +objective as the planning oracle and show that we can find the near-optimal +policy in polynomial running time with a Maximum Likelihood Estimation +oracle. To our knowledge, this is the first provably efficient CVaR RL +algorithm in low-rank MDPs. + +
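+ To make the optimization target concrete (this is only the quantity being maximized, not the paper's UCB algorithm): the CVaR at tolerance tau is the expected return over the worst tau-fraction of outcomes. A minimal numpy sketch:
+import numpy as np
+
+def cvar(returns, tau):
+    # Conditional Value at Risk at tolerance tau: the mean of the worst
+    # tau-fraction of sampled returns (lower tail).
+    returns = np.sort(np.asarray(returns))        # ascending: worst first
+    k = max(1, int(np.ceil(tau * len(returns))))  # size of the lower tail
+    return returns[:k].mean()
+
+rng = np.random.default_rng(0)
+episode_returns = rng.normal(loc=1.0, scale=2.0, size=10_000)
+print(cvar(episode_returns, tau=0.05))  # well below the mean return of ~1.0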
+
+ comment: The first three authors contribute equally and are ordered randomly +
+
+
+
+
+ + ☆ What Can AutoML Do For Continual Learning? + + +
+ This position paper outlines the potential of AutoML for incremental +(continual) learning to encourage more research in this direction. Incremental +learning involves incorporating new data from a stream of tasks and +distributions to learn enhanced deep representations and adapt better to new +tasks. However, a significant limitation of incremental learners is that most +current techniques freeze the backbone architecture, hyperparameters, and the +order & structure of the learning tasks throughout the learning and adaptation +process. We strongly believe that AutoML offers promising solutions to address +these limitations, enabling incremental learning to adapt to more diverse +real-world tasks. Therefore, instead of directly proposing a new method, this +paper takes a step back by posing the question: "What can AutoML do for +incremental learning?" We outline three key areas of research that can +contribute to making incremental learners more dynamic, highlighting concrete +opportunities to apply AutoML methods in novel ways as well as entirely new +challenges for AutoML research. + +
+
+
+
+
+ + ☆ NNG-Mix: Improving Semi-supervised Anomaly Detection with Pseudo-anomaly + Generation + + +
+ Anomaly detection (AD) is essential in identifying rare and often critical +events in complex systems, finding applications in fields such as network +intrusion detection, financial fraud detection, and fault detection in +infrastructure and industrial systems. While AD is typically treated as an +unsupervised learning task due to the high cost of label annotation, it is more +practical to assume access to a small set of labeled anomaly samples from +domain experts, as is the case for semi-supervised anomaly detection. +Semi-supervised and supervised approaches can leverage such labeled data, +resulting in improved performance. In this paper, rather than proposing a new +semi-supervised or supervised approach for AD, we introduce a novel algorithm +for generating additional pseudo-anomalies on the basis of the limited labeled +anomalies and a large volume of unlabeled data. This serves as an augmentation +to facilitate the detection of new anomalies. Our proposed algorithm, named +Nearest Neighbor Gaussian Mixup (NNG-Mix), efficiently integrates information +from both labeled and unlabeled data to generate pseudo-anomalies. We compare +the performance of this novel algorithm with commonly applied augmentation +techniques, such as Mixup and Cutout. We evaluate NNG-Mix by training various +existing semi-supervised and supervised anomaly detection algorithms on the +original training data along with the generated pseudo-anomalies. Through +extensive experiments on 57 benchmark datasets in ADBench, reflecting different +data types, we demonstrate that NNG-Mix outperforms other data augmentation +methods. It yields significant performance improvements compared to the +baselines trained exclusively on the original training data. Notably, NNG-Mix +yields up to 16.4%, 8.8%, and 8.0% improvements on Classical, CV, and NLP +datasets in ADBench. Our source code will be available at +https://github.com/donghao51/NNG-Mix. + +
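+ The abstract names the algorithm but not its exact recipe; the sketch below shows one plausible reading of Nearest Neighbor Gaussian Mixup (mix each labeled anomaly with one of its nearest unlabeled neighbors and add Gaussian noise). The function name, parameters, and mixing scheme are assumptions, not the authors' reference implementation:
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+def nng_mix(anomalies, unlabeled, k=5, alpha=0.5, sigma=0.05, n_out=100, seed=0):
+    # Generate pseudo-anomalies by mixing each labeled anomaly with one of its
+    # k nearest unlabeled neighbors and adding Gaussian noise (illustrative).
+    rng = np.random.default_rng(seed)
+    nn = NearestNeighbors(n_neighbors=k).fit(unlabeled)
+    _, idx = nn.kneighbors(anomalies)                 # (n_anomalies, k)
+    out = []
+    for _ in range(n_out):
+        i = rng.integers(len(anomalies))              # pick a labeled anomaly
+        j = idx[i, rng.integers(k)]                   # one of its neighbors
+        lam = rng.uniform(0.0, alpha)                 # mixup coefficient
+        x = (1 - lam) * anomalies[i] + lam * unlabeled[j]
+        out.append(x + rng.normal(0.0, sigma, size=x.shape))
+    return np.vstack(out)
+
+# Usage sketch: augmented = np.vstack([X_anom, nng_mix(X_anom, X_unlabeled)])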
+
+
+
+
+ + ☆ Correlated Attention in Transformers for Multivariate Time Series + + +
+ Multivariate time series (MTS) analysis prevails in real-world applications +such as finance, climate science and healthcare. The various self-attention +mechanisms, the backbone of state-of-the-art Transformer-based models, +efficiently discover temporal dependencies, yet cannot adequately capture the +intricate cross-correlation between different features of MTS data, which +inherently stems from complex dynamical systems in practice. To this end, we +propose a novel correlated attention mechanism, which not only efficiently +captures feature-wise dependencies, but can also be seamlessly integrated +within the encoder blocks of existing well-known Transformers to gain +efficiency improvements. In particular, correlated attention operates across +feature channels to compute cross-covariance matrices between queries and keys +with different lag values, and selectively aggregates representations at the +sub-series level. This architecture facilitates automated discovery and +representation learning of not only instantaneous but also lagged +cross-correlations, while inherently capturing time series auto-correlation. +When combined with prevalent Transformer baselines, the correlated attention +mechanism constitutes a better alternative for encoder-only architectures, +which are suitable for a wide range of tasks including imputation, anomaly +detection and classification. Extensive experiments on the aforementioned tasks +consistently underscore the advantages of the correlated attention mechanism in +enhancing base Transformer models, and demonstrate our state-of-the-art results +in imputation, anomaly detection and classification. + +
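+ As a rough illustration of the lagged cross-covariance step described above (the sub-series aggregation and the integration into Transformer encoder blocks are omitted, and the shapes and names below are my own assumptions):
+import numpy as np
+
+def lagged_cross_covariance(Q, K, lags=(0, 1, 2)):
+    # Q, K: (T, d) query/key activations of one head over T time steps.
+    # Returns a (len(lags), d, d) stack of feature-by-feature cross-covariance
+    # matrices between Q and K shifted by each lag (illustrative only).
+    T, d = Q.shape
+    Qc = Q - Q.mean(axis=0, keepdims=True)
+    Kc = K - K.mean(axis=0, keepdims=True)
+    covs = []
+    for lag in lags:
+        n = T - lag
+        covs.append(Qc[lag:].T @ Kc[:n] / max(n, 1))  # (d, d) at this lag
+    return np.stack(covs)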
+
+
+
+
+ + ☆ Estimation of entropy-regularized optimal transport maps between + non-compactly supported measures + + +
+ This paper addresses the problem of estimating entropy-regularized optimal +transport (EOT) maps with squared-Euclidean cost between source and target +measures that are subGaussian. In the case that the target measure is compactly +supported or strongly log-concave, we show that for a recently proposed +in-sample estimator, the expected squared $L^2$-error decays at least as fast +as $O(n^{-1/3})$ where $n$ is the sample size. For the general subGaussian case +we show that the expected $L^1$-error decays at least as fast as $O(n^{-1/6})$, +and in both cases we have polynomial dependence on the regularization +parameter. While these results are suboptimal compared to known results in the +case of compactness of both the source and target measures (squared $L^2$-error +converging at a rate $O(n^{-1})$) and when the source is subGaussian while +the target is compactly supported (squared $L^2$-error converging at a rate +$O(n^{-1/2})$), their importance lies in eliminating the compact support +requirements. The proof technique makes use of a bias-variance decomposition +where the variance is controlled using standard concentration of measure +results and the bias is handled by T1-transport inequalities along with sample +complexity results for estimation of the EOT cost under subGaussian assumptions. Our +experimental results point to a looseness in controlling the variance terms and +we conclude by posing several open problems. + +
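+ For readers unfamiliar with the estimator being analyzed, the in-sample entropic map is typically built from a Sinkhorn plan followed by a barycentric projection; a compact numpy sketch under the assumption of uniform empirical measures and a fixed iteration count:
+import numpy as np
+
+def entropic_ot_map(X, Y, eps=0.5, n_iter=500):
+    # Entropic OT map estimate between empirical measures: Sinkhorn iterations
+    # on the squared-Euclidean cost, then the barycentric projection
+    # T(x_i) = sum_j P_ij y_j / sum_j P_ij. Illustrative sketch only.
+    n, m = len(X), len(Y)
+    C = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(-1)
+    C = C / C.mean()                        # rescale cost to keep exp stable
+    K = np.exp(-C / eps)
+    a, b = np.full(n, 1.0 / n), np.full(m, 1.0 / m)
+    u, v = np.ones(n), np.ones(m)
+    for _ in range(n_iter):                 # Sinkhorn fixed-point updates
+        u = a / (K @ v)
+        v = b / (K.T @ u)
+    P = u[:, None] * K * v[None, :]         # entropic transport plan
+    return (P @ Y) / P.sum(axis=1, keepdims=True)   # map evaluated at each x_i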
+
+ comment: 30 pages, 7 figures +
+
+
+
+
+ + ☆ Ovarian Cancer Data Analysis using Deep Learning: A Systematic Review + from the Perspectives of Key Features of Data Analysis and AI Assurance + + +
+ Background and objectives: Machine or Deep +Learning (ML/DL)-based autonomous data analysis tools can assist clinicians and +cancer researchers in discovering patterns and relationships in complex data +sets. Many DL-based analyses on ovarian cancer (OC) data have recently been +published. These analyses are highly diverse in various aspects of cancer +(e.g., subdomain(s) and cancer type they address) and data analysis features. +However, a comprehensive understanding of these analyses in terms of these +features and AI assurance (AIA) is currently lacking. This systematic review +aims to fill this gap by examining the existing literature and identifying +important aspects of OC data analysis using DL, explicitly focusing on the key +features and AI assurance perspectives. Methods: The PRISMA framework was used +to conduct comprehensive searches in three journal databases. Only studies +published between 2015 and 2023 in peer-reviewed journals were included in the +analysis. Results: In the review, a total of 96 DL-driven analyses were +examined. The findings reveal several important insights regarding DL-driven +ovarian cancer data analysis: - Most studies (71%, 68 out of 96) focused on +detection and diagnosis, while no study addressed the prediction and prevention +of OC. - The analyses were predominantly based on samples from a non-diverse +population (75%, 72/96 studies), limited to a single geographic location or country. +- Only a small proportion of studies (33%, 32/96) performed integrated +analyses, most of which used homogeneous data (clinical or omics). - Notably, a +mere 8.3% (8/96) of the studies validated their models using external and +diverse data sets, highlighting the need for enhanced model validation, and - +The inclusion of AIA in cancer data analysis is in a very early stage; only +2.1% (2/96) explicitly addressed AIA through explainability. + +
+
+
+
+
+ + ☆ Deep Calibration of Market Simulations using Neural Density Estimators + and Embedding Networks + + +
+ The ability to construct a realistic simulator of financial exchanges, +including reproducing the dynamics of the limit order book, can give insight +into many counterfactual scenarios, such as a flash crash, a margin call, or +changes in macroeconomic outlook. In recent years, agent-based models have been +developed that reproduce many features of an exchange, as summarised by a set +of stylised facts and statistics. However, the ability to calibrate simulators +to a specific period of trading remains an open challenge. In this work, we +develop a novel approach to the calibration of market simulators by leveraging +recent advances in deep learning, specifically using neural density estimators +and embedding networks. We demonstrate that our approach is able to correctly +identify high probability parameter sets, both when applied to synthetic and +historical data, and without reliance on manually selected or weighted +ensembles of stylised facts. + +
+
+ comment: 4th ACM International Conference on AI in Finance (ICAIF 2023) +
+
+
+
+
+ + ☆ Certification of Distributional Individual Fairness + + +
+ Providing formal guarantees of algorithmic fairness is of paramount +importance to socially responsible deployment of machine learning algorithms. +In this work, we study formal guarantees, i.e., certificates, for individual +fairness (IF) of neural networks. We start by introducing a novel convex +approximation of IF constraints that exponentially decreases the computational +cost of providing formal guarantees of local individual fairness. We highlight +that prior methods are constrained by their focus on global IF certification +and can therefore only scale to models with a few dozen hidden neurons, thus +limiting their practical impact. We propose to certify distributional +individual fairness which ensures that for a given empirical distribution and +all distributions within a $\gamma$-Wasserstein ball, the neural network has +guaranteed individually fair predictions. Leveraging developments in +quasi-convex optimization, we provide novel and efficient certified bounds on +distributional individual fairness and show that our method allows us to +certify and regularize neural networks that are several orders of magnitude +larger than those considered by prior works. Moreover, we study real-world +distribution shifts and find our bounds to be a scalable, practical, and sound +source of IF guarantees. + +
+
+ comment: 21 Pages, Neural Information Processing Systems 2023 +
+
+
+
+
+ + ☆ Continual Learning: Applications and the Road Forward + + +
+ Continual learning is a sub-field of machine learning, which aims to allow +machine learning models to continuously learn on new data, by accumulating +knowledge without forgetting what was learned in the past. In this work, we +take a step back, and ask: "Why should one care about continual learning in the +first place?". We set the stage by surveying recent continual learning papers +published at three major machine learning conferences, and show that +memory-constrained settings dominate the field. Then, we discuss five open +problems in machine learning, and even though they seem unrelated to continual +learning at first sight, we show that continual learning will inevitably be +part of their solution. These problems are model-editing, personalization, +on-device learning, faster (re-)training and reinforcement learning. Finally, +by comparing the desiderata from these unsolved problems and the current +assumptions in continual learning, we highlight and discuss four future +directions for continual learning research. We hope that this work offers an +interesting perspective on the future of continual learning, while displaying +its potential value and the paths we have to pursue in order to make it +successful. This work is the result of the many discussions the authors had at +the Dagstuhl seminar on Deep Continual Learning, in March 2023. + +
+
+
+
+
+ + ☆ Real-Time Surface-to-Air Missile Engagement Zone Prediction Using + Simulation and Machine Learning + + +
+ Surface-to-Air Missiles (SAMs) are crucial in modern air defense systems. A +critical aspect of their effectiveness is the Engagement Zone (EZ), the spatial +region within which a SAM can effectively engage and neutralize a target. +Notably, the EZ is intrinsically related to the missile's maximum range; it +defines the furthest distance at which a missile can intercept a target. The +accurate computation of this EZ is essential but challenging due to the dynamic +and complex factors involved, which often lead to high computational costs and +extended processing times when using conventional simulation methods. In light +of these challenges, our study investigates the potential of machine learning +techniques, proposing an approach that integrates machine learning with a +custom-designed simulation tool to train supervised algorithms. We leverage a +comprehensive dataset of pre-computed SAM EZ simulations, enabling our model to +accurately predict the SAM EZ for new input parameters. It accelerates SAM EZ +simulations, enhances air defense strategic planning, and provides real-time +insights, improving SAM system performance. The study also includes a +comparative analysis of machine learning algorithms, illuminating their +capabilities and performance metrics and suggesting areas for future research, +highlighting the transformative potential of machine learning in SAM EZ +simulations. + +
+
+
+
+
+ + ☆ LLMs as Visual Explainers: Advancing Image Classification with Evolving + Visual Descriptions + + +
+ Vision-language models (VLMs) offer a promising paradigm for image +classification by comparing the similarity between images and class embeddings. +A critical challenge lies in crafting precise textual representations for class +names. While previous studies have leveraged recent advancements in large +language models (LLMs) to enhance these descriptors, their outputs often suffer +from ambiguity and inaccuracy. We identify two primary causes: 1) The prevalent +reliance on textual interactions with LLMs, leading to a mismatch between the +generated text and the visual content in VLMs' latent space - a phenomenon we +term the "explain without seeing" dilemma. 2) The oversight of the inter-class +relationships, resulting in descriptors that fail to differentiate similar +classes effectively. To address these issues, we propose a novel image +classification framework combining VLMs with LLMs, named Iterative Optimization +with Visual Feedback. In particular, our method develops an LLM-based agent, +employing an evolutionary optimization strategy to refine class descriptors. +Crucially, we incorporate visual feedback from VLM classification metrics, +thereby guiding the optimization process with concrete visual data. Our method +improves accuracy on a wide range of image classification benchmarks, +with 3.47\% average gains over state-of-the-art methods. We also highlight that the +resulting descriptions serve as explainable and robust features that +consistently improve performance across various backbone models. + +
+
+
+
+
+ + ☆ Measuring and Mitigating Biases in Motor Insurance Pricing + + +
+ The non-life insurance sector operates within a highly competitive and +tightly regulated framework, confronting a pivotal juncture in the formulation +of pricing strategies. Insurers are compelled to harness a range of statistical +methodologies and available data to construct optimal pricing structures that +align with the overarching corporate strategy while accommodating the dynamics +of market competition. Given the fundamental societal role played by insurance, +premium rates are subject to rigorous scrutiny by regulatory authorities. These +rates must conform to principles of transparency, explainability, and ethical +considerations. Consequently, the act of pricing transcends mere statistical +calculations and carries the weight of strategic and societal factors. These +multifaceted concerns may drive insurers to establish equitable premiums, +taking into account various variables. For instance, regulations mandate the +provision of equitable premiums, considering factors such as policyholder +gender or mutualist group dynamics in accordance with respective corporate +strategies. Age-based premium fairness is also mandated. In certain insurance +domains, variables such as the presence of serious illnesses or disabilities +are emerging as new dimensions for evaluating fairness. Regardless of the +motivating factor prompting an insurer to adopt fairer pricing strategies for a +specific variable, the insurer must possess the capability to define, measure, +and ultimately mitigate any ethical biases inherent in its pricing practices +while upholding standards of consistency and performance. This study seeks to +provide a comprehensive set of tools for these endeavors and assess their +effectiveness through practical application in the context of automobile +insurance. + +
+
+ comment: 37 pages +
+
+
+
+
+ + ☆ AMES: A Differentiable Embedding Space Selection Framework for Latent + Graph Inference + + +
+ In real-world scenarios, although data entities may possess inherent +relationships, the specific graph illustrating their connections might not be +directly accessible. Latent graph inference addresses this issue by enabling +Graph Neural Networks (GNNs) to operate on point cloud data, dynamically +learning the necessary graph structure. These graphs are often derived from a +latent embedding space, which can be modeled using Euclidean, hyperbolic, +spherical, or product spaces. However, currently, there is no principled +differentiable method for determining the optimal embedding space. In this +work, we introduce the Attentional Multi-Embedding Selection (AMES) framework, +a differentiable method for selecting the best embedding space for latent graph +inference through backpropagation, considering a downstream task. Our framework +consistently achieves comparable or superior results compared to previous +methods for latent graph inference across five benchmark datasets. Importantly, +our approach eliminates the need for conducting multiple experiments to +identify the optimal embedding space. Furthermore, we explore interpretability +techniques that track the gradient contributions of different latent graphs, +shedding light on how our attention-based, fully differentiable approach learns +to choose the appropriate latent space. In line with previous works, our +experiments emphasize the advantages of hyperbolic spaces in enhancing +performance. More importantly, our interpretability framework provides a +general approach for quantitatively comparing embedding spaces across different +tasks based on their contributions, a dimension that has been overlooked in +previous literature on latent graph inference. + +
+
+
+
+
+ + ☆ Efficient Neural Networks for Tiny Machine Learning: A Comprehensive + Review + + +
+ The field of Tiny Machine Learning (TinyML) has gained significant attention +due to its potential to enable intelligent applications on resource-constrained +devices. This review provides an in-depth analysis of the advancements in +efficient neural networks and the deployment of deep learning models on +ultra-low power microcontrollers (MCUs) for TinyML applications. It begins by +introducing neural networks and discussing their architectures and resource +requirements. It then explores MEMS-based applications on ultra-low power MCUs, +highlighting their potential for enabling TinyML on resource-constrained +devices. The core of the review centres on efficient neural networks for +TinyML. It covers techniques such as model compression, quantization, and +low-rank factorization, which optimize neural network architectures for minimal +resource utilization on MCUs. The paper then delves into the deployment of deep +learning models on ultra-low power MCUs, addressing challenges such as limited +computational capabilities and memory resources. Techniques like model pruning, +hardware acceleration, and algorithm-architecture co-design are discussed as +strategies to enable efficient deployment. Lastly, the review provides an +overview of current limitations in the field, including the trade-off between +model complexity and resource constraints. Overall, this review paper presents +a comprehensive analysis of efficient neural networks and deployment strategies +for TinyML on ultra-low-power MCUs. It identifies future research directions +for unlocking the full potential of TinyML applications on resource-constrained +devices. + +
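+ Of the compression techniques surveyed, post-training quantization is the easiest to show concretely; below is a minimal affine int8 quantization of a weight tensor, independent of any particular MCU toolchain (the names and rounding scheme are illustrative):
+import numpy as np
+
+def quantize_int8(w):
+    # Affine (asymmetric) int8 quantization: w ~= scale * (q - zero_point).
+    w_min, w_max = float(w.min()), float(w.max())
+    scale = (w_max - w_min) / 255.0 if w_max > w_min else 1.0
+    zero_point = int(round(-128 - w_min / scale))
+    q = np.clip(np.round(w / scale) + zero_point, -128, 127).astype(np.int8)
+    return q, scale, zero_point
+
+w = np.random.randn(64, 32).astype(np.float32)
+q, s, zp = quantize_int8(w)
+w_hat = s * (q.astype(np.float32) - zp)   # dequantize
+print(np.abs(w - w_hat).max())            # reconstruction error, on the order of the scale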
+
+ comment: 39 pages, 9 figures, 5 tables +
+
+
+
+
+ + ☆ Multi-Task Faces (MTF) Data Set: A Legally and Ethically Compliant + Collection of Face Images for Various Classification Tasks + + +
+ Human facial data hold tremendous potential to address a variety of +classification problems, including face recognition, age estimation, gender +identification, emotion analysis, and race classification. However, recent +privacy regulations, such as the EU General Data Protection Regulation and +others, have restricted the ways in which human images may be collected and +used for research. As a result, several previously published data sets +containing human faces have been removed from the internet due to inadequate +data collection methods that failed to meet privacy regulations. Data sets +consisting of synthetic data have been proposed as an alternative, but they +fall short of accurately representing the real data distribution. On the other +hand, most available data sets are labeled for just a single task, which limits +their applicability. To address these issues, we present the Multi-Task Faces +(MTF) image data set, a meticulously curated collection of face images designed +for various classification tasks, including face recognition, as well as race, +gender, and age classification. The MTF data set has been ethically gathered by +leveraging publicly available images of celebrities and strictly adhering to +copyright regulations. In this paper, we present this data set and provide +detailed descriptions of the followed data collection and processing +procedures. Furthermore, we evaluate the performance of five deep learning (DL) +models on the MTF data set across the aforementioned classification tasks. +Additionally, we compare the performance of DL models over the processed MTF +data and over raw data crawled from the internet. The reported results +constitute a baseline for further research employing these data. The MTF data +set can be accessed through the following link (please cite the present paper +if you use the data set): https://github.com/RamiHaf/MTF_data_set + +
+
+ comment: 21 pages, 2 figures, 9 Tables, +
+
+
+
+
+ + ☆ Forward Gradients for Data-Driven CFD Wall Modeling + + +
+ Computational Fluid Dynamics (CFD) is used in the design and optimization of +gas turbines and many other industrial/scientific applications. However, its +practical use is often limited by the high computational cost, and the accurate +resolution of near-wall flow is a significant contributor to this cost. Machine +learning (ML) and other data-driven methods can complement existing wall +models. Nevertheless, training these models is bottlenecked by the large +computational effort and memory footprint demanded by back-propagation. Recent +work has presented alternatives for computing gradients of neural networks +where a separate forward and backward sweep is not needed and storage of +intermediate results between sweeps is not required because an unbiased +estimator for the gradient is computed in a single forward sweep. In this +paper, we discuss the application of this approach for training a subgrid wall +model that could potentially be used as a surrogate in wall-bounded flow CFD +simulations to reduce the computational overhead while preserving predictive +accuracy. + +
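+ The single-forward-sweep estimator referred to above is commonly the "forward gradient": the directional derivative along a random tangent, multiplied by that tangent, which is unbiased because E[v v^T] = I. A toy numpy check on a quadratic with a closed-form JVP (a real wall model would use forward-mode automatic differentiation instead):
+import numpy as np
+
+rng = np.random.default_rng(0)
+A = np.diag(np.arange(1.0, 6.0))       # f(w) = 0.5 * w^T A w, grad = A w
+w = rng.normal(size=5)
+true_grad = A @ w
+
+def forward_gradient(w):
+    v = rng.normal(size=w.shape)       # random tangent direction
+    jvp = v @ (A @ w)                  # directional derivative grad . v
+    return jvp * v                     # estimate from a single forward sweep
+
+est = np.mean([forward_gradient(w) for _ in range(20000)], axis=0)
+print(np.max(np.abs(est - true_grad)))  # small: the estimator is unbiased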
+
+
+
+
+ + ☆ Training robust and generalizable quantum models + + +
+ Adversarial robustness and generalization are both crucial properties of +reliable machine learning models. In this paper, we study these properties in +the context of quantum machine learning based on Lipschitz bounds. We derive +tailored, parameter-dependent Lipschitz bounds for quantum models with +trainable encoding, showing that the norm of the data encoding has a crucial +impact on the robustness against perturbations in the input data. Further, we +derive a bound on the generalization error which explicitly depends on the +parameters of the data encoding. Our theoretical findings give rise to a +practical strategy for training robust and generalizable quantum models by +regularizing the Lipschitz bound in the cost. Further, we show that, for fixed +and non-trainable encodings as frequently employed in quantum machine learning, +the Lipschitz bound cannot be influenced by tuning the parameters. Thus, +trainable encodings are crucial for systematically adapting robustness and +generalization during training. With numerical results, we demonstrate that, +indeed, Lipschitz bound regularization leads to substantially more robust and +generalizable quantum models. + +
+
+
+
+
+ + ☆ Establishing Central Sensitization Inventory Cut-off Values in patients + with Chronic Low Back Pain by Unsupervised Machine Learning + + +
+ Human Assumed Central Sensitization (HACS) is involved in the development and +maintenance of chronic low back pain (CLBP). The Central Sensitization +Inventory (CSI) was developed to evaluate the presence of HACS, with a cut-off +value of 40/100 based on patients with chronic pain. However, various factors, +including pain conditions (e.g., CLBP) and gender, may influence this cut-off +value. For chronic pain conditions such as CLBP, unsupervised clustering +approaches can take these factors into consideration and automatically learn +the HACS-related patterns. Therefore, this study aimed to determine the cut-off +values for a Dutch-speaking population with CLBP, considering the total group +and stratified by gender based on unsupervised machine learning. In this study, +questionnaire data covering pain, physical, and psychological aspects were +collected from patients with CLBP and age-matched pain-free adults (referred +to as healthy controls, HC). Four clustering approaches were applied to +identify HACS-related clusters based on the questionnaire data and gender. The +clustering performance was assessed using internal and external indicators. +Subsequently, receiver operating characteristic analysis was conducted on the +best clustering results to determine the optimal cut-off values. The study +included 151 subjects, consisting of 63 HCs and 88 patients with CLBP. +Hierarchical clustering yielded the best results, identifying three clusters: +healthy group, CLBP with low HACS level, and CLBP with high HACS level groups. +Based on the low HACS level group (including HC and CLBP with low HACS level) +and the high HACS level group, the cut-off value was 35 for the overall group, 34 +for females, and 35 for males. The findings suggest that the optimal cut-off value +for CLBP is 35. The gender-related cut-off values should be interpreted with +caution due to the unbalanced gender distribution in the sample. + +
+
+ comment: 31 pages, 5 tables, 3 figures +
+
+
+
+
+ + ☆ Deep learning complete intersection Calabi-Yau manifolds + + +
+ We review advancements in deep learning techniques for complete intersection +Calabi-Yau (CICY) 3- and 4-folds, with the aim of better understanding how to +handle algebraic topological data with machine learning. We first discuss +methodological aspects and data analysis, before describing neural network +architectures. Then, we describe the state-of-the-art accuracy in predicting +Hodge numbers. We include new results on extrapolating predictions from low to +high Hodge numbers, and conversely. + +
+
+ comment: 19 pages; match version published in "Machine Learning in Pure + Mathematics and Theoretical Physics" (edited by Y.-H. He, World Scientific + Press) +
+
+
+
+
+ + ☆ High Probability Guarantees for Random Reshuffling + + +
+ We consider the stochastic gradient method with random reshuffling +($\mathsf{RR}$) for tackling smooth nonconvex optimization problems. +$\mathsf{RR}$ finds broad applications in practice, notably in training neural +networks. In this work, we first investigate the concentration property of +$\mathsf{RR}$'s sampling procedure and establish a new high probability sample +complexity guarantee for driving the gradient (without expectation) below +$\varepsilon$, which effectively characterizes the efficiency of a single +$\mathsf{RR}$ execution. Our derived complexity matches the best existing +in-expectation one up to a logarithmic term while imposing no additional +assumptions nor changing $\mathsf{RR}$'s updating rule. Furthermore, by +leveraging our derived high probability descent property and bound on the +stochastic error, we propose a simple and computable stopping criterion for +$\mathsf{RR}$ (denoted as $\mathsf{RR}$-$\mathsf{sc}$). This criterion is +guaranteed to be triggered after a finite number of iterations, and then +$\mathsf{RR}$-$\mathsf{sc}$ returns an iterate with its gradient below +$\varepsilon$ with high probability. Moreover, building on the proposed +stopping criterion, we design a perturbed random reshuffling method +($\mathsf{p}$-$\mathsf{RR}$) that involves an additional randomized +perturbation procedure near stationary points. We derive that +$\mathsf{p}$-$\mathsf{RR}$ provably escapes strict saddle points and +efficiently returns a second-order stationary point with high probability, +without making any sub-Gaussian tail-type assumptions on the stochastic +gradient errors. Finally, we conduct numerical experiments on neural network +training to support our theoretical findings. + +
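+ The paper's computable stopping criterion is not given in the abstract; the sketch below only illustrates the basic loop it attaches to, i.e. random reshuffling with an end-of-epoch gradient-norm check, on an interpolating least-squares toy problem where the check is guaranteed to fire:
+import numpy as np
+
+rng = np.random.default_rng(0)
+X = rng.normal(size=(200, 10))
+w_true = rng.normal(size=10)
+y = X @ w_true                        # interpolation regime: zero noise
+w, lr, eps = np.zeros(10), 0.01, 1e-4
+
+for epoch in range(500):
+    for i in rng.permutation(len(X)):            # one random-reshuffling pass
+        w -= lr * (X[i] @ w - y[i]) * X[i]       # per-sample gradient step
+    full_grad = X.T @ (X @ w - y) / len(X)       # naive end-of-epoch check,
+    if np.linalg.norm(full_grad) < eps:          # a stand-in for the paper's
+        print("stopping criterion met at epoch", epoch)  # computable criterion
+        break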
+
+ comment: 21 pages, 3 figures +
+
+
+
+
+ + ☆ Kandinsky Conformal Prediction: Efficient Calibration of Image + Segmentation Algorithms + + +
+ Image segmentation algorithms can be understood as a collection of pixel +classifiers, for which the outcomes of nearby pixels are correlated. Classifier +models can be calibrated using Inductive Conformal Prediction, but this +requires holding back a sufficiently large calibration dataset for computing +the distribution of non-conformity scores of the model's predictions. If one +requires only marginal calibration on the image level, this calibration +set consists of all individual pixels in the images available for calibration. +However, if the goal is to attain proper calibration for each individual pixel +classifier, the calibration set consists of individual images. In a scenario +where data are scarce (such as the medical domain), it may not always be +possible to set aside sufficiently many images for this pixel-level +calibration. The method we propose, dubbed ``Kandinsky calibration'', makes use +of the spatial structure present in the distribution of natural images to +simultaneously calibrate the classifiers of ``similar'' pixels. This can be +seen as an intermediate approach between marginal (imagewise) and conditional +(pixelwise) calibration, where non-conformity scores are aggregated over +similar image regions, thereby making more efficient use of the images +available for calibration. We run experiments on segmentation algorithms +trained and calibrated on subsets of the public MS-COCO and Medical Decathlon +datasets, demonstrating that the Kandinsky calibration method can significantly +improve coverage. When compared to both pixelwise and imagewise calibration +on little data, the Kandinsky method achieves much lower coverage errors, +indicating the data efficiency of Kandinsky calibration. + +
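+ The core idea, aggregating non-conformity scores over groups of "similar" pixels rather than over whole images or single pixels, can be sketched as one conformal quantile per pixel group; the grouping itself and the score definition are left abstract here and are not the paper's exact procedure:
+import numpy as np
+
+def groupwise_conformal_thresholds(scores, groups, alpha=0.1):
+    # scores: (n_images, n_pixels) non-conformity scores on calibration data.
+    # groups: (n_pixels,) integer id assigning each pixel to a region of
+    # "similar" pixels. Returns one conformal threshold per group, pooling all
+    # calibration scores of that group (between imagewise and pixelwise).
+    thresholds = {}
+    for g in np.unique(groups):
+        pooled = scores[:, groups == g].ravel()
+        n = len(pooled)
+        level = np.ceil((n + 1) * (1 - alpha)) / n   # finite-sample correction
+        thresholds[g] = np.quantile(pooled, min(level, 1.0))
+    return thresholds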
+
+ comment: 15 pages, 11 figures +
+
+
+
+
+ + ☆ System 2 Attention (is something you might need too) + + +
+ Soft attention in Transformer-based Large Language Models (LLMs) is +susceptible to incorporating irrelevant information from the context into its +latent representations, which adversely affects next-token generation. To help +rectify these issues, we introduce System 2 Attention (S2A), which leverages +the ability of LLMs to reason in natural language and follow instructions in +order to decide what to attend to. S2A regenerates the input context to only +include the relevant portions, before attending to the regenerated context to +elicit the final response. In experiments, S2A outperforms standard +attention-based LLMs on three tasks containing opinionated or irrelevant +information (QA, math word problems, and longform generation), where S2A +increases factuality and objectivity, and decreases sycophancy. + +
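+ The two-step procedure can be sketched as plain prompting; `generate` below is a hypothetical placeholder for any instruction-following LLM call, and the prompt wording is mine rather than the paper's:
+def system2_attention(context, question, generate):
+    # Step 1: ask the model to regenerate the context, keeping only material
+    # that is relevant and unopinionated with respect to the question.
+    rewrite_prompt = (
+        "Extract the parts of the following text that are relevant and "
+        "unbiased for answering the question, and nothing else.\n"
+        "Question: " + question + "\n\nText:\n" + context
+    )
+    cleaned_context = generate(rewrite_prompt)
+    # Step 2: answer using the regenerated context instead of the original one.
+    answer_prompt = ("Context:\n" + cleaned_context +
+                     "\n\nQuestion: " + question + "\nAnswer:")
+    return generate(answer_prompt)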
+
+
+
+
+ + ☆ Few-shot Multispectral Segmentation with Representations Generated by + Reinforcement Learning + + +
+ The task of multispectral image segmentation (segmentation of images with +numerous channels/bands, each capturing a specific range of wavelengths of +electromagnetic radiation) has been previously explored in contexts with large +amounts of labeled data. However, these models tend not to generalize well to +datasets of smaller size. In this paper, we propose a novel approach for +improving few-shot segmentation performance on multispectral images using +reinforcement learning to generate representations. These representations are +generated in the form of mathematical expressions between channels and are +tailored to the specific class being segmented. Our methodology involves +training an agent to identify the most informative expressions, updating the +dataset using these expressions, and then using the updated dataset to perform +segmentation. Due to the limited length of the expressions, the model receives +useful representations without any added risk of overfitting. We evaluate the +effectiveness of our approach on several multispectral datasets and demonstrate +its effectiveness in boosting the performance of segmentation algorithms. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Zero redundancy distributed learning with differential privacy + + +
+ Deep learning using large models has achieved great success in a wide range +of domains. However, training these models with billions of parameters is very +challenging in terms of the training speed, memory cost, and communication +efficiency, especially under the privacy-preserving regime with differential +privacy (DP). On the one hand, DP optimization has comparable efficiency to the +standard non-private optimization on a single GPU, but on multiple GPUs, +existing DP distributed learning (such as pipeline parallel) suffers from +significantly worse efficiency. On the other hand, the Zero Redundancy +Optimizer (ZeRO) is a state-of-the-art solution to standard distributed +learning, exhibiting excellent training efficiency on large models, but making it work +compatibly with DP is technically complicated. In this work, we develop a new +systematic solution, DP-ZeRO, (I) to scale up the trainable DP model size, e.g. +to GPT-100B, (II) to obtain the same computation and communication efficiency +as the standard ZeRO, and (III) to enable mixed-precision DP training. Our +DP-ZeRO, like the standard ZeRO, has the potential to train models of +arbitrary size and is evaluated on the world's largest DP models in terms of +the number of trainable parameters. + +
+
+
+
+
+ + ☆ Cross-View Graph Consistency Learning for Invariant Graph + Representations + + +
+ Graph representation learning is fundamental for analyzing graph-structured +data. Exploring invariant graph representations remains a challenge for most +existing graph representation learning methods. In this paper, we propose a +cross-view graph consistency learning (CGCL) method that learns invariant graph +representations for link prediction. First, two complementary augmented views +are derived from an incomplete graph structure through a bidirectional graph +structure augmentation scheme. This augmentation scheme mitigates the potential +information loss that is commonly associated with various data augmentation +techniques involving raw graph data, such as edge perturbation, node removal, +and attribute masking. Second, we propose a CGCL model that can learn invariant +graph representations. A cross-view training scheme is proposed to train the +proposed CGCL model. This scheme attempts to maximize the consistency +information between one augmented view and the graph structure reconstructed +from the other augmented view. Furthermore, we offer a comprehensive +theoretical CGCL analysis. This paper empirically and experimentally +demonstrates the effectiveness of the proposed CGCL method, achieving +competitive results on graph datasets in comparisons with several +state-of-the-art algorithms. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Generalized super-resolution 4D Flow MRI -- using ensemble learning to + extend across the cardiovascular system + + +
+ 4D Flow Magnetic Resonance Imaging (4D Flow MRI) is a non-invasive +measurement technique capable of quantifying blood flow across the +cardiovascular system. While practical use is limited by spatial resolution and +image noise, incorporation of trained super-resolution (SR) networks has +potential to enhance image quality post-scan. However, these efforts have +predominantly been restricted to narrowly defined cardiovascular domains, with +limited exploration of how SR performance extends across the cardiovascular +system; a task aggravated by contrasting hemodynamic conditions apparent across +the cardiovasculature. The aim of our study was to explore the generalizability +of SR 4D Flow MRI using a combination of heterogeneous training sets and +dedicated ensemble learning. With synthetic training data generated across +three disparate domains (cardiac, aortic, cerebrovascular), varying +convolutional base and ensemble learners were evaluated as a function of domain +and architecture, quantifying performance on both in-silico and acquired +in-vivo data from the same three domains. Results show that both bagging and +stacking ensembling enhance SR performance across domains, accurately +predicting high-resolution velocities from low-resolution input data in-silico. +Likewise, optimized networks successfully recover native resolution velocities +from downsampled in-vivo data, as well as show qualitative potential in +generating denoised SR-images from clinical level input data. In conclusion, +our work presents a viable approach for generalized SR 4D Flow MRI, with +ensemble learning extending utility across various clinical areas of interest. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ LogLead -- Fast and Integrated Log Loader, Enhancer, and Anomaly + Detector + + +
+ This paper introduces LogLead, a tool designed for efficient log analysis. +LogLead combines three essential steps in log processing: loading, enhancing, +and anomaly detection. The tool leverages Polars, a high-speed DataFrame +library. We currently have 7 loaders, of which 4 are for public data sets +(HDFS, Hadoop, BGL, and Thunderbird). We have multiple enhancers, including three +parsers (Drain, Spell, LenMa), BERT embedding creation, and other log +representation techniques like bag-of-words. LogLead integrates with 5 supervised +and 4 unsupervised machine learning algorithms for anomaly detection from +SKLearn. By integrating diverse datasets, log representation methods and +anomaly detectors, LogLead facilitates comprehensive benchmarking in log +analysis research. We demonstrate that log loading from raw file to dataframe +is over 10x faster with LogLead compared to past solutions. We demonstrate +roughly 2x improvement in Drain parsing speed by off-loading log message +normalization to LogLead. We present a brief benchmark on HDFS +suggesting that log representations beyond bag-of-words provide limited +benefits. Screencast demonstrating the tool: https://youtu.be/8stdbtTfJVo + +
+
+
+
+
+ + ☆ Operator Learning for Continuous Spatial-Temporal Model with A Hybrid + Optimization Scheme + + +
+ Partial differential equations are often used in the spatial-temporal +modeling of complex dynamical systems in many engineering applications. In this +work, we build on the recent progress of operator learning and present a +data-driven modeling framework that is continuous in both space and time. A key +feature of the proposed model is the resolution-invariance with respect to both +spatial and temporal discretizations. To improve the long-term performance of +the calibrated model, we further propose a hybrid optimization scheme that +leverages both gradient-based and derivative-free optimization methods and +efficiently trains on both short-term time series and long-term statistics. We +investigate the performance of the spatial-temporal continuous learning +framework with three numerical examples, including the viscous Burgers' +equation, the Navier-Stokes equations, and the Kuramoto-Sivashinsky equation. +The results confirm the resolution-invariance of the proposed modeling +framework and also demonstrate stable long-term simulations with only +short-term time series data. In addition, we show that the proposed model can +better predict long-term statistics via the hybrid optimization scheme with a +combined use of short-term and long-term data. + +
+
+
+
+
+ + ☆ Approximate Linear Programming and Decentralized Policy Improvement in + Cooperative Multi-agent Markov Decision Processes + + +
+ In this work, we consider a `cooperative' multi-agent Markov decision process +(MDP) involving m greater than 1 agents, where all agents are aware of the +system model. At each decision epoch, all the m agents cooperatively select +actions in order to maximize a common long-term objective. Since the number of +actions grows exponentially in the number of agents, policy improvement is +computationally expensive. Recent works have proposed using decentralized +policy improvement in which each agent assumes that the decisions of the other +agents are fixed and it improves its decisions unilaterally. Yet, in these +works, exact values are computed. In our work, for cooperative multi-agent +finite and infinite horizon discounted MDPs, we propose suitable approximate +policy iteration algorithms, wherein we use approximate linear programming to +compute the approximate value function and use decentralized policy +improvement. Thus our algorithms can handle both large number of states as well +as multiple agents. We provide theoretical guarantees for our algorithms and +also demonstrate the performance of our algorithms on some numerical examples. + +
+
+
+
+
+ + ☆ Robust Tumor Segmentation with Hyperspectral Imaging and Graph Neural + Networks + + +
+ Segmenting the boundary between tumor and healthy tissue during surgical +cancer resection poses a significant challenge. In recent years, Hyperspectral +Imaging (HSI) combined with Machine Learning (ML) has emerged as a promising +solution. However, due to the extensive information contained within the +spectral domain, most ML approaches primarily classify individual HSI +(super-)pixels, or tiles, without taking into account their spatial context. In +this paper, we propose an improved methodology that leverages the spatial +context of tiles for more robust and smoother segmentation. To address the +irregular shapes of tiles, we utilize Graph Neural Networks (GNNs) to propagate +context information across neighboring regions. The features for each tile +within the graph are extracted using a Convolutional Neural Network (CNN), +which is trained simultaneously with the subsequent GNN. Moreover, we +incorporate local image quality metrics into the loss function to enhance the +training procedure's robustness against low-quality regions in the training +images. We demonstrate the superiority of our proposed method using a clinical +ex vivo dataset consisting of 51 HSI images from 30 patients. Despite the +limited dataset, the GNN-based model significantly outperforms context-agnostic +approaches, accurately distinguishing between healthy and tumor tissues, even +in images from previously unseen patients. Furthermore, we show that our +carefully designed loss function, accounting for local image quality, results +in additional improvements. Our findings demonstrate that context-aware GNN +algorithms can robustly find tumor demarcations on HSI images, ultimately +contributing to better surgery success and patient outcome. + +
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ☆ Multimodal deep learning for mapping forest dominant height by fusing + GEDI with earth observation data + + +
+ The integration of multisource remote sensing data and deep learning models +offers new possibilities for accurately mapping high spatial resolution forest +height. We found that GEDI relative height (RH) metrics exhibited a strong +correlation with the mean of the top 10 highest trees (dominant height) +measured in situ at the corresponding footprint locations. Consequently, we +proposed a novel deep learning framework termed the multi-modal attention +remote sensing network (MARSNet) to estimate forest dominant height by +extrapolating dominant height derived from GEDI, using Sentinel-1 data, ALOS-2 +PALSAR-2 data, Sentinel-2 optical data and ancillary data. MARSNet comprises +separate encoders for each remote sensing data modality to extract multi-scale +features, and a shared decoder to fuse the features and estimate height. Using +individual encoders for each remote sensing modality avoids interference across +modalities and extracts distinct representations. To focus on the efficacious +information from each dataset, we reduced the prevalent spatial and band +redundancies in each remote sensing dataset by incorporating the extended spatial +and band reconstruction convolution modules in the encoders. MARSNet achieved +commendable performance in estimating dominant height, with an R2 of 0.62 and +RMSE of 2.82 m, outperforming the widely used random forest approach which +attained an R2 of 0.55 and RMSE of 3.05 m. Finally, we applied the trained +MARSNet model to generate wall-to-wall maps at 10 m resolution for Jilin, +China. Through independent validation using field measurements, MARSNet +demonstrated an R2 of 0.58 and RMSE of 3.76 m, compared to 0.41 and 4.37 m for +the random forest baseline. Our research demonstrates the effectiveness of a +multimodal deep learning approach fusing GEDI with SAR and passive optical +imagery for enhancing the accuracy of high resolution dominant height +estimation. + +
+
+
+
+
+ + ☆ A Good Feature Extractor Is All You Need for Weakly Supervised Learning + in Histopathology + + +
+ Deep learning is revolutionising pathology, offering novel opportunities in +disease prognosis and personalised treatment. Historically, stain normalisation +has been a crucial preprocessing step in computational pathology pipelines, and +persists into the deep learning era. Yet, with the emergence of feature +extractors trained using self-supervised learning (SSL) on diverse pathology +datasets, we call this practice into question. In an empirical evaluation of +publicly available feature extractors, we find that omitting stain +normalisation and image augmentations does not compromise downstream +performance, while incurring substantial savings in memory and compute. +Further, we show that the top-performing feature extractors are remarkably +robust to variations in stain and augmentations like rotation in their latent +space. Contrary to previous patch-level benchmarking studies, our approach +emphasises clinical relevance by focusing on slide-level prediction tasks in a +weakly supervised setting with external validation cohorts. This work +represents the most comprehensive robustness evaluation of public pathology SSL +feature extractors to date, involving more than 6,000 training runs across nine +tasks, five datasets, three downstream architectures, and various preprocessing +setups. Our findings stand to streamline digital pathology workflows by +minimising preprocessing needs and informing the selection of feature +extractors. + +
+
+
+
+
+ + ☆ MUVO: A Multimodal Generative World Model for Autonomous Driving with + Geometric Representations + + +
+ Learning unsupervised world models for autonomous driving has the potential +to improve the reasoning capabilities of today's systems dramatically. However, +most work neglects the physical attributes of the world and focuses on sensor +data alone. We propose MUVO, a MUltimodal World Model with Geometric VOxel +Representations to address this challenge. We utilize raw camera and lidar data +to learn a sensor-agnostic geometric representation of the world, which can +directly be used by downstream tasks, such as planning. We demonstrate +multimodal future predictions and show that our geometric representation +improves the prediction quality of both camera images and lidar point clouds. + +
+
+
+
+
+ + ☆ Unveiling the Unseen Potential of Graph Learning through MLPs: Effective + Graph Learners Using Propagation-Embracing MLPs + + +
+ Recent studies attempted to utilize multilayer perceptrons (MLPs) to solve +semi-supervised node classification on graphs, by training a student MLP by +knowledge distillation (KD) from a teacher graph neural network (GNN). While +previous studies have focused mostly on training the student MLP by matching +the output probability distributions between the teacher and student models +during KD, it has not been systematically studied how to inject the structural +information in an explicit and interpretable manner. Inspired by GNNs that +separate feature transformation $T$ and propagation $\Pi$, we re-frame the KD +process as enabling the student MLP to explicitly learn both $T$ and $\Pi$. +Although this can be achieved by applying the inverse propagation $\Pi^{-1}$ +before distillation from the teacher GNN, it still comes with a high +computational cost from large matrix multiplications during training. To solve +this problem, we propose Propagate & Distill (P&D), which propagates the output +of the teacher GNN before KD and can be interpreted as an approximate process +of the inverse propagation $\Pi^{-1}$. Through comprehensive evaluations using +real-world benchmark datasets, we demonstrate the effectiveness of P&D by +showing further performance boost of the student MLP. + +
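+ The key step, propagating the teacher GNN's soft predictions over the graph before distilling them into the MLP, can be sketched as follows; symmetric normalization and the number of propagation steps k are assumptions rather than the paper's exact recipe:
+import numpy as np
+
+def propagate_teacher_outputs(A, teacher_probs, k=2):
+    # A: (n, n) adjacency matrix, teacher_probs: (n, c) soft predictions of the
+    # teacher GNN. Smooths the soft labels over the graph so the student MLP
+    # can be distilled against structure-aware targets (illustrative sketch).
+    A_hat = A + np.eye(len(A))                     # add self-loops
+    d_inv_sqrt = 1.0 / np.sqrt(A_hat.sum(axis=1))
+    P = A_hat * d_inv_sqrt[:, None] * d_inv_sqrt[None, :]   # D^-1/2 A D^-1/2
+    targets = teacher_probs
+    for _ in range(k):
+        targets = P @ targets                      # propagate soft labels
+    return targets                                 # distillation targets for the MLP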
+
+ comment: 35 pages, 5 figures, 8 tables +
+
+
+
+
+ + ☆ Revealing behavioral impact on mobility prediction networks through + causal interventions + + +
+ Deep neural networks are increasingly utilized in mobility prediction tasks, +yet their intricate internal workings pose challenges for interpretability, +especially in comprehending how various aspects of mobility behavior affect +predictions. In this study, we introduce a causal intervention framework to +assess the impact of mobility-related factors on neural networks designed for +next location prediction -- a task focusing on predicting the immediate next +location of an individual. To achieve this, we employ individual mobility +models to generate synthetic location visit sequences and control behavior +dynamics by intervening in their data generation process. We evaluate the +interventional location sequences using mobility metrics and input them into +well-trained networks to analyze performance variations. The results +demonstrate the framework's effectiveness in producing location sequences with distinct +mobility behaviors, thus facilitating the simulation of diverse spatial and +temporal changes. These changes result in performance fluctuations in next +location prediction networks, revealing the impacts of critical mobility behavior +factors, including sequential patterns in location transitions, proclivity for +exploring new locations, and preferences in location choices at population and +individual levels. The gained insights hold significant value for the +real-world application of mobility prediction networks, and the framework is +expected to promote the use of causal inference for enhancing the +interpretability and robustness of neural networks in mobility applications. + +
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ☆ Leveraging Uncertainty Estimates To Improve Classifier Performance + + +
+ Binary classification involves predicting the label of an instance based on +whether the model score for the positive class exceeds a threshold chosen based +on the application requirements (e.g., maximizing recall for a precision +bound). However, model scores are often not aligned with the true positivity +rate. This is especially true when the training involves a differential +sampling across classes or there is distributional drift between train and test +settings. In this paper, we provide theoretical analysis and empirical evidence +of the dependence of model score estimation bias on both uncertainty and score +itself. Further, we formulate the decision boundary selection in terms of both +model score and uncertainty, prove that it is NP-hard, and present algorithms +based on dynamic programming and isotonic regression. Evaluation of the +proposed algorithms on three real-world datasets yield 25%-40% gain in recall +at high precision bounds over the traditional approach of using model score +alone, highlighting the benefits of leveraging uncertainty. + +
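+ The paper's dynamic-programming and isotonic-regression algorithms are not reproduced here; the sketch below is a much simpler stand-in that still conveys the idea of a decision boundary over (score, uncertainty): pick one score threshold per uncertainty bin so that precision in the bin stays above a bound while recall is maximized:
+import numpy as np
+
+def per_bin_thresholds(scores, uncert, labels, n_bins=5, min_precision=0.9):
+    # For each uncertainty bin, choose the lowest score threshold whose
+    # within-bin precision still meets the bound (a lower threshold gives more
+    # recall). labels are binary {0, 1}. Simplified stand-in, not the paper's
+    # DP / isotonic-regression algorithms.
+    edges = np.quantile(uncert, np.linspace(0, 1, n_bins + 1))
+    thresholds = []
+    for lo, hi in zip(edges[:-1], edges[1:]):
+        mask = (uncert >= lo) & (uncert <= hi)
+        s, y = scores[mask], labels[mask]
+        best = np.inf                                # inf = predict nothing
+        for t in np.unique(s):
+            pred = s >= t
+            if pred.sum() and y[pred].mean() >= min_precision:
+                best = min(best, t)
+        thresholds.append(best)
+    return edges, thresholds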
+
+
+
+
+ + ☆ Can we infer the presence of Differential Privacy in Deep Learning + models' weights? Towards more secure Deep Learning + + +
+ Differential Privacy (DP) is a key property to protect data and models from +integrity attacks. In the Deep Learning (DL) field, it is commonly implemented +through the Differentially Private Stochastic Gradient Descent (DP-SGD). +However, when a model is shared or released, there is no way to check whether +it is differentially private, that is, one is required to trust the model provider. +This situation poses a problem when data privacy is mandatory, especially with +current data regulations, as the presence of DP cannot be certified +consistently by any third party. Thus, we face the challenge of determining +whether a DL model has been trained with DP, according to the title question: +Can we infer the presence of Differential Privacy in Deep Learning models' +weights? Since DP-SGD significantly changes the training process of a DL +model, we hypothesize that DP leaves an imprint in the weights of a DL model, +which can be used to predict whether a model has been trained with DP +regardless of its architecture and the training dataset. In this paper, we +propose to employ the imprint that DP leaves in model weights to infer the +presence of DP training in a DL model. To substantiate our hypothesis, we +developed an experimental methodology based on two datasets of weights of DL +models, each containing models trained with and without DP, and a meta-classifier to +infer whether DP was used in the training process of a DL model, by accessing +its weights. We thereby both remove the requirement of a trusted +model provider and lay a strong foundation for this line of research. +Thus, our contribution is an additional layer of security on top of the strict +privacy requirements of DP training, towards more secure DL models. + +
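+ A schematic version of the experimental setup described above: summarize each model's weights with a handful of statistics and fit a meta-classifier that predicts whether DP-SGD was used. The feature set and classifier choice are my assumptions, not the paper's:
+import numpy as np
+from sklearn.ensemble import RandomForestClassifier
+
+def weight_features(weight_tensors):
+    # Summary statistics of a model's flattened weights (illustrative choice).
+    w = np.concatenate([np.ravel(t) for t in weight_tensors])
+    return [w.mean(), w.std(), np.abs(w).mean(),
+            np.percentile(w, 1), np.percentile(w, 99)]
+
+def train_meta_classifier(models):
+    # models: list of (list_of_weight_arrays, trained_with_dp_flag) pairs.
+    X = np.array([weight_features(w) for w, _ in models])
+    y = np.array([flag for _, flag in models])
+    clf = RandomForestClassifier(n_estimators=200, random_state=0)
+    return clf.fit(X, y)    # predicts DP vs. non-DP training from weights alone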
+
+
+
+
+ + ☆ Sparse Low-rank Adaptation of Pre-trained Language Models EMNLP 2023 + + +
+ Fine-tuning pre-trained large language models in a parameter-efficient manner
+is widely studied for its effectiveness and efficiency. The popular method of
+low-rank adaptation (LoRA) offers a notable approach, hypothesizing that the
+adaptation process is intrinsically low-dimensional. Although LoRA has
+demonstrated commendable performance, it is implemented with a fixed and
+unalterable intrinsic rank that might not always be the ideal choice.
+Recognizing the need for more flexible adaptation, we extend the methodology of
+LoRA to an innovative approach we call sparse low-rank adaptation (SoRA) that
+enables dynamic adjustments to the intrinsic rank during the adaptation
+process. We achieve this through the incorporation of a gate unit optimized
+with the proximal gradient method in the training stage, controlling the
+cardinality of the rank via the sparsity of the gate. In the subsequent
+inference stage, we eliminate the parameter blocks corresponding to the
+zeroed-out ranks, to reduce each SoRA module back to a concise yet rank-optimal
+LoRA. Our approach strengthens the representation power of LoRA by initializing
+it with a higher rank, while efficiently taming a temporarily increased number
+of parameters via updating in a sparse way. We further introduce a sparsifying
+scheduler for SoRA, aiming to examine the impact of the number of non-zero
+parameters on the model's memorization and generalization. Our experimental
+results demonstrate that SoRA can outperform other baselines even with only 70%
+of the parameters retained and 70% of the training time.
+
+
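The gate-plus-proximal-gradient mechanism can be sketched concretely: a learnable gate scales each LoRA rank component, and after each optimizer step an L1 proximal update (soft-thresholding) drives gate entries to exactly zero, pruning the corresponding ranks. The layer below is a hedged PyTorch sketch under these assumptions, not the authors' implementation.

```python
import torch
import torch.nn as nn

class SparseLoRALinear(nn.Module):
    """Illustrative SoRA-style layer: LoRA with a learnable gate on the rank dim.

    The gate is sparsified with an L1 proximal (soft-threshold) step applied
    after each optimizer update; zeroed gate entries correspond to pruned ranks.
    """
    def __init__(self, base: nn.Linear, r: int = 16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)            # frozen pretrained weight
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))
        self.gate = nn.Parameter(torch.ones(r))

    def forward(self, x):
        delta = (x @ self.A.t()) * self.gate   # gate scales each rank component
        return self.base(x) + delta @ self.B.t()

    @torch.no_grad()
    def prox_gate(self, lam: float, lr: float):
        """Soft-thresholding: prox of lam * ||gate||_1 with step size lr."""
        thr = lam * lr
        self.gate.copy_(torch.sign(self.gate) * torch.clamp(self.gate.abs() - thr, min=0.0))

# Toy usage: one training step followed by the proximal step on the gate.
layer = SparseLoRALinear(nn.Linear(32, 32), r=8)
opt = torch.optim.AdamW([p for p in layer.parameters() if p.requires_grad], lr=1e-3)
x, target = torch.randn(4, 32), torch.randn(4, 32)
loss = ((layer(x) - target) ** 2).mean()
loss.backward()
opt.step()
opt.zero_grad()
layer.prox_gate(lam=0.1, lr=1e-3)
print("active ranks:", int((layer.gate != 0).sum()))
```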
+
+ comment: Accepted to EMNLP 2023 (Main Conference) +
+
+
+
+
+ + ☆ Unveiling the Power of Self-Attention for Shipping Cost Prediction: The + Rate Card Transformer + + +
+ Amazon ships billions of packages to its customers annually within the United
+States. The shipping cost of these packages is used on the day of shipping
+(day 0) to estimate the profitability of sales. Downstream systems utilize
+these day-0 profitability estimates to make financial decisions, such as
+pricing strategies and delisting loss-making products. However, obtaining
+accurate shipping cost estimates on day 0 is complex for reasons such as delays
+in carrier invoicing or fixed cost components being recorded at a monthly
+cadence. Inaccurate shipping cost estimates can lead to bad decisions, such as
+pricing items too low or too high, or promoting the wrong products to
+customers. Current solutions for estimating shipping costs on day 0 rely on
+tree-based models that require extensive manual engineering efforts. In this
+study, we propose a novel architecture called the Rate Card Transformer (RCT)
+that uses self-attention to encode all package shipping information such as
+package attributes, carrier information and route plan. Unlike other
+transformer-based tabular models, RCT has the ability to encode a variable
+list of one-to-many relations of a shipment, allowing it to capture more
+information about a shipment. For example, RCT can encode properties of all
+products in a package. Our results demonstrate that cost predictions made by
+the RCT have 28.82% less error compared to a tree-based GBDT model. Moreover,
+the RCT outperforms the state-of-the-art transformer-based tabular model,
+FTTransformer, by 6.08%. We also illustrate that the RCT learns a generalized
+manifold of the rate card that can improve the performance of tree-based
+models.
+
+
+
+
+
+
+ + ☆ Unraveling the Control Engineer's Craft with Neural Networks + + +
+ Many industrial processes require suitable controllers to meet their
+performance requirements. Often, a sophisticated digital twin is available: a
+highly complex model serving as a virtual representation of a given physical
+process, whose parameters may not be properly tuned to capture the variations
+in the physical process. In this paper, we present a sim2real, direct
+data-driven controller tuning approach, where the digital twin is used to
+generate input-output data and suitable controllers for several perturbations
+in its parameters. State-of-the-art neural-network architectures are then used
+to learn the controller tuning rule that maps input-output data onto the
+controller parameters, based on artificially generated data from perturbed
+versions of the digital twin. In this way, as far as we are aware, we tackle
+for the first time the problem of re-calibrating the controller by
+meta-learning the tuning rule directly from data, thus practically replacing
+the control engineer with a machine learning model. The benefits of this
+methodology are illustrated via numerical simulations for several choices of
+neural-network architectures.
+
+
+
+ comment: 6 pages +
+
+
+
+
+ + ☆ Generating Realistic Counterfactuals for Retinal Fundus and OCT Images + using Diffusion Models + + +
+ Counterfactual reasoning is often used in a clinical setting to explain
+decisions or weigh alternatives. Therefore, for imaging based modalities such
+as ophthalmology, it would be beneficial to be able to create counterfactual
+images, illustrating the answer to the question: "If the subject had had
+diabetic retinopathy, how would the fundus image have looked?" Here, we
+demonstrate that using a diffusion model in combination with an adversarially
+robust classifier trained on retinal disease classification tasks enables
+generation of highly realistic counterfactuals of retinal fundus images and
+optical coherence tomography (OCT) B-scans. Ideally, these classifiers encode
+the salient features indicative of each disease class and can steer the
+diffusion model to show realistic disease signs or remove disease-related
+lesions in a realistic way. Importantly, in a user study, domain experts found
+the counterfactuals generated using our method significantly more realistic
+than counterfactuals generated by a previous method, and even
+indistinguishable from realistic images.
+
+
+
+
+
+
+ + ☆ Incorporating LLM Priors into Tabular Learners NeurIPS 2023 + + +
+ We present a method to integrate Large Language Models (LLMs) and traditional
+tabular data classification techniques, addressing LLM challenges such as
+sensitivity to data serialization and biases. We introduce two strategies
+utilizing LLMs for ranking categorical variables and generating priors on
+correlations between continuous variables and targets, enhancing performance
+in few-shot scenarios. We focus on Logistic Regression, introducing
+MonotonicLR, which employs a non-linear monotonic function for mapping
+ordinals to cardinals while preserving LLM-determined orders. Validation
+against baseline models reveals the superior performance of our approach,
+especially in low-data scenarios, while remaining interpretable.
+
+
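A hedged sketch of the monotonic-mapping idea behind MonotonicLR as described above: ordinal categories, already ranked (e.g., by an LLM), are mapped to strictly increasing learned values via cumulative softplus increments, so the LLM-determined order is preserved by construction. The class name and the surrounding toy usage are assumptions for illustration.

```python
import torch
import torch.nn as nn

class MonotonicOrdinalEncoder(nn.Module):
    """Maps ordinal category indices (already ranked, e.g. by an LLM) to
    strictly increasing learned values, preserving the given order."""
    def __init__(self, n_categories: int):
        super().__init__()
        self.raw = nn.Parameter(torch.zeros(n_categories))

    def forward(self, idx: torch.Tensor) -> torch.Tensor:
        steps = nn.functional.softplus(self.raw)   # positive increments
        values = torch.cumsum(steps, dim=0)        # strictly increasing values
        return values[idx]

# Usage inside a tiny logistic-regression-style model (illustrative).
enc = MonotonicOrdinalEncoder(n_categories=4)      # e.g. education levels ranked by the LLM
linear = nn.Linear(1, 1)
idx = torch.tensor([0, 2, 3, 1])
logits = linear(enc(idx).unsqueeze(-1))
print(torch.sigmoid(logits).detach().squeeze())
```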
+
+ comment: Table Representation Learning Workshop at NeurIPS 2023 +
+
+
+
+
+ + ☆ A novel transformer-based approach for soil temperature prediction + + +
+ Soil temperature is one of the most significant parameters that plays a
+crucial role in glacier energy, mass balance dynamics, surface hydrological
+processes, glacier-atmosphere interaction, nutrient cycling, ecological
+stability, and the management of soil, water, and field crops. In this work,
+we introduce a novel approach using transformer models for the purpose of soil
+temperature forecasting. To the best of our knowledge, this work is the first
+attempt to use transformer models to predict soil temperature. Experiments are
+carried out using six different FLUXNET stations by modeling them with five
+different transformer models, namely, Vanilla Transformer, Informer,
+Autoformer, Reformer, and ETSformer. To demonstrate the effectiveness of the
+proposed model, experimental results are compared with both deep learning
+approaches and studies from the literature. Experimental results show that the
+utilization of transformer models provides a significant contribution to the
+literature, thereby establishing a new state of the art.
+
+
+
+
+
+
+ + ☆ Testing multivariate normality by testing independence + + +
+ We propose a simple multivariate normality test based on Kac-Bernstein's
+characterization, which can be conducted by utilising existing statistical
+independence tests for sums and differences of data samples. We also perform
+an empirical investigation, which reveals that for high-dimensional data, the
+proposed approach may be more efficient than the alternative ones. The
+accompanying code repository is provided at \url{https://shorturl.at/rtuy5}.
+
+
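Kac-Bernstein's characterization states that for independent X and Y, independence of X+Y and X-Y forces X and Y to be Gaussian; turned into a test, one can split the sample into two halves and test whether their sum and difference are independent. Below is a hedged univariate sketch using a permutation test with a distance-covariance statistic; the paper's exact statistic and multivariate handling may differ.

```python
import numpy as np

def distance_covariance(x, y):
    """Sample distance covariance between two 1-D samples (O(n^2), illustrative)."""
    x, y = np.asarray(x, float), np.asarray(y, float)
    a = np.abs(x[:, None] - x[None, :])
    b = np.abs(y[:, None] - y[None, :])
    A = a - a.mean(0) - a.mean(1)[:, None] + a.mean()   # double centering
    B = b - b.mean(0) - b.mean(1)[:, None] + b.mean()
    return np.sqrt(max((A * B).mean(), 0.0))

def normality_pvalue(sample, n_perm=500, seed=0):
    """Kac-Bernstein-style check: split the sample, test independence of sum and difference."""
    rng = np.random.default_rng(seed)
    sample = rng.permutation(np.asarray(sample, float))
    half = len(sample) // 2
    x, y = sample[:half], sample[half:2 * half]
    s, d = x + y, x - y
    stat = distance_covariance(s, d)
    perm_stats = [distance_covariance(s, rng.permutation(d)) for _ in range(n_perm)]
    return float(np.mean([p >= stat for p in perm_stats]))

rng = np.random.default_rng(1)
print("gaussian p-value:", normality_pvalue(rng.normal(size=400)))       # expected: large
print("exponential p-value:", normality_pvalue(rng.exponential(size=400)))  # expected: small
```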
+
+ comment: 6 pages, 1 figure +
+
+
+
+
+ + ☆ A Deep-Genetic Algorithm (Deep-GA) Approach for High-Dimensional + Nonlinear Parabolic Partial Differential Equations + + +
+ We propose a new method, called a deep-genetic algorithm (deep-GA), to +accelerate the performance of the so-called deep-BSDE method, which is a deep +learning algorithm to solve high dimensional partial differential equations +through their corresponding backward stochastic differential equations (BSDEs). +Recognizing the sensitivity of the solver to the initial guess selection, we +embed a genetic algorithm (GA) into the solver to optimize the selection. We +aim to achieve faster convergence for the nonlinear PDEs on a broader interval +than deep-BSDE. Our proposed method is applied to two nonlinear parabolic PDEs, +i.e., the Black-Scholes (BS) equation with default risk and the +Hamilton-Jacobi-Bellman (HJB) equation. We compare the results of our method +with those of the deep-BSDE and show that our method provides comparable +accuracy with significantly improved computational efficiency. + +
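A hedged sketch of the idea of embedding a genetic algorithm to optimize the solver's initial guess: `solver_loss` is a stand-in for running (a few steps of) the deep-BSDE solver from a candidate initial value, and the GA operators shown are generic choices, not the paper's exact ones.

```python
import numpy as np

def solver_loss(y0_guess):
    """Stand-in for the deep-BSDE solver's terminal loss as a function of the
    initial guess; in practice this would run a few training steps of the solver."""
    return (y0_guess - 4.6) ** 2

def genetic_search(loss_fn, low, high, pop_size=20, n_gen=30, mut_sigma=0.3, seed=0):
    rng = np.random.default_rng(seed)
    pop = rng.uniform(low, high, pop_size)
    for _ in range(n_gen):
        fitness = np.array([loss_fn(p) for p in pop])
        parents = pop[np.argsort(fitness)][: pop_size // 2]              # selection
        children = rng.choice(parents, pop_size - len(parents))          # crossover-lite: resample parents
        children = children + rng.normal(0, mut_sigma, len(children))    # mutation
        pop = np.concatenate([parents, np.clip(children, low, high)])
    return pop[np.argmin([loss_fn(p) for p in pop])]

best_y0 = genetic_search(solver_loss, low=0.0, high=10.0)
print("GA-selected initial guess:", round(float(best_y0), 3))
```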
+
+ comment: Accepted for publication in Computers and Mathematics with + Applications, 19 pages, 6 figures +
+
+
+
+
+ + ☆ Replay-enhanced Continual Reinforcement Learning + + +
+ Replaying past experiences has proven to be a highly effective approach for +averting catastrophic forgetting in supervised continual learning. However, +some crucial factors are still largely ignored, making it vulnerable to serious +failure, when used as a solution to forgetting in continual reinforcement +learning, even in the context of perfect memory where all data of previous +tasks are accessible in the current task. On the one hand, since most +reinforcement learning algorithms are not invariant to the reward scale, the +previously well-learned tasks (with high rewards) may appear to be more salient +to the current learning process than the current task (with small initial +rewards). This causes the agent to concentrate on those salient tasks at the +expense of generality on the current task. On the other hand, offline learning +on replayed tasks while learning a new task may induce a distributional shift +between the dataset and the learned policy on old tasks, resulting in +forgetting. In this paper, we introduce RECALL, a replay-enhanced method that +greatly improves the plasticity of existing replay-based methods on new tasks +while effectively avoiding the recurrence of catastrophic forgetting in +continual reinforcement learning. RECALL leverages adaptive normalization on +approximate targets and policy distillation on old tasks to enhance generality +and stability, respectively. Extensive experiments on the Continual World +benchmark show that RECALL performs significantly better than purely perfect +memory replay, and achieves comparable or better overall performance against +state-of-the-art continual learning methods. + +
+
+ comment: Accepted by Transactions on Machine Learning Research 2023 +
+
+
+
+
+ + ☆ Exploring Prompting Large Language Models as Explainable Metrics + + +
+ This paper describes the IUST NLP Lab submission to the Prompting Large
+Language Models as Explainable Metrics Shared Task at the Eval4NLP 2023
+Workshop on Evaluation & Comparison of NLP Systems. We have proposed a
+zero-shot prompt-based strategy for explainable evaluation of the summarization
+task using Large Language Models (LLMs). The conducted experiments demonstrate
+the promising potential of LLMs as evaluation metrics in Natural Language
+Processing (NLP), particularly in the field of summarization. Both few-shot and
+zero-shot approaches are employed in these experiments. Our best prompt
+achieved a Kendall correlation of 0.477 with human evaluations on the text
+summarization task on the test data. Code and results are publicly available
+on GitHub.
+
+
+
+ comment: 9 pages, Eval4NLP 2023 +
+
+
+
+
+ + ☆ Understanding Variation in Subpopulation Susceptibility to Poisoning + Attacks + + +
+ Machine learning is susceptible to poisoning attacks, in which an attacker +controls a small fraction of the training data and chooses that data with the +goal of inducing some behavior unintended by the model developer in the trained +model. We consider a realistic setting in which the adversary with the ability +to insert a limited number of data points attempts to control the model's +behavior on a specific subpopulation. Inspired by previous observations on +disparate effectiveness of random label-flipping attacks on different +subpopulations, we investigate the properties that can impact the effectiveness +of state-of-the-art poisoning attacks against different subpopulations. For a +family of 2-dimensional synthetic datasets, we empirically find that dataset +separability plays a dominant role in subpopulation vulnerability for less +separable datasets. However, well-separated datasets exhibit more dependence on +individual subpopulation properties. We further discover that a crucial +subpopulation property is captured by the difference in loss on the clean +dataset between the clean model and a target model that misclassifies the +subpopulation, and a subpopulation is much easier to attack if the loss +difference is small. This property also generalizes to high-dimensional +benchmark datasets. For the Adult benchmark dataset, we show that we can find +semantically-meaningful subpopulation properties that are related to the +susceptibilities of a selected group of subpopulations. The results in this +paper are accompanied by a fully interactive web-based visualization of +subpopulation poisoning attacks found at +https://uvasrg.github.io/visualizing-poisoning + +
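A hedged sketch of the loss-difference property described above: train a clean model, construct a target model that misclassifies the subpopulation (here simply by flipping its labels before training), and compare their losses on the clean data. The data, subpopulation definition, and models are synthetic illustrations, not the paper's experimental setup.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

rng = np.random.default_rng(0)
X = rng.normal(size=(2000, 5))
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)
subpop = X[:, 2] > 1.2                       # hypothetical subpopulation of interest

clean = LogisticRegression(max_iter=1000).fit(X, y)

# Target model: trained as if the subpopulation's labels were flipped.
y_target = y.copy()
y_target[subpop] = 1 - y_target[subpop]
target = LogisticRegression(max_iter=1000).fit(X, y_target)

# Loss difference on the *clean* data: small values suggest an easier-to-attack subpopulation.
loss_clean = log_loss(y, clean.predict_proba(X))
loss_target = log_loss(y, target.predict_proba(X))
print("loss difference:", round(loss_target - loss_clean, 4))
```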
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ☆ ADAPTER-RL: Adaptation of Any Agent using Reinforcement Learning + + +
+ Deep Reinforcement Learning (DRL) agents frequently face challenges in
+adapting to tasks outside their training distribution, including issues with
+over-fitting, catastrophic forgetting and sample inefficiency. Although the
+application of adapters has proven effective in supervised learning contexts
+such as natural language processing and computer vision, their potential within
+the DRL domain remains largely unexplored. This paper delves into the
+integration of adapters in reinforcement learning, presenting an innovative
+adaptation strategy that demonstrates enhanced training efficiency and
+improvement over the base agent, evaluated experimentally in the nanoRTS
+environment, a real-time strategy (RTS) game simulation. Our proposed universal
+approach is not only compatible with pre-trained neural networks but also with
+rule-based agents, offering a means to integrate human expertise.
+
+
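A hedged PyTorch sketch of the general adapter idea applied to a policy network: a small residual bottleneck is inserted on top of a frozen base trunk, and only the adapter and output head are trained. Sizes, the zero-initialized up-projection, and the stand-in base network are assumptions, not the paper's architecture.

```python
import torch
import torch.nn as nn

class Adapter(nn.Module):
    """Small residual bottleneck inserted into a frozen base policy."""
    def __init__(self, dim: int, bottleneck: int = 16):
        super().__init__()
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)
        nn.init.zeros_(self.up.weight)   # start as an identity mapping
        nn.init.zeros_(self.up.bias)

    def forward(self, h):
        return h + self.up(torch.relu(self.down(h)))

class AdaptedPolicy(nn.Module):
    def __init__(self, base: nn.Module, hidden_dim: int, n_actions: int):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)       # base agent stays frozen
        self.adapter = Adapter(hidden_dim)
        self.head = nn.Linear(hidden_dim, n_actions)

    def forward(self, obs):
        h = self.base(obs)                # features from the pre-trained (or rule-derived) trunk
        return self.head(self.adapter(h))

base = nn.Sequential(nn.Linear(24, 64), nn.ReLU())   # stand-in for a pre-trained trunk
policy = AdaptedPolicy(base, hidden_dim=64, n_actions=6)
trainable = sum(p.numel() for p in policy.parameters() if p.requires_grad)
print("trainable parameters:", trainable)
print(policy(torch.randn(2, 24)).shape)   # -> torch.Size([2, 6])
```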
+
+
+
+
+ + ☆ Optimal Hyperparameter $ε$ for Adaptive Stochastic Optimizers + through Gradient Histograms + + +
+ Optimizers are essential components for successfully training deep neural
+network models. In order to achieve the best performance from such models,
+designers need to carefully choose the optimizer hyperparameters. However, this
+can be a computationally expensive and time-consuming process. Although it is
+known that all optimizer hyperparameters must be tuned for maximum performance,
+there is still a lack of clarity regarding the individual influence of
+lower-priority hyperparameters, including the safeguard factor $\epsilon$ and
+the momentum factor $\beta$, in leading adaptive optimizers (specifically,
+those based on the Adam optimizer). In this manuscript, we introduce a new
+framework based on gradient histograms to analyze and justify important
+attributes of adaptive optimizers, such as their optimal performance and the
+relationships and dependencies among hyperparameters. Furthermore, we propose a
+novel gradient histogram-based algorithm that automatically estimates a reduced
+and accurate search space for the safeguard hyperparameter $\epsilon$, where
+the optimal value can be easily found.
+
+
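A hedged sketch of the gradient-histogram idea: collect per-parameter gradient magnitudes over a few training steps and use their distribution to bound a plausible search range for Adam's safeguard $\epsilon$, which is only influential when it is comparable to typical second-moment scales. The toy model, the percentile rule, and the printed range are illustrative assumptions, not the paper's algorithm.

```python
import numpy as np
import torch
import torch.nn as nn

# Collect per-parameter gradient magnitudes over a few steps (toy model and data).
model = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 1))
loss_fn = nn.MSELoss()
grads = []
for _ in range(50):
    x, y = torch.randn(32, 20), torch.randn(32, 1)
    model.zero_grad()
    loss_fn(model(x), y).backward()
    grads.append(torch.cat([p.grad.abs().flatten() for p in model.parameters()]))
g = torch.cat(grads).numpy()

# Histogram of log10 gradient magnitudes; its bulk suggests a reduced search
# range for epsilon instead of a blind sweep over many orders of magnitude.
logg = np.log10(g[g > 0])
hist, edges = np.histogram(logg, bins=30)
lo, hi = np.percentile(logg, [1, 99])
print("most populated decade: 1e%.1f" % edges[hist.argmax()])
print(f"suggested epsilon search range (illustrative): 1e{lo:.0f} to 1e{hi:.0f}")
```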
+
+
+
+
+ + ☆ Liver Tumor Prediction with Advanced Attention Mechanisms Integrated + into a Depth-Based Variant Search Algorithm + + +
+ In recent years, Deep Learning (DL) techniques have driven a transformation in
+machine learning, artificial intelligence, computer vision, and related fields.
+Consequently, they have been widely adopted in the medical field for predicting
+and monitoring diverse diseases. Liver tumor prediction is a vital task in
+analyzing and treating liver diseases. This paper proposes a novel approach for
+predicting liver tumors using Convolutional Neural Networks (CNN) and a
+depth-based variant search algorithm with advanced attention mechanisms
+(CNN-DS-AM). The proposed work aims to improve accuracy and robustness in
+diagnosing and treating liver diseases. The proposed model is assessed on a
+Computed Tomography (CT) scan dataset containing both benign and malignant
+liver tumors. Advanced attention mechanisms were incorporated into the CNN
+model to enable the identification and highlighting of regions of the CT scans
+most relevant to predicting liver tumors. The results suggest that
+incorporating attention mechanisms and a depth-based variant search algorithm
+into the CNN model is a promising approach for improving the accuracy and
+robustness of liver tumor prediction, and it can assist radiologists in their
+diagnosis and treatment planning. The proposed system achieved a high accuracy
+of 95.5% in predicting liver tumors, outperforming other state-of-the-art
+methods.
+
+
+
+
+
+
+ + ☆ Multi-teacher Distillation for Multilingual Spelling Correction + + +
+ Accurate spelling correction is a critical step in modern search interfaces,
+especially in an era of mobile devices and speech-to-text interfaces. For
+services that are deployed around the world, this poses a significant challenge
+for multilingual NLP: spelling errors need to be caught and corrected in all
+languages, and even in queries that use multiple languages. In this paper, we
+tackle this challenge using multi-teacher distillation. In our approach, a
+monolingual teacher model is trained for each language/locale, and these
+individual models are distilled into a single multilingual student model
+intended to serve all languages/locales. In experiments using open-source data
+as well as user data from a worldwide search service, we show that this leads
+to highly effective spelling correction models that can meet the tight latency
+requirements of deployed services.
+
+
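A hedged sketch of the multi-teacher distillation objective implied above: each training example is matched to the monolingual teacher for its locale, and the student is trained to match that teacher's temperature-softened output distribution. The function signature and the random stand-in logits are assumptions for illustration.

```python
import torch
import torch.nn.functional as F

def multi_teacher_distill_loss(student_logits, teacher_logits_by_locale, locales, T=2.0):
    """KL(student || teacher-of-that-locale), averaged over the batch.

    student_logits: (B, V); teacher_logits_by_locale: dict locale -> (B, V);
    locales: list of length B giving each example's locale.
    """
    losses = []
    for i, loc in enumerate(locales):
        t = F.softmax(teacher_logits_by_locale[loc][i] / T, dim=-1)
        s = F.log_softmax(student_logits[i] / T, dim=-1)
        losses.append(F.kl_div(s, t, reduction="sum") * T * T)
    return torch.stack(losses).mean()

# Toy usage with random logits standing in for real correction models.
B, V = 4, 100
student = torch.randn(B, V, requires_grad=True)
teachers = {"en": torch.randn(B, V), "de": torch.randn(B, V)}
loss = multi_teacher_distill_loss(student, teachers, ["en", "de", "en", "de"])
loss.backward()
print(float(loss))
```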
+
+
+
+
+ + ☆ Token-Level Adversarial Prompt Detection Based on Perplexity Measures + and Contextual Information + + +
+ In recent years, Large Language Models (LLM) have emerged as pivotal tools in +various applications. However, these models are susceptible to adversarial +prompt attacks, where attackers can carefully curate input strings that lead to +undesirable outputs. The inherent vulnerability of LLMs stems from their +input-output mechanisms, especially when presented with intensely +out-of-distribution (OOD) inputs. This paper proposes a token-level detection +method to identify adversarial prompts, leveraging the LLM's capability to +predict the next token's probability. We measure the degree of the model's +perplexity and incorporate neighboring token information to encourage the +detection of contiguous adversarial prompt sequences. As a result, we propose +two methods: one that identifies each token as either being part of an +adversarial prompt or not, and another that estimates the probability of each +token being part of an adversarial prompt. + +
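A hedged sketch of the token-level detection idea: given per-token log-probabilities from any causal LM (obtaining them is omitted here), flag tokens whose neighborhood-averaged surprisal is unusually high, so that contiguous adversarial spans tend to be flagged together. The window size, z-score rule, and synthetic log-probabilities are illustrative assumptions.

```python
import numpy as np

def flag_adversarial_tokens(token_logprobs, window=5, z_thresh=1.5):
    """Flag tokens whose neighborhood-averaged surprisal (-log p) is unusually high.

    token_logprobs: per-token log-probabilities under any causal LM. Averaging
    over a window encourages contiguous adversarial spans to be flagged together,
    mirroring the neighboring-token idea in the abstract.
    """
    surprisal = -np.asarray(token_logprobs, dtype=float)
    pad = window // 2
    padded = np.pad(surprisal, pad, mode="edge")
    smoothed = np.convolve(padded, np.ones(window) / window, mode="valid")
    z = (smoothed - surprisal.mean()) / (surprisal.std() + 1e-8)
    return z > z_thresh

# Toy usage: a fluent prefix followed by a high-surprisal (OOD-looking) suffix.
rng = np.random.default_rng(0)
logprobs = np.concatenate([rng.normal(-3, 0.5, 30),     # ordinary tokens
                           rng.normal(-12, 1.0, 10)])   # adversarial-looking tokens
mask = flag_adversarial_tokens(logprobs)
print("flagged token indices:", np.where(mask)[0])
```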
+
+
+
+
+ + ☆ MultiLoRA: Democratizing LoRA for Better Multi-Task Learning + + +
+ LoRA achieves remarkable resource efficiency and comparable performance when
+adapting LLMs for specific tasks. Since ChatGPT demonstrated superior
+performance on various tasks, there has been a growing desire to adapt one
+model for all tasks. However, the explicit low-rank structure of LoRA limits
+the adaptation performance in complex multi-task scenarios. LoRA updates are
+dominated by a small number of top singular vectors, while fine-tuning
+decomposes into a set of less important unitary transforms. In this paper, we
+propose MultiLoRA for better multi-task adaptation by reducing the dominance of
+the top singular vectors observed in LoRA. MultiLoRA scales LoRA modules
+horizontally and changes the parameter initialization of the adaptation
+matrices to reduce parameter dependency, thus yielding more balanced unitary
+subspaces. We construct specialized training data by mixing datasets for
+instruction following, natural language understanding, and world knowledge, to
+cover semantically and syntactically different samples. With only 2.5% of
+additional parameters, MultiLoRA outperforms single LoRA counterparts and
+fine-tuning on multiple benchmarks and model scales. Further investigation
+into the weight update matrices of MultiLoRA shows reduced dependency on top
+singular vectors and more democratic contributions from the unitary
+transforms.
+
+
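A hedged PyTorch sketch of the horizontal-scaling idea: several parallel low-rank branches, each with its own initialization, are added on top of a frozen linear layer and their contributions summed. The number of branches, ranks, scaling, and initializations are assumptions, not the authors' exact configuration.

```python
import torch
import torch.nn as nn

class MultiLoRALinear(nn.Module):
    """Frozen base linear layer plus several parallel LoRA branches whose
    outputs are summed, an illustrative take on horizontal LoRA scaling."""
    def __init__(self, base: nn.Linear, r: int = 8, n_branches: int = 3, alpha: float = 16.0):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)
        self.A = nn.ParameterList(
            [nn.Parameter(torch.empty(r, base.in_features)) for _ in range(n_branches)])
        self.B = nn.ParameterList(
            [nn.Parameter(torch.zeros(base.out_features, r)) for _ in range(n_branches)])
        for mat in self.A:                     # varied init reduces dependency between branches
            nn.init.kaiming_uniform_(mat, a=5 ** 0.5)
        self.scale = alpha / r

    def forward(self, x):
        out = self.base(x)
        for a, b in zip(self.A, self.B):
            out = out + self.scale * ((x @ a.t()) @ b.t())
        return out

layer = MultiLoRALinear(nn.Linear(64, 64))
print(layer(torch.randn(2, 64)).shape)   # -> torch.Size([2, 64])
```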
+
+
+
+
+ + ☆ Interpretability in Machine Learning: on the Interplay with + Explainability, Predictive Performances and Models + + +
+ Interpretability has recently gained attention in the field of machine
+learning, for it is crucial when it comes to high-stakes decisions or
+troubleshooting. This abstract concept is hard to grasp and has been
+associated, over time, with many labels and preconceived ideas. In this
+position paper, in order to clarify some misunderstandings regarding
+interpretability, we discuss its relationship with significant concepts in
+machine learning: explainability, predictive performances, and machine learning
+models. For instance, we challenge the idea that interpretability and
+explainability are substitutes for one another, or that a fixed degree of
+interpretability can be associated with a given machine learning model.
+
+
+
+
+
+
+ + ☆ An NMF-Based Building Block for Interpretable Neural Networks With + Continual Learning + + +
+ Existing learning methods often struggle to balance interpretability and +predictive performance. While models like nearest neighbors and non-negative +matrix factorization (NMF) offer high interpretability, their predictive +performance on supervised learning tasks is often limited. In contrast, neural +networks based on the multi-layer perceptron (MLP) support the modular +construction of expressive architectures and tend to have better recognition +accuracy but are often regarded as black boxes in terms of interpretability. +Our approach aims to strike a better balance between these two aspects through +the use of a building block based on NMF that incorporates supervised neural +network training methods to achieve high predictive performance while retaining +the desirable interpretability properties of NMF. We evaluate our Predictive +Factorized Coupling (PFC) block on small datasets and show that it achieves +competitive predictive performance with MLPs while also offering improved +interpretability. We demonstrate the benefits of this approach in various +scenarios, such as continual learning, training on non-i.i.d. data, and +knowledge removal after training. Additionally, we show examples of using the +PFC block to build more expressive architectures, including a fully-connected +residual network as well as a factorized recurrent neural network (RNN) that +performs competitively with vanilla RNNs while providing improved +interpretability. The PFC block uses an iterative inference algorithm that +converges to a fixed point, making it possible to trade off accuracy vs +computation after training but also currently preventing its use as a general +MLP replacement in some scenarios such as training on very large datasets. We +provide source code at https://github.com/bkvogel/pfc + +
+
+ comment: 42 pages, 13 figures +
+
+
+
+
+ + ☆ A Multi-Center Study on the Adaptability of a Shared Foundation Model + for Electronic Health Records + + +
+ Foundation models hold promise for transforming AI in healthcare by providing +modular components that are easily adaptable to downstream healthcare tasks, +making AI development more scalable and cost-effective. Structured EHR +foundation models, trained on coded medical records from millions of patients, +demonstrated benefits including increased performance with fewer training +labels, and improved robustness to distribution shifts. However, questions +remain on the feasibility of sharing these models across different hospitals +and their performance for local task adaptation. This multi-center study +examined the adaptability of a recently released structured EHR foundation +model ($FM_{SM}$), trained on longitudinal medical record data from 2.57M +Stanford Medicine patients. Experiments were conducted using EHR data at The +Hospital for Sick Children and MIMIC-IV. We assessed both adaptability via +continued pretraining on local data, and task adaptability compared to +baselines of training models from scratch at each site, including a local +foundation model. We evaluated the performance of these models on 8 clinical +prediction tasks. In both datasets, adapting the off-the-shelf $FM_{SM}$ +matched the performance of GBM models locally trained on all data while +providing a 13% improvement in settings with few task-specific training labels. +With continued pretraining on local data, label efficiency substantially +improved, such that $FM_{SM}$ required fewer than 1% of training examples to +match the fully trained GBM's performance. Continued pretraining was also 60 to +90% more sample-efficient than training local foundation models from scratch. +Our findings show that adapting shared EHR foundation models across hospitals +provides improved prediction performance at less cost, underscoring the utility +of base foundation models as modular components to streamline the development +of healthcare AI. + +
+
+ comment: 41 pages, 3 figures, 2 tables, 16 appendices +
+
+
+
+
+ + ☆ Gaussian Interpolation Flows + + +
+ Gaussian denoising has emerged as a powerful principle for constructing +simulation-free continuous normalizing flows for generative modeling. Despite +their empirical successes, theoretical properties of these flows and the +regularizing effect of Gaussian denoising have remained largely unexplored. In +this work, we aim to address this gap by investigating the well-posedness of +simulation-free continuous normalizing flows built on Gaussian denoising. +Through a unified framework termed Gaussian interpolation flow, we establish +the Lipschitz regularity of the flow velocity field, the existence and +uniqueness of the flow, and the Lipschitz continuity of the flow map and the +time-reversed flow map for several rich classes of target distributions. This +analysis also sheds light on the auto-encoding and cycle-consistency properties +of Gaussian interpolation flows. Additionally, we delve into the stability of +these flows in source distributions and perturbations of the velocity field, +using the quadratic Wasserstein distance as a metric. Our findings offer +valuable insights into the learning techniques employed in Gaussian +interpolation flows for generative modeling, providing a solid theoretical +foundation for end-to-end error analyses of learning GIFs with empirical +observations. + +
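For orientation, a generic Gaussian interpolation between a standard Gaussian source and the data can be written as below (a common stochastic-interpolant-style parametrization offered only as a hedged sketch; the paper's exact schedules and assumptions may differ).

```latex
% Gaussian interpolation between a standard Gaussian Z and data X_1
% (illustrative parametrization):
X_t = \alpha_t X_1 + \beta_t Z, \qquad Z \sim \mathcal{N}(0, I_d), \quad t \in [0, 1],
% with \alpha_0 = 0,\ \alpha_1 = 1,\ \beta_0 = 1,\ \beta_1 = 0.
% The flow map \varphi_t transports the source along the velocity field
v(t, x) = \mathbb{E}\!\left[\dot{\alpha}_t X_1 + \dot{\beta}_t Z \,\middle|\, X_t = x\right],
\qquad \frac{\mathrm{d}}{\mathrm{d}t}\,\varphi_t(x) = v\bigl(t, \varphi_t(x)\bigr), \quad \varphi_0(x) = x .
```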
+
+ comment: 49 pages, 4 figures +
+
+
+
+
+ + ☆ CSGNN: Conquering Noisy Node labels via Dynamic Class-wise Selection + + +
+ Graph Neural Networks (GNNs) have emerged as a powerful tool for +representation learning on graphs, but they often suffer from overfitting and +label noise issues, especially when the data is scarce or imbalanced. Different +from the paradigm of previous methods that rely on single-node confidence, in +this paper, we introduce a novel Class-wise Selection for Graph Neural +Networks, dubbed CSGNN, which employs a neighbor-aggregated latent space to +adaptively select reliable nodes across different classes. Specifically, 1) to +tackle the class imbalance issue, we introduce a dynamic class-wise selection +mechanism, leveraging the clustering technique to identify clean nodes based on +the neighbor-aggregated confidences. In this way, our approach can avoid the +pitfalls of biased sampling which is common with global threshold techniques. +2) To alleviate the problem of noisy labels, built on the concept of the +memorization effect, CSGNN prioritizes learning from clean nodes before noisy +ones, thereby iteratively enhancing model performance while mitigating label +noise. Through extensive experiments, we demonstrate that CSGNN outperforms +state-of-the-art methods in terms of both effectiveness and robustness. + +
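A hedged sketch of class-wise clean-node selection as described above: prediction confidences are aggregated over each node's neighborhood, and within every class the nodes are split into two clusters by aggregated confidence, keeping the higher-confidence cluster as likely clean. The dense adjacency, KMeans clustering, and synthetic inputs are illustrative assumptions, not the paper's exact procedure.

```python
import numpy as np
from sklearn.cluster import KMeans

def classwise_clean_selection(probs, labels, adj):
    """Select likely-clean nodes per class using neighbor-aggregated confidences.

    probs: (N, C) softmax outputs; labels: (N,) possibly noisy labels;
    adj: (N, N) binary adjacency (dense here for simplicity).
    """
    deg = adj.sum(1, keepdims=True) + 1.0
    agg = (adj @ probs + probs) / deg                  # mean over self + neighbors
    conf = agg[np.arange(len(labels)), labels]         # confidence in the given label
    clean = np.zeros(len(labels), dtype=bool)
    for c in np.unique(labels):
        idx = np.where(labels == c)[0]
        if len(idx) < 2:
            clean[idx] = True
            continue
        km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(conf[idx].reshape(-1, 1))
        keep = km.labels_ == km.cluster_centers_.ravel().argmax()
        clean[idx[keep]] = True                        # keep the higher-confidence cluster
    return clean

# Toy usage with a random graph and random predictions.
rng = np.random.default_rng(0)
N, C = 200, 3
adj = (rng.random((N, N)) < 0.05).astype(float)
np.fill_diagonal(adj, 0)
adj = np.maximum(adj, adj.T)
probs = rng.dirichlet(np.ones(C), size=N)
labels = rng.integers(0, C, N)
print("selected clean nodes:", int(classwise_clean_selection(probs, labels, adj).sum()))
```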
+
+
+
+
+ + ☆ Towards a Post-Market Monitoring Framework for Machine Learning-based + Medical Devices: A case study + + +
+ After a machine learning (ML)-based system is deployed in clinical practice, +performance monitoring is important to ensure the safety and effectiveness of +the algorithm over time. The goal of this work is to highlight the complexity +of designing a monitoring strategy and the need for a systematic framework that +compares the multitude of monitoring options. One of the main decisions is +choosing between using real-world (observational) versus interventional data. +Although the former is the most convenient source of monitoring data, it +exhibits well-known biases, such as confounding, selection, and missingness. In +fact, when the ML algorithm interacts with its environment, the algorithm +itself may be a primary source of bias. On the other hand, a carefully designed +interventional study that randomizes individuals can explicitly eliminate such +biases, but the ethics, feasibility, and cost of such an approach must be +carefully considered. Beyond the decision of the data source, monitoring +strategies vary in the performance criteria they track, the interpretability of +the test statistics, the strength of their assumptions, and their speed at +detecting performance decay. As a first step towards developing a framework +that compares the various monitoring options, we consider a case study of an +ML-based risk prediction algorithm for postoperative nausea and vomiting +(PONV). Bringing together tools from causal inference and statistical process +control, we walk through the basic steps of defining candidate monitoring +criteria, describing potential sources of bias and the causal model, and +specifying and comparing candidate monitoring procedures. We hypothesize that +these steps can be applied more generally, as causal inference can address +other sources of biases as well. + +
+
+
+
+
+ + ♻ ☆ Open-Ended Instructable Embodied Agents with Memory-Augmented Large + Language Models + + +
+ Pre-trained and frozen large language models (LLMs) can effectively map +simple scene rearrangement instructions to programs over a robot's visuomotor +functions through appropriate few-shot example prompting. To parse open-domain +natural language and adapt to a user's idiosyncratic procedures, not known +during prompt engineering time, fixed prompts fall short. In this paper, we +introduce HELPER, an embodied agent equipped with an external memory of +language-program pairs that parses free-form human-robot dialogue into action +programs through retrieval-augmented LLM prompting: relevant memories are +retrieved based on the current dialogue, instruction, correction, or VLM +description, and used as in-context prompt examples for LLM querying. The +memory is expanded during deployment to include pairs of user's language and +action plans, to assist future inferences and personalize them to the user's +language and routines. HELPER sets a new state-of-the-art in the TEACh +benchmark in both Execution from Dialog History (EDH) and Trajectory from +Dialogue (TfD), with a 1.7x improvement over the previous state-of-the-art for +TfD. Our models, code, and video results can be found in our project's website: +https://helper-agent-llm.github.io. + +
+
+ comment: Project page with code & videos: https://helper-agent-llm.github.io +
+
+
+
+
+ + ♻ ☆ ERUDITE: Human-in-the-Loop IoT for an Adaptive Personalized Learning + System + + +
+ Thanks to the rapid growth in wearable technologies and recent advancement in +machine learning and signal processing, monitoring complex human contexts +becomes feasible, paving the way to develop human-in-the-loop IoT systems that +naturally evolve to adapt to the human and environment state autonomously. +Nevertheless, a central challenge in designing many of these IoT systems arises +from the requirement to infer the human mental state, such as intention, +stress, cognition load, or learning ability. While different human contexts can +be inferred from the fusion of different sensor modalities that can correlate +to a particular mental state, the human brain provides a richer sensor modality +that gives us more insights into the required human context. This paper +proposes ERUDITE, a human-in-the-loop IoT system for the learning environment +that exploits recent wearable neurotechnology to decode brain signals. Through +insights from concept learning theory, ERUDITE can infer the human state of +learning and understand when human learning increases or declines. By +quantifying human learning as an input sensory signal, ERUDITE can provide +adequate personalized feedback to humans in a learning environment to enhance +their learning experience. ERUDITE is evaluated across $15$ participants and +showed that by using the brain signals as a sensor modality to infer the human +learning state and providing personalized adaptation to the learning +environment, the participants' learning performance increased on average by +$26\%$. Furthermore, we showed that ERUDITE can be deployed on an edge-based +prototype to evaluate its practicality and scalability. + +
+
+ comment: It is under review in the IEEE IoT journal +
+
+
+
+
+ + ♻ ☆ Learning Task Embeddings for Teamwork Adaptation in Multi-Agent + Reinforcement Learning NeurIPS 2023 + + +
+ Successful deployment of multi-agent reinforcement learning often requires +agents to adapt their behaviour. In this work, we discuss the problem of +teamwork adaptation in which a team of agents needs to adapt their policies to +solve novel tasks with limited fine-tuning. Motivated by the intuition that +agents need to be able to identify and distinguish tasks in order to adapt +their behaviour to the current task, we propose to learn multi-agent task +embeddings (MATE). These task embeddings are trained using an encoder-decoder +architecture optimised for reconstruction of the transition and reward +functions which uniquely identify tasks. We show that a team of agents is able +to adapt to novel tasks when provided with task embeddings. We propose three +MATE training paradigms: independent MATE, centralised MATE, and mixed MATE +which vary in the information used for the task encoding. We show that the +embeddings learned by MATE identify tasks and provide useful information which +agents leverage during adaptation to novel tasks. + +
+
+ comment: To be presented at the Seventh Workshop on Generalization in Planning + at the NeurIPS 2023 conference +
+
+
+
+
+ + ♻ ☆ Meta-Path Learning for Multi-relational Graph Neural Networks + + +
+ Existing multi-relational graph neural networks use one of two strategies for
+identifying informative relations: either they reduce this problem to low-level
+weight learning, or they rely on handcrafted chains of relational dependencies,
+called meta-paths. However, the former approach faces challenges in the
+presence of many relations (e.g., knowledge graphs), while the latter requires
+substantial domain expertise to identify relevant meta-paths. In this work we
+propose a novel approach to learn meta-paths and meta-path GNNs that are highly
+accurate based on a small number of informative meta-paths. A key element of
+our approach is a scoring function for measuring the potential informativeness
+of a relation in the incremental construction of the meta-path. Our
+experimental evaluation shows that the approach manages to correctly identify
+relevant meta-paths even with a large number of relations, and substantially
+outperforms existing multi-relational GNNs on synthetic and real-world
+experiments.
+
+
+
+
+
+
+ + ♻ ☆ Infinite Width Graph Neural Networks for Node Regression/ Classification + + +
+ This work analyzes Graph Neural Networks, a generalization of Fully-Connected
+Deep Neural Nets on graph-structured data, when their width, that is, the
+number of nodes in each fully-connected layer, increases to infinity.
+Infinite-width neural networks connect Deep Learning to Gaussian Processes and
+Kernels, both machine learning frameworks with long traditions and extensive
+theoretical foundations. Gaussian Processes and Kernels have far fewer
+hyperparameters than neural networks and can be used for uncertainty
+estimation, making them more user friendly for applications. This work extends
+the growing body of research connecting Gaussian Processes and Kernels to
+Neural Networks. The Kernel and Gaussian Process closed forms are derived for a
+variety of architectures, namely the standard Graph Neural Network, the Graph
+Neural Network with Skip-Concatenate Connections and the Graph Attention Neural
+Network. All architectures are evaluated on a variety of datasets on the task
+of transductive Node Regression and Classification. Additionally, a Spectral
+Sparsification method known as Effective Resistance is used to improve runtime
+and memory requirements. Extending the setting to inductive graph learning
+tasks (Graph Regression/Classification) is straightforward and is briefly
+discussed in Section 3.5.
+
+
+
+ comment: 49 Pages, 2 Figures (with subfigures), multiple tables, v2: made + table of contents fit to one page and added derivatives on GAT*NTK and GAT*GP + in A.4, v3: shorten parts of introduction and fixed typos, added numberings + to equations and discussion section, v4: fix two missing citations on page 10 +
+
+
+
+
+ + ♻ Let the Flows Tell: Solving Graph Combinatorial Optimization Problems + with GFlowNets NeurIPS 2023 + + +
+ Combinatorial optimization (CO) problems are often NP-hard and thus out of +reach for exact algorithms, making them a tempting domain to apply machine +learning methods. The highly structured constraints in these problems can +hinder either optimization or sampling directly in the solution space. On the +other hand, GFlowNets have recently emerged as a powerful machinery to +efficiently sample from composite unnormalized densities sequentially and have +the potential to amortize such solution-searching processes in CO, as well as +generate diverse solution candidates. In this paper, we design Markov decision +processes (MDPs) for different combinatorial problems and propose to train +conditional GFlowNets to sample from the solution space. Efficient training +techniques are also developed to benefit long-range credit assignment. Through +extensive experiments on a variety of different CO tasks with synthetic and +realistic data, we demonstrate that GFlowNet policies can efficiently find +high-quality solutions. Our implementation is open-sourced at +https://github.com/zdhNarsil/GFlowNet-CombOpt. + +
+
+ comment: Accepted by NeurIPS 2023 as spotlight +
+
+
+
+
+ + ♻ ☆ Sustainable Concrete via Bayesian Optimization NeurIPS 2023 + + +
+ Eight percent of global carbon dioxide emissions can be attributed to the +production of cement, the main component of concrete, which is also the +dominant source of CO2 emissions in the construction of data centers. The +discovery of lower-carbon concrete formulae is therefore of high significance +for sustainability. However, experimenting with new concrete formulae is time +consuming and labor intensive, as one usually has to wait to record the +concrete's 28-day compressive strength, a quantity whose measurement can by its +definition not be accelerated. This provides an opportunity for experimental +design methodology like Bayesian Optimization (BO) to accelerate the search for +strong and sustainable concrete formulae. Herein, we 1) propose modeling steps +that make concrete strength amenable to be predicted accurately by a Gaussian +process model with relatively few measurements, 2) formulate the search for +sustainable concrete as a multi-objective optimization problem, and 3) leverage +the proposed model to carry out multi-objective BO with real-world strength +measurements of the algorithmically proposed mixes. Our experimental results +show improved trade-offs between the mixtures' global warming potential (GWP) +and their associated compressive strengths, compared to mixes based on current +industry practices. Our methods are open-sourced at +github.com/facebookresearch/SustainableConcrete. + +
+
+ comment: NeurIPS 2023 Workshop on Adaptive Experimental Design and Active + Learning in the Real World +
+
+
+
+
+ + ♻ ☆ Balancing stability and plasticity in continual learning: the + readout-decomposition of activation change (RDAC) framework + + +
+ Continual learning (CL) algorithms strive to acquire new knowledge while +preserving prior information. However, this stability-plasticity trade-off +remains a central challenge. This paper introduces a framework that dissects +this trade-off, offering valuable insights into CL algorithms. The +Readout-Decomposition of Activation Change (RDAC) framework first addresses the +stability-plasticity dilemma and its relation to catastrophic forgetting. It +relates learning-induced activation changes in the range of prior readouts to +the degree of stability and changes in the null space to the degree of +plasticity. In deep non-linear networks tackling split-CIFAR-110 tasks, the +framework clarifies the stability-plasticity trade-offs of the popular +regularization algorithms Synaptic intelligence (SI), Elastic-weight +consolidation (EWC), and learning without Forgetting (LwF), and replay-based +algorithms Gradient episodic memory (GEM), and data replay. GEM and data replay +preserved stability and plasticity, while SI, EWC, and LwF traded off +plasticity for stability. The inability of the regularization algorithms to +maintain plasticity was linked to them restricting the change of activations in +the null space of the prior readout. Additionally, for one-hidden-layer linear +neural networks, we derived a gradient decomposition algorithm to restrict +activation change only in the range of the prior readouts, to maintain high +stability while not further sacrificing plasticity. Results demonstrate that +the algorithm maintained stability without significant plasticity loss. The +RDAC framework informs the behavior of existing CL algorithms and paves the way +for novel CL approaches. Finally, it sheds light on the connection between +learning-induced activation/representation changes and the stability-plasticity +dilemma, also offering insights into representational drift in biological +systems. + +
+
+ comment: 15 pages, 5 figures, Revision +
+
+
+
+
+ + ♻ ☆ JaxMARL: Multi-Agent RL Environments in JAX + + +
+ Benchmarks play an important role in the development of machine learning +algorithms. For example, research in reinforcement learning (RL) has been +heavily influenced by available environments and benchmarks. However, RL +environments are traditionally run on the CPU, limiting their scalability with +typical academic compute. Recent advancements in JAX have enabled the wider use +of hardware acceleration to overcome these computational hurdles, enabling +massively parallel RL training pipelines and environments. This is particularly +useful for multi-agent reinforcement learning (MARL) research. First of all, +multiple agents must be considered at each environment step, adding +computational burden, and secondly, the sample complexity is increased due to +non-stationarity, decentralised partial observability, or other MARL +challenges. In this paper, we present JaxMARL, the first open-source code base +that combines ease-of-use with GPU enabled efficiency, and supports a large +number of commonly used MARL environments as well as popular baseline +algorithms. When considering wall clock time, our experiments show that per-run +our JAX-based training pipeline is up to 12500x faster than existing +approaches. This enables efficient and thorough evaluations, with the potential +to alleviate the evaluation crisis of the field. We also introduce and +benchmark SMAX, a vectorised, simplified version of the popular StarCraft +Multi-Agent Challenge, which removes the need to run the StarCraft II game +engine. This not only enables GPU acceleration, but also provides a more +flexible MARL environment, unlocking the potential for self-play, +meta-learning, and other future applications in MARL. We provide code at +https://github.com/flairox/jaxmarl. + +
+
+
+
+
+ + ♻ ☆ Reward Teaching for Federated Multi-armed Bandits + + +
+ Most of the existing federated multi-armed bandits (FMAB) designs are based +on the presumption that clients will implement the specified design to +collaborate with the server. In reality, however, it may not be possible to +modify the clients' existing protocols. To address this challenge, this work +focuses on clients who always maximize their individual cumulative rewards, and +introduces a novel idea of ``reward teaching'', where the server guides the +clients towards global optimality through implicit local reward adjustments. +Under this framework, the server faces two tightly coupled tasks of bandit +learning and target teaching, whose combination is non-trivial and challenging. +A phased approach, called Teaching-After-Learning (TAL), is first designed to +encourage and discourage clients' explorations separately. General performance +analyses of TAL are established when the clients' strategies satisfy certain +mild requirements. With novel technical approaches developed to analyze the +warm-start behaviors of bandit algorithms, particularized guarantees of TAL +with clients running UCB or epsilon-greedy strategies are then obtained. These +results demonstrate that TAL achieves logarithmic regrets while only incurring +logarithmic adjustment costs, which is order-optimal w.r.t. a natural lower +bound. As a further extension, the Teaching-While-Learning (TWL) algorithm is +developed with the idea of successive arm elimination to break the non-adaptive +phase separation in TAL. Rigorous analyses demonstrate that when facing clients +with UCB1, TWL outperforms TAL in terms of the dependencies on sub-optimality +gaps thanks to its adaptive design. Experimental results demonstrate the +effectiveness and generality of the proposed algorithms. + +
+
+ comment: Accepted to IEEE Transactions on Signal Processing +
+
+
+
+
+ + ♻ ☆ SURF: A Generalization Benchmark for GNNs Predicting Fluid Dynamics + + +
+ Simulating fluid dynamics is crucial for the design and development process, +ranging from simple valves to complex turbomachinery. Accurately solving the +underlying physical equations is computationally expensive. Therefore, +learning-based solvers that model interactions on meshes have gained interest +due to their promising speed-ups. However, it is unknown to what extent these +models truly understand the underlying physical principles and can generalize +rather than interpolate. Generalization is a key requirement for a +general-purpose fluid simulator, which should adapt to different topologies, +resolutions, or thermodynamic ranges. We propose SURF, a benchmark designed to +test the $\textit{generalization}$ of learned graph-based fluid simulators. +SURF comprises individual datasets and provides specific performance and +generalization metrics for evaluating and comparing different models. We +empirically demonstrate the applicability of SURF by thoroughly investigating +the two state-of-the-art graph-based models, yielding new insights into their +generalization. + +
+
+ comment: Accepted at LoG 2023, Learning on Graphs Conference +
+
+
+
+
+ + ♻ ☆ Towards a Transportable Causal Network Model Based on Observational + Healthcare Data + + +
+ Over the last decades, many prognostic models based on artificial +intelligence techniques have been used to provide detailed predictions in +healthcare. Unfortunately, the real-world observational data used to train and +validate these models are almost always affected by biases that can strongly +impact the outcomes validity: two examples are values missing not-at-random and +selection bias. Addressing them is a key element in achieving transportability +and in studying the causal relationships that are critical in clinical decision +making, going beyond simpler statistical approaches based on probabilistic +association. + In this context, we propose a novel approach that combines selection +diagrams, missingness graphs, causal discovery and prior knowledge into a +single graphical model to estimate the cardiovascular risk of adolescent and +young females who survived breast cancer. We learn this model from data +comprising two different cohorts of patients. The resulting causal network +model is validated by expert clinicians in terms of risk assessment, accuracy +and explainability, and provides a prognostic model that outperforms competing +machine learning methods. + +
+
+
+
+
+ + ♻ ☆ Structural Node Embeddings with Homomorphism Counts + + +
+ Graph homomorphism counts, first explored by Lov\'asz in 1967, have recently +garnered interest as a powerful tool in graph-based machine learning. Grohe +(PODS 2020) proposed the theoretical foundations for using homomorphism counts +in machine learning on graph level as well as node level tasks. By their very +nature, these capture local structural information, which enables the creation +of robust structural embeddings. While a first approach for graph level tasks +has been made by Nguyen and Maehara (ICML 2020), we experimentally show the +effectiveness of homomorphism count based node embeddings. Enriched with node +labels, node weights, and edge weights, these offer an interpretable +representation of graph data, allowing for enhanced explainability of machine +learning models. + We propose a theoretical framework for isomorphism-invariant homomorphism +count based embeddings which lend themselves to a wide variety of downstream +tasks. Our approach capitalises on the efficient computability of graph +homomorphism counts for bounded treewidth graph classes, rendering it a +practical solution for real-world applications. We demonstrate their +expressivity through experiments on benchmark datasets. Although our results do +not match the accuracy of state-of-the-art neural architectures, they are +comparable to other advanced graph learning models. Remarkably, our approach +demarcates itself by ensuring explainability for each individual feature. By +integrating interpretable machine learning algorithms like SVMs or Random +Forests, we establish a seamless, end-to-end explainable pipeline. Our study +contributes to the advancement of graph-based techniques that offer both +performance and interpretability. + +
+
+
+
+
+ + ♻ ☆ SynthEnsemble: A Fusion of CNN, Vision Transformer, and Hybrid Models + for Multi-Label Chest X-Ray Classification + + +
+ Chest X-rays are widely used to diagnose thoracic diseases, but the lack of +detailed information about these abnormalities makes it challenging to develop +accurate automated diagnosis systems, which is crucial for early detection and +effective treatment. To address this challenge, we employed deep learning +techniques to identify patterns in chest X-rays that correspond to different +diseases. We conducted experiments on the "ChestX-ray14" dataset using various +pre-trained CNNs, transformers, hybrid(CNN+Transformer) models and classical +models. The best individual model was the CoAtNet, which achieved an area under +the receiver operating characteristic curve (AUROC) of 84.2%. By combining the +predictions of all trained models using a weighted average ensemble where the +weight of each model was determined using differential evolution, we further +improved the AUROC to 85.4%, outperforming other state-of-the-art methods in +this field. Our findings demonstrate the potential of deep learning techniques, +particularly ensemble deep learning, for improving the accuracy of automatic +diagnosis of thoracic diseases from chest X-rays. + +
+
+ comment: Accepted in International Conference on Computer and Information + Technology (ICCIT) 2023 +
+
+
+
+
+ + ♻ ☆ Lag-Llama: Towards Foundation Models for Time Series Forecasting NeurIPS 2023 + + +
+ Aiming to build foundation models for time-series forecasting and study their +scaling behavior, we present here our work-in-progress on Lag-Llama, a +general-purpose univariate probabilistic time-series forecasting model trained +on a large collection of time-series data. The model shows good zero-shot +prediction capabilities on unseen "out-of-distribution" time-series datasets, +outperforming supervised baselines. We use smoothly broken power-laws to fit +and predict model scaling behavior. The open source code is made available at +https://github.com/kashif/pytorch-transformer-ts. + +
+
+ comment: Preliminary Draft. Accepted at NeurIPS 2023 R0-FoMo Workshop. Full + paper coming soon with comprehensive results and open-source model + checkpoints +
+
+
+
+
+ + ♻ ☆ Channel and Gradient-Importance Aware Device Scheduling for Over-the-Air + Federated Learning + + +
+ Federated learning (FL) is a popular privacy-preserving distributed training +scheme, where multiple devices collaborate to train machine learning models by +uploading local model updates. To improve communication efficiency, +over-the-air computation (AirComp) has been applied to FL, which leverages +analog modulation to harness the superposition property of radio waves such +that numerous devices can upload their model updates concurrently for +aggregation. However, the uplink channel noise incurs considerable model +aggregation distortion, which is critically determined by the device scheduling +and compromises the learned model performance. In this paper, we propose a +probabilistic device scheduling framework for over-the-air FL, named PO-FL, to +mitigate the negative impact of channel noise, where each device is scheduled +according to a certain probability and its model update is reweighted using +this probability in aggregation. We prove the unbiasedness of this aggregation +scheme and demonstrate the convergence of PO-FL on both convex and non-convex +loss functions. Our convergence bounds unveil that the device scheduling +affects the learning performance through the communication distortion and +global update variance. Based on the convergence analysis, we further develop a +channel and gradient-importance aware algorithm to optimize the device +scheduling probabilities in PO-FL. Extensive simulation results show that the +proposed PO-FL framework with channel and gradient-importance awareness +achieves faster convergence and produces better models than baseline methods. + +
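The unbiasedness of probabilistic scheduling can be sketched directly: if device i participates with probability p_i and its update is reweighted by 1/p_i, the expected aggregate equals the full-participation average. Below is a minimal numpy sketch under that assumption (uniform data weights, arbitrary probabilities; not the paper's full PO-FL algorithm with channel and gradient-importance awareness).

```python
import numpy as np

def po_fl_aggregate(updates, probs, rng):
    """Schedule device i with probability probs[i]; reweight its update by 1/probs[i].

    E[aggregate] = sum_i probs[i] * (update_i / (probs[i] * n)) = mean of all updates,
    so the aggregation stays unbiased despite partial participation.
    """
    n = len(updates)
    scheduled = rng.random(n) < probs
    agg = np.zeros_like(updates[0])
    for i in range(n):
        if scheduled[i]:
            agg += updates[i] / (probs[i] * n)   # uniform data weights 1/n for simplicity
    return agg, scheduled

rng = np.random.default_rng(0)
updates = [rng.normal(size=10) for _ in range(20)]
probs = np.full(20, 0.5)

# Check unbiasedness empirically: the average over many rounds approaches the full mean.
full_mean = np.mean(updates, axis=0)
rounds = [po_fl_aggregate(updates, probs, rng)[0] for _ in range(2000)]
print("max deviation from full mean:", np.abs(np.mean(rounds, axis=0) - full_mean).max())
```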
+
+
+
+
+ + ♻ ☆ Text-to-SQL Empowered by Large Language Models: A Benchmark Evaluation + + +
+ Large language models (LLMs) have emerged as a new paradigm for Text-to-SQL +task. However, the absence of a systematical benchmark inhibits the development +of designing effective, efficient and economic LLM-based Text-to-SQL solutions. +To address this challenge, in this paper, we first conduct a systematical and +extensive comparison over existing prompt engineering methods, including +question representation, example selection and example organization, and with +these experimental results, we elaborate their pros and cons. Based on these +findings, we propose a new integrated solution, named DAIL-SQL, which refreshes +the Spider leaderboard with 86.6% execution accuracy and sets a new bar. To +explore the potential of open-source LLM, we investigate them in various +scenarios, and further enhance their performance with supervised fine-tuning. +Our explorations highlight open-source LLMs' potential in Text-to-SQL, as well +as the advantages and disadvantages of the supervised fine-tuning. +Additionally, towards an efficient and economic LLM-based Text-to-SQL solution, +we emphasize the token efficiency in prompt engineering and compare the prior +studies under this metric. We hope that our work provides a deeper +understanding of Text-to-SQL with LLMs, and inspires further investigations and +broad applications. + +
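For concreteness, the kind of prompt assembly such benchmarks compare (question representation, example selection, example organization) can be sketched as below; the template, comment-style question representation, and example list are placeholders, not DAIL-SQL's exact format.

```python
def build_text2sql_prompt(schema: str, question: str, examples: list[tuple[str, str]]) -> str:
    """Assemble a prompt: schema + a few (question, SQL) demonstrations + target question.

    How the question is represented, how examples are selected (e.g., by question
    and query similarity), and how they are organized is exactly what such
    benchmarks compare; this template is only a placeholder.
    """
    parts = ["/* Database schema */", schema, ""]
    for q, sql in examples:
        parts += [f"/* Question: {q} */", f"{sql};", ""]
    parts += [f"/* Question: {question} */", "SELECT"]
    return "\n".join(parts)

schema = "CREATE TABLE singer(singer_id INT, name TEXT, age INT);"
examples = [("How many singers are there?", "SELECT count(*) FROM singer")]
print(build_text2sql_prompt(schema, "List names of singers older than 30.", examples))
```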
+
+ comment: We have released code on https://github.com/BeachWang/DAIL-SQL +
+
+
+
+
+ + ♻ ☆ A Variational Autoencoder for Heterogeneous Temporal and Longitudinal + Data + + +
+ The variational autoencoder (VAE) is a popular deep latent variable model
+used to analyse high-dimensional datasets by learning a low-dimensional
+latent representation of the data. It simultaneously learns a generative
+model and an inference network to perform approximate posterior inference.
+Recently proposed extensions to VAEs that can handle temporal and
+longitudinal data have applications in healthcare, behavioural modelling, and
+predictive maintenance. However, these extensions do not account for
+heterogeneous data (i.e., data comprising continuous and discrete
+attributes), which is common in many real-life applications. In this work, we
+propose the heterogeneous longitudinal VAE (HL-VAE) that extends the existing
+temporal and longitudinal VAEs to heterogeneous data. HL-VAE provides
+efficient inference for high-dimensional datasets and includes likelihood
+models for continuous, count, categorical, and ordinal data while accounting
+for missing observations. We demonstrate our model's efficacy through
+simulated as well as clinical datasets, and show that our proposed model
+achieves competitive performance in missing value imputation and predictive
+accuracy. + +
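As a rough illustration of what a heterogeneous likelihood model looks like, below is a sketch of a decoder with a Gaussian head for continuous attributes, a Poisson head for counts, and a categorical head for discrete attributes (the ordinal case is omitted). The architecture and dimensions are invented and this is not the HL-VAE implementation.

# Sketch of a heterogeneous decoder: one latent code, separate likelihood heads.
# Dimensions and architecture are invented for illustration only.
import torch
import torch.nn as nn
import torch.distributions as D

class HeterogeneousDecoder(nn.Module):
    def __init__(self, latent_dim=8, n_continuous=3, n_categories=5):
        super().__init__()
        self.backbone = nn.Sequential(nn.Linear(latent_dim, 32), nn.ReLU())
        self.cont_mean = nn.Linear(32, n_continuous)
        self.cont_logstd = nn.Linear(32, n_continuous)
        self.count_lograte = nn.Linear(32, 1)
        self.cat_logits = nn.Linear(32, n_categories)

    def forward(self, z):
        h = self.backbone(z)
        return {
            "continuous": D.Normal(self.cont_mean(h), self.cont_logstd(h).exp()),
            "count": D.Poisson(self.count_lograte(h).exp()),
            "categorical": D.Categorical(logits=self.cat_logits(h)),
        }

decoder = HeterogeneousDecoder()
z = torch.randn(4, 8)                       # latent codes for a batch of 4
dists = decoder(z)
x_cont = torch.randn(4, 3)
x_count = torch.randint(0, 10, (4, 1)).float()
x_cat = torch.randint(0, 5, (4,))
log_lik = (dists["continuous"].log_prob(x_cont).sum(-1)
           + dists["count"].log_prob(x_count).squeeze(-1)
           + dists["categorical"].log_prob(x_cat))
print(log_lik.shape)                        # per-sample reconstruction log-likelihood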
+
+
+
+
+ + ♻ ☆ HEALNet -- Hybrid Multi-Modal Fusion for Heterogeneous Biomedical Data + + +
+ Technological advances in medical data collection such as high-resolution +histopathology and high-throughput genomic sequencing have contributed to the +rising requirement for multi-modal biomedical modelling, specifically for +image, tabular, and graph data. Most multi-modal deep learning approaches use +modality-specific architectures that are trained separately and cannot capture +the crucial cross-modal information that motivates the integration of different +data sources. This paper presents the Hybrid Early-fusion Attention Learning +Network (HEALNet): a flexible multi-modal fusion architecture, which a) +preserves modality-specific structural information, b) captures the cross-modal +interactions and structural information in a shared latent space, c) can +effectively handle missing modalities during training and inference, and d) +enables intuitive model inspection by learning on the raw data input instead of +opaque embeddings. We conduct multi-modal survival analysis on Whole Slide +Images and Multi-omic data on four cancer cohorts of The Cancer Genome Atlas +(TCGA). HEALNet achieves state-of-the-art performance, substantially improving +over both uni-modal and recent multi-modal baselines, whilst being robust in +scenarios with missing modalities. + +
+
+ comment: 7 pages body, 5 pages appendix +
+
+
+
+
+ + ♻ ☆ Efficient learning of nonlinear prediction models with time-series + privileged information + + +
+ In domains where sample sizes are limited, efficient learning algorithms are +critical. Learning using privileged information (LuPI) offers increased sample +efficiency by allowing prediction models access to auxiliary information at +training time which is unavailable when the models are used. In recent work, it +was shown that for prediction in linear-Gaussian dynamical systems, a LuPI +learner with access to intermediate time series data is never worse and often +better in expectation than any unbiased classical learner. We provide new +insights into this analysis and generalize it to nonlinear prediction tasks in +latent dynamical systems, extending theoretical guarantees to the case where +the map connecting latent variables and observations is known up to a linear +transform. In addition, we propose algorithms based on random features and +representation learning for the case when this map is unknown. A suite of +empirical results confirm theoretical findings and show the potential of using +privileged time-series information in nonlinear prediction. + +
+
+
+
+
+ + ♻ ☆ timeXplain -- A Framework for Explaining the Predictions of Time Series + Classifiers + + +
+ Modern time series classifiers display impressive predictive capabilities, +yet their decision-making processes mostly remain black boxes to the user. At +the same time, model-agnostic explainers, such as the recently proposed SHAP, +promise to make the predictions of machine learning models interpretable, +provided there are well-designed domain mappings. We bring both worlds together +in our timeXplain framework, extending the reach of explainable artificial +intelligence to time series classification and value prediction. We present +novel domain mappings for the time domain, frequency domain, and time series +statistics and analyze their explicative power as well as their limits. We +employ a novel evaluation metric to experimentally compare timeXplain to +several model-specific explanation approaches for state-of-the-art time series +classifiers. + +
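One of the domain mappings above can be illustrated directly: perturb a frequency band of the input series and measure how much the classifier's score changes. This is a hedged toy version built on a made-up `predict` function; it is not the timeXplain implementation, which uses SHAP-style mappings.

# Toy illustration of a frequency-domain mapping: zero out one frequency band,
# invert the FFT, and record the change in a classifier's score.
import numpy as np

def predict(series: np.ndarray) -> float:
    # Hypothetical classifier score; a real time series model would go here.
    return float(np.tanh(series.mean() + 0.5 * series.std()))

def band_importance(series: np.ndarray, n_bands: int = 8) -> np.ndarray:
    base = predict(series)
    spectrum = np.fft.rfft(series)
    edges = np.linspace(0, len(spectrum), n_bands + 1, dtype=int)
    scores = np.empty(n_bands)
    for b in range(n_bands):
        perturbed = spectrum.copy()
        perturbed[edges[b]:edges[b + 1]] = 0.0        # remove this frequency band
        restored = np.fft.irfft(perturbed, n=len(series))
        scores[b] = abs(base - predict(restored))     # attribution for the band
    return scores

rng = np.random.default_rng(1)
t = np.linspace(0, 1, 256)
series = np.sin(2 * np.pi * 5 * t) + 0.1 * rng.normal(size=t.size)
print(band_importance(series))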
+
+ comment: 9 pages; published code, added combined time slice and frequency band + mapping, added quantitative evaluation and comparison to model-specific + explainers +
+
+
+
+
+ + ♻ ☆ High-performance deep spiking neural networks with 0.3 spikes per neuron + + +
+ Communication by rare, binary spikes is a key factor for the energy +efficiency of biological brains. However, it is harder to train +biologically-inspired spiking neural networks (SNNs) than artificial neural +networks (ANNs). This is puzzling given that theoretical results provide exact +mapping algorithms from ANNs to SNNs with time-to-first-spike (TTFS) coding. In +this paper we analyze in theory and simulation the learning dynamics of +TTFS-networks and identify a specific instance of the vanishing-or-exploding +gradient problem. While two choices of SNN mappings solve this problem at +initialization, only the one with a constant slope of the neuron membrane +potential at threshold guarantees the equivalence of the training trajectory +between SNNs and ANNs with rectified linear units. We demonstrate that training +deep SNN models achieves the exact same performance as that of ANNs, surpassing +previous SNNs on image classification datasets such as MNIST/Fashion-MNIST, +CIFAR10/CIFAR100 and PLACES365. Our SNN accomplishes high-performance +classification with less than 0.3 spikes per neuron, lending itself for an +energy-efficient implementation. We show that fine-tuning SNNs with our robust +gradient descent algorithm enables their optimization for hardware +implementations with low latency and resilience to noise and quantization. + +
+
+
+
+
+ + ♻ ☆ Handling Overlapping Asymmetric Datasets -- A Twice Penalized P-Spline + Approach + + +
+ Overlapping asymmetric datasets are common in data science and raise the
+question of how they can be combined in a predictive analysis. In healthcare
+datasets, a small amount of information, such as an electronic health record,
+is often available for a large number of patients, while only a small number
+of patients may have had extensive further testing. Common solutions such as
+missing-data imputation can be unwise if the smaller cohort differs
+significantly in scale from the larger sample; the aim of this research is
+therefore to develop a new method which can model the smaller cohort against
+a particular response whilst also taking the larger cohort into account.
+Motivated by non-parametric models, and specifically flexible smoothing
+techniques via generalized additive models, we develop a twice penalized
+P-Spline approximation method that firstly prevents over/under-fitting of the
+smaller cohort and secondly incorporates the larger cohort. The second
+penalty is constructed from discrepancies in the marginal values of
+covariates that exist in both the smaller and larger cohorts. Through data
+simulations, parameter tuning and model adaptations for continuous and binary
+responses, we find that our twice penalized approach offers an enhanced fit
+over a linear B-Spline and a once penalized P-Spline approximation. Applied
+to a real-life dataset relating to a person's risk of developing
+Non-Alcoholic Steatohepatitis, it improves model fit performance by over 65%.
+Areas for future work include adapting the method so that it does not require
+dimensionality reduction and considering parametric modelling methods. To our
+knowledge, however, this is the first work to propose additional marginal
+penalties in a flexible regression, and we report a vastly improved model fit
+on asymmetric datasets without the need for missing data imputation. + +
+
+ comment: 52 pages, 17 figures, 8 tables, 34 references +
+
+
+
+
+ + ♻ ☆ A novel approach to measuring patent claim scope based on probabilities + obtained from (large) language models + + +
+ This work proposes to measure the scope of a patent claim as the reciprocal
+of the self-information contained in this claim. A probability of occurrence
+of the claim is obtained from a language model and this probability is used
+to compute the self-information. Grounded in information theory, this
+approach is based on the assumption that an unlikely concept is more
+informative than a usual concept, insofar as it is more surprising. In turn,
+the more surprising the information required to define the claim, the
+narrower its scope. Five language models are considered, ranging from the
+simplest models (each word or character is assigned an identical probability)
+to intermediate models (using average word or character frequencies), to a
+large language model (GPT2). Interestingly, the scope resulting from the
+simplest language models is proportional to the reciprocal of the number of
+words or characters involved in the claim, a metric already used in previous
+works. Application is made to multiple series of patent claims directed to
+distinct inventions, where each series consists of claims devised to have a
+gradually decreasing scope. The performance of the language models is
+assessed with respect to several ad hoc tests. The more sophisticated the
+model, the better the results: the GPT2 probability model outperforms models
+based on word and character frequencies, which themselves outdo the simplest
+models based on word or character counts. Still, the character count appears
+to be a more reliable indicator than the word count. + +
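The core quantity is straightforward to compute: the self-information of the claim under some probability model, with scope defined as its reciprocal. Below is a hedged sketch using two of the simple models mentioned above (uniform per-word probability and empirical word frequencies); the corpus and claim text are invented, and this is not the paper's code.

# Sketch of claim scope as the reciprocal of self-information, I(claim) = -log2 P(claim).
# Corpus and claim are invented for illustration.
import math
from collections import Counter

corpus = "a device comprising a sensor and a processor configured to process sensor data".split()
freq = Counter(corpus)
total = sum(freq.values())
vocab_size = len(freq)

def self_information_uniform(claim_words):
    # Every word equally likely: each word contributes log2(vocab_size) bits,
    # so scope is proportional to 1 / word count.
    return len(claim_words) * math.log2(vocab_size)

def self_information_frequency(claim_words):
    # Words weighted by corpus frequency (add-one smoothing for unseen words).
    return -sum(
        math.log2((freq.get(w, 0) + 1) / (total + vocab_size)) for w in claim_words
    )

claim = "a device comprising a sensor".split()
for name, info in [("uniform", self_information_uniform(claim)),
                   ("frequency", self_information_frequency(claim))]:
    print(f"{name:9s}  self-information = {info:6.2f} bits   scope = {1.0 / info:.4f}")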
+
+ comment: 58 pages, 8 tables, 6 figures. Substantial changes made to version 2: + New section 4.1 added (including a new table); Minor normalization issue + corrected in values listed in Appendix B; Content of former appendix C now + moved to Section 3; and new Appendix C added. Minor changes made to version 3 + (style, typos, language) +
+
+
+
+
+ + ♻ ☆ Multi Time Scale World Models NeurIPS 2023 + + +
+ Intelligent agents use internal world models to reason and make predictions +about different courses of their actions at many scales. Devising learning +paradigms and architectures that allow machines to learn world models that +operate at multiple levels of temporal abstractions while dealing with complex +uncertainty predictions is a major technical hurdle. In this work, we propose a +probabilistic formalism to learn multi-time scale world models which we call +the Multi Time Scale State Space (MTS3) model. Our model uses a computationally +efficient inference scheme on multiple time scales for highly accurate +long-horizon predictions and uncertainty estimates over several seconds into +the future. Our experiments, which focus on action conditional long horizon +future predictions, show that MTS3 outperforms recent methods on several system +identification benchmarks including complex simulated and real-world dynamical +systems. + +
+
+ comment: Accepted as spotlight at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Scattering Vision Transformer: Spectral Mixing Matters NeurIPS 2023 + + +
+ Vision transformers have gained significant attention and achieved
+state-of-the-art performance in various computer vision tasks, including
+image classification, instance segmentation, and object detection. However,
+challenges remain in addressing attention complexity and effectively
+capturing fine-grained information within images. Existing solutions often
+resort to down-sampling operations, such as pooling, to reduce computational
+cost. Unfortunately, such operations are non-invertible and can result in
+information loss. In this paper, we present a novel approach called
+Scattering Vision Transformer (SVT) to tackle these challenges. SVT
+incorporates a spectrally scattering network that enables the capture of
+intricate image details. SVT overcomes the invertibility issue associated
+with down-sampling operations by separating low-frequency and high-frequency
+components. Furthermore, SVT introduces a unique spectral gating network
+utilizing Einstein multiplication for token and channel mixing, effectively
+reducing complexity. We show that SVT achieves state-of-the-art performance
+on the ImageNet dataset with a significant reduction in the number of
+parameters and FLOPS. SVT shows a 2\% improvement over LiTv2 and iFormer.
+SVT-H-S reaches 84.2\% top-1 accuracy, while SVT-H-B reaches 85.2\%
+(state-of-the-art for base versions) and SVT-H-L reaches 85.7\% (again
+state-of-the-art for large versions). SVT also shows comparable results in
+other vision tasks such as instance segmentation. SVT also outperforms other
+transformers in transfer learning on standard datasets such as CIFAR10,
+CIFAR100, Oxford Flower, and Stanford Car. The project page is available at
+\url{https://badripatro.github.io/svt/}. + +
+
+ comment: Accepted @NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ SALSA-CLRS: A Sparse and Scalable Benchmark for Algorithmic Reasoning + + +
+ We introduce an extension to the CLRS algorithmic learning benchmark, +prioritizing scalability and the utilization of sparse representations. Many +algorithms in CLRS require global memory or information exchange, mirrored in +its execution model, which constructs fully connected (not sparse) graphs based +on the underlying problem. Despite CLRS's aim of assessing how effectively +learned algorithms can generalize to larger instances, the existing execution +model becomes a significant constraint due to its demanding memory requirements +and runtime (hard to scale). However, many important algorithms do not demand a +fully connected graph; these algorithms, primarily distributed in nature, align +closely with the message-passing paradigm employed by Graph Neural Networks. +Hence, we propose SALSA-CLRS, an extension of the current CLRS benchmark +specifically with scalability and sparseness in mind. Our approach includes +adapted algorithms from the original CLRS benchmark and introduces new problems +from distributed and randomized algorithms. Moreover, we perform a thorough +empirical evaluation of our benchmark. Code is publicly available at +https://github.com/jkminder/SALSA-CLRS. + +
+
+ comment: (Extended Abstract) Presented at the Second Learning on Graphs + Conference (LoG 2023) +
+
+
+
+
+ + ♻ ☆ Adversarial Examples Are Not Real Features NeurIPS 2023 + + +
+ The existence of adversarial examples has been a mystery for years and +attracted much interest. A well-known theory by \citet{ilyas2019adversarial} +explains adversarial vulnerability from a data perspective by showing that one +can extract non-robust features from adversarial examples and these features +alone are useful for classification. However, the explanation remains quite +counter-intuitive since non-robust features are mostly noise features to +humans. In this paper, we re-examine the theory from a larger context by +incorporating multiple learning paradigms. Notably, we find that contrary to +their good usefulness under supervised learning, non-robust features attain +poor usefulness when transferred to other self-supervised learning paradigms, +such as contrastive learning, masked image modeling, and diffusion models. It +reveals that non-robust features are not really as useful as robust or natural +features that enjoy good transferability between these paradigms. Meanwhile, +for robustness, we also show that naturally trained encoders from robust +features are largely non-robust under AutoAttack. Our cross-paradigm +examination suggests that the non-robust features are not really useful but +more like paradigm-wise shortcuts, and robust features alone might be +insufficient to attain reliable model robustness. Code is available at +\url{https://github.com/PKU-ML/AdvNotRealFeatures}. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ A benchmark of categorical encoders for binary classification NeurIPS 2023 + + +
+ Categorical encoders transform categorical features into numerical +representations that are indispensable for a wide range of machine learning +models. Existing encoder benchmark studies lack generalizability because of +their limited choice of (1) encoders, (2) experimental factors, and (3) +datasets. Additionally, inconsistencies arise from the adoption of varying +aggregation strategies. This paper is the most comprehensive benchmark of +categorical encoders to date, including an extensive evaluation of 32 +configurations of encoders from diverse families, with 36 combinations of +experimental factors, and on 50 datasets. The study shows the profound +influence of dataset selection, experimental factors, and aggregation +strategies on the benchmark's conclusions -- aspects disregarded in previous +encoder benchmarks. + +
+
+ comment: To be published in the 37th Conference on Neural Information + Processing Systems (NeurIPS 2023) Track on Datasets and Benchmarks +
+
+
+
+
+ + ♻ ☆ Knowledge Augmented Machine Learning with Applications in Autonomous + Driving: A Survey + + +
+ The availability of representative datasets is an essential prerequisite for +many successful artificial intelligence and machine learning models. However, +in real life applications these models often encounter scenarios that are +inadequately represented in the data used for training. There are various +reasons for the absence of sufficient data, ranging from time and cost +constraints to ethical considerations. As a consequence, the reliable usage of +these models, especially in safety-critical applications, is still a tremendous +challenge. Leveraging additional, already existing sources of knowledge is key +to overcome the limitations of purely data-driven approaches. Knowledge +augmented machine learning approaches offer the possibility of compensating for +deficiencies, errors, or ambiguities in the data, thus increasing the +generalization capability of the applied models. Even more, predictions that +conform with knowledge are crucial for making trustworthy and safe decisions +even in underrepresented scenarios. This work provides an overview of existing +techniques and methods in the literature that combine data-driven models with +existing knowledge. The identified approaches are structured according to the +categories knowledge integration, extraction and conformity. In particular, we +address the application of the presented methods in the field of autonomous +driving. + +
+
+ comment: 111 pages, Added section on Run-time Network Verification +
+
+
+
+
+ + ♻ ☆ Attribution Patching Outperforms Automated Circuit Discovery NeurIPS 2023 + + +
+ Automated interpretability research has recently attracted attention as a +potential research direction that could scale explanations of neural network +behavior to large models. Existing automated circuit discovery work applies +activation patching to identify subnetworks responsible for solving specific +tasks (circuits). In this work, we show that a simple method based on +attribution patching outperforms all existing methods while requiring just two +forward passes and a backward pass. We apply a linear approximation to +activation patching to estimate the importance of each edge in the +computational subgraph. Using this approximation, we prune the least important +edges of the network. We survey the performance and limitations of this method, +finding that averaged over all tasks our method has greater AUC from circuit +recovery than other methods. + +
+
+ comment: 6 main paper pages, 6 additional pages. NeurIPS 2023 ATTRIB Workshop +
+
+
+
+
+ + ♻ ☆ Stable Linear Subspace Identification: A Machine Learning Approach + + +
+ Machine Learning (ML) and linear System Identification (SI) have been +historically developed independently. In this paper, we leverage +well-established ML tools - especially the automatic differentiation framework +- to introduce SIMBa, a family of discrete linear multi-step-ahead state-space +SI methods using backpropagation. SIMBa relies on a novel +Linear-Matrix-Inequality-based free parametrization of Schur matrices to ensure +the stability of the identified model. + We show how SIMBa generally outperforms traditional linear state-space SI +methods, and sometimes significantly, although at the price of a higher +computational burden. This performance gap is particularly remarkable compared +to other SI methods with stability guarantees, where the gain is frequently +above 25% in our investigations, hinting at SIMBa's ability to simultaneously +achieve state-of-the-art fitting performance and enforce stability. +Interestingly, these observations hold for a wide variety of input-output +systems and on both simulated and real-world data, showcasing the flexibility +of the proposed approach. We postulate that this new SI paradigm presents a +great extension potential to identify structured nonlinear models from data, +and we hence open-source SIMBa on https://github.com/Cemempamoi/simba. + +
+
+ comment: Submitted to ECC 2024 +
+
+
+
+
+ + ♻ ☆ Relationship between Batch Size and Number of Steps Needed for Nonconvex + Optimization of Stochastic Gradient Descent using Armijo Line Search + + +
+ Stochastic gradient descent (SGD) is the simplest deep learning optimizer
+with which to train deep neural networks. While SGD can use various learning
+rates, such as constant or diminishing rates, previous numerical results
+showed that SGD performs better than other deep learning optimizers when it
+uses learning rates given by line search methods. In this paper, we perform a
+convergence analysis on SGD with a learning rate given by an Armijo line
+search for nonconvex optimization. The analysis indicates that the upper
+bound of the expectation of the squared norm of the full gradient becomes
+small when the number of steps and the batch size are large. Next, we show
+that, for SGD with the Armijo-line-search learning rate, the number of steps
+needed for nonconvex optimization is a monotone decreasing convex function of
+the batch size; that is, the number of steps needed for nonconvex
+optimization decreases as the batch size increases. Furthermore, we show that
+the stochastic first-order oracle (SFO) complexity, which is the stochastic
+gradient computation cost, is a convex function of the batch size; that is,
+there exists a critical batch size that minimizes the SFO complexity.
+Finally, we provide numerical results that support our theoretical results.
+The numerical results indicate that the number of steps needed for training
+deep neural networks decreases as the batch size increases and that there
+exist critical batch sizes that can be estimated from the theoretical
+results. + +
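For readers unfamiliar with the line search named above, here is a minimal sketch of an Armijo backtracking step for a stochastic gradient, applied to an invented quadratic mini-batch loss. It illustrates the generic rule only and is not the paper's code.

# Minimal sketch of an Armijo backtracking line search for one SGD step.
# The mini-batch loss below is an invented quadratic; illustration only.
import numpy as np

def batch_loss(x, A, b):
    r = A @ x - b
    return 0.5 * float(r @ r)

def batch_grad(x, A, b):
    return A.T @ (A @ x - b)

def armijo_sgd_step(x, A, b, eta0=1.0, beta=0.5, c=1e-4, max_backtracks=30):
    """Shrink eta until f(x - eta*g) <= f(x) - c * eta * ||g||^2 holds."""
    f0 = batch_loss(x, A, b)
    g = batch_grad(x, A, b)
    eta = eta0
    for _ in range(max_backtracks):
        if batch_loss(x - eta * g, A, b) <= f0 - c * eta * float(g @ g):
            break
        eta *= beta
    return x - eta * g, eta

rng = np.random.default_rng(0)
x = rng.normal(size=5)
for step in range(10):
    A = rng.normal(size=(8, 5))          # a fresh "mini-batch"
    b = rng.normal(size=8)
    x, eta = armijo_sgd_step(x, A, b)
print("final x:", x)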
+
+
+
+
+ + ♻ ☆ Towards Hierarchical Regional Transformer-based Multiple Instance + Learning + + +
+ The classification of gigapixel histopathology images with deep multiple +instance learning models has become a critical task in digital pathology and +precision medicine. In this work, we propose a Transformer-based multiple +instance learning approach that replaces the traditional learned attention +mechanism with a regional, Vision Transformer inspired self-attention +mechanism. We present a method that fuses regional patch information to derive +slide-level predictions and show how this regional aggregation can be stacked +to hierarchically process features on different distance levels. To increase +predictive accuracy, especially for datasets with small, local morphological +features, we introduce a method to focus the image processing on high attention +regions during inference. Our approach is able to significantly improve +performance over the baseline on two histopathology datasets and points towards +promising directions for further research. + +
+
+ comment: 8 pages, LaTeX; header update after published, fixed typos +
+
+
+
+
+ + ♻ ☆ Role Taxonomy of Units in Deep Neural Networks + + +
+ Identifying the role of network units in deep neural networks (DNNs) is
+critical in many aspects, including understanding the mechanisms of DNNs and
+building basic connections between deep learning and neuroscience. However,
+it remains unclear which roles units in DNNs with different generalization
+abilities can play. To this end, we give a role taxonomy of units in DNNs by
+introducing the retrieval-of-function test, where units are categorized into
+four types in terms of their functional preference on the training set and
+the testing set separately. We show that the ratios of the four categories
+are highly associated with the generalization ability of DNNs from two
+distinct perspectives, based on which we give indicators of DNNs that
+generalize well. + +
+
+
+
+
+ + ♻ ☆ Geometric Algebra Transformer NeurIPS 2023 + + +
+ Problems involving geometric data arise in physics, chemistry, robotics, +computer vision, and many other fields. Such data can take numerous forms, for +instance points, direction vectors, translations, or rotations, but to date +there is no single architecture that can be applied to such a wide variety of +geometric types while respecting their symmetries. In this paper we introduce +the Geometric Algebra Transformer (GATr), a general-purpose architecture for +geometric data. GATr represents inputs, outputs, and hidden states in the +projective geometric (or Clifford) algebra, which offers an efficient +16-dimensional vector-space representation of common geometric objects as well +as operators acting on them. GATr is equivariant with respect to E(3), the +symmetry group of 3D Euclidean space. As a Transformer, GATr is versatile, +efficient, and scalable. We demonstrate GATr in problems from n-body modeling +to wall-shear-stress estimation on large arterial meshes to robotic motion +planning. GATr consistently outperforms both non-geometric and equivariant +baselines in terms of error, data efficiency, and scalability. + +
+
+ comment: Published at NeurIPS 2023, implementation available at + https://github.com/qualcomm-ai-research/geometric-algebra-transformer . v3: + matches camera-ready version +
+
+
+
+
+ + ♻ ☆ Online Arbitrary Shaped Clustering through Correlated Gaussian Functions + + +
+ There is no convincing evidence that backpropagation is a biologically +plausible mechanism, and further studies of alternative learning methods are +needed. A novel online clustering algorithm is presented that can produce +arbitrary shaped clusters from inputs in an unsupervised manner, and requires +no prior knowledge of the number of clusters in the input data. This is +achieved by finding correlated outputs from functions that capture commonly +occurring input patterns. The algorithm can be deemed more biologically +plausible than model optimization through backpropagation, although practical +applicability may require additional research. However, the method yields +satisfactory results on several toy datasets on a noteworthy range of +hyperparameters. + +
+
+ comment: Corrected uniform distribution range; removed "average" from last + sentence in section 4 +
+
+
+
+
+ + ♻ ☆ oneDNN Graph Compiler: A Hybrid Approach for High-Performance Deep + Learning Compilation + + +
+ With the rapid development of deep learning models and hardware support for +dense computing, the deep learning workload characteristics changed +significantly from a few hot spots on compute-intensive operations to a broad +range of operations scattered across the models. Accelerating a few +compute-intensive operations using the expert-tuned implementation of +primitives does not fully exploit the performance potential of AI hardware. +Various efforts have been made to compile a full deep neural network (DNN) +graph. One of the biggest challenges is to achieve high-performance tensor +compilation by generating expert level performance code for the dense +compute-intensive operations and applying compilation optimization at the scope +of DNN computation graph across multiple compute-intensive operations. + We present oneDNN Graph Compiler, a tensor compiler that employs a hybrid +approach of using techniques from both compiler optimization and expert-tuned +kernels for high performance code generation of the deep neural network graph. +oneDNN Graph Compiler addresses unique optimization challenges in the deep +learning domain, such as low-precision computation, aggressive fusion of graph +operations, optimization for static tensor shapes and memory layout, constant +weight optimization, and memory buffer reuse. Experimental results demonstrate +significant performance gains over existing tensor compiler and primitives +library for performance-critical DNN computation graphs and end-to-end models +on Intel Xeon Scalable Processors. + +
+
+ comment: 10 pages excluding reference, 9 figures, 1 table +
+
+
+
+
+ + ♻ ☆ Enhancing Robust Representation in Adversarial Training: Alignment and + Exclusion Criteria + + +
+ Deep neural networks are vulnerable to adversarial noise. Adversarial
+Training (AT) has been demonstrated to be the most effective defense strategy
+to protect neural networks from being fooled. However, we find that AT fails
+to learn robust features, resulting in poor adversarial robustness. To
+address this issue, we highlight two criteria of robust representation: (1)
+Exclusion: \emph{the features of examples stay away from those of other
+classes}; (2) Alignment: \emph{the features of natural examples and their
+corresponding adversarial examples are close to each other}. These motivate
+us to propose a generic AT framework for gaining robust representations via
+an asymmetric negative contrast and reverse attention. Specifically, we
+design an asymmetric negative contrast based on predicted probabilities to
+push apart examples of different classes in the feature space. Moreover, we
+propose to weight features by the parameters of the linear classifier as
+reverse attention, to obtain class-aware features and pull together features
+of the same class. Empirical evaluations on three benchmark datasets show
+that our methods greatly advance the robustness of AT and achieve
+state-of-the-art performance. + +
+
+ comment: 10 pages, 9 figures, Submitted to TIFS +
+
+
+
+
+ + ♻ ☆ Single-Pass Contrastive Learning Can Work for Both Homophilic and + Heterophilic Graph + + +
+ Existing graph contrastive learning (GCL) techniques typically require two +forward passes for a single instance to construct the contrastive loss, which +is effective for capturing the low-frequency signals of node features. Such a +dual-pass design has shown empirical success on homophilic graphs, but its +effectiveness on heterophilic graphs, where directly connected nodes typically +have different labels, is unknown. In addition, existing GCL approaches fail to +provide strong performance guarantees. Coupled with the unpredictability of GCL +approaches on heterophilic graphs, their applicability in real-world contexts +is limited. Then, a natural question arises: Can we design a GCL method that +works for both homophilic and heterophilic graphs with a performance guarantee? +To answer this question, we theoretically study the concentration property of +features obtained by neighborhood aggregation on homophilic and heterophilic +graphs, introduce the single-pass augmentation-free graph contrastive learning +loss based on the property, and provide performance guarantees for the +minimizer of the loss on downstream tasks. As a direct consequence of our +analysis, we implement the Single-Pass Graph Contrastive Learning method +(SP-GCL). Empirically, on 14 benchmark datasets with varying degrees of +homophily, the features learned by the SP-GCL can match or outperform existing +strong baselines with significantly less computational overhead, which +demonstrates the usefulness of our findings in real-world cases. + +
+
+ comment: This article has been accepted for publication by the Transactions on + Machine Learning Research. OpenReview at: + https://openreview.net/forum?id=244KePn09i +
+
+
+
+
+ + ♻ ☆ Graph Attention-based Deep Reinforcement Learning for solving the + Chinese Postman Problem with Load-dependent costs + + +
+ Recently, Deep reinforcement learning (DRL) models have shown promising +results in solving routing problems. However, most DRL solvers are commonly +proposed to solve node routing problems, such as the Traveling Salesman Problem +(TSP). Meanwhile, there has been limited research on applying neural methods to +arc routing problems, such as the Chinese Postman Problem (CPP), since they +often feature irregular and complex solution spaces compared to TSP. To fill +these gaps, this paper proposes a novel DRL framework to address the CPP with +load-dependent costs (CPP-LC) (Corberan et al., 2018), which is a complex arc +routing problem with load constraints. The novelty of our method is two-fold. +First, we formulate the CPP-LC as a Markov Decision Process (MDP) sequential +model. Subsequently, we introduce an autoregressive model based on DRL, namely +Arc-DRL, consisting of an encoder and decoder to address the CPP-LC challenge +effectively. Such a framework allows the DRL model to work efficiently and +scalably to arc routing problems. Furthermore, we propose a new bio-inspired +meta-heuristic solution based on Evolutionary Algorithm (EA) for CPP-LC. +Extensive experiments show that Arc-DRL outperforms existing meta-heuristic +methods such as Iterative Local Search (ILS) and Variable Neighborhood Search +(VNS) proposed by (Corberan et al., 2018) on large benchmark datasets for +CPP-LC regarding both solution quality and running time; while the EA gives the +best solution quality with much more running time. We release our C++ +implementations for metaheuristics such as EA, ILS and VNS along with the code +for data generation and our generated data at +https://github.com/HySonLab/Chinese_Postman_Problem + +
+
+
+
+
+ + ♻ ☆ Diffusion Model-Augmented Behavioral Cloning + + +
+ Imitation learning addresses the challenge of learning by observing an +expert's demonstrations without access to reward signals from environments. +Most existing imitation learning methods that do not require interacting with +environments either model the expert distribution as the conditional +probability p(a|s) (e.g., behavioral cloning, BC) or the joint probability p(s, +a). Despite its simplicity, modeling the conditional probability with BC +usually struggles with generalization. While modeling the joint probability can +lead to improved generalization performance, the inference procedure is often +time-consuming and the model can suffer from manifold overfitting. This work +proposes an imitation learning framework that benefits from modeling both the +conditional and joint probability of the expert distribution. Our proposed +diffusion model-augmented behavioral cloning (DBC) employs a diffusion model +trained to model expert behaviors and learns a policy to optimize both the BC +loss (conditional) and our proposed diffusion model loss (joint). DBC +outperforms baselines in various continuous control tasks in navigation, robot +arm manipulation, dexterous manipulation, and locomotion. We design additional +experiments to verify the limitations of modeling either the conditional +probability or the joint probability of the expert distribution as well as +compare different generative models. Ablation studies justify the effectiveness +of our design choices. + +
+
+
+
+
+ + ♻ ☆ Mitigating Source Bias for Fairer Weak Supervision + + +
+ Weak supervision enables efficient development of training sets by reducing +the need for ground truth labels. However, the techniques that make weak +supervision attractive -- such as integrating any source of signal to estimate +unknown labels -- also entail the danger that the produced pseudolabels are +highly biased. Surprisingly, given everyday use and the potential for increased +bias, weak supervision has not been studied from the point of view of fairness. +We begin such a study, starting with the observation that even when a fair +model can be built from a dataset with access to ground-truth labels, the +corresponding dataset labeled via weak supervision can be arbitrarily unfair. +To address this, we propose and empirically validate a model for source +unfairness in weak supervision, then introduce a simple counterfactual +fairness-based technique that can mitigate these biases. Theoretically, we show +that it is possible for our approach to simultaneously improve both accuracy +and fairness -- in contrast to standard fairness approaches that suffer from +tradeoffs. Empirically, we show that our technique improves accuracy on weak +supervision baselines by as much as 32\% while reducing demographic parity gap +by 82.5\%. A simple extension of our method aimed at maximizing performance +produces state-of-the-art performance in five out of ten datasets in the WRENCH +benchmark. + +
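The demographic parity gap quoted above is itself easy to compute; below is a toy sketch with made-up pseudolabels and a binary group attribute. It only illustrates the metric, not the paper's counterfactual fairness-based mitigation method.

# Toy computation of the demographic parity gap: the absolute difference in
# positive-prediction rates between two groups. All values here are random
# placeholders, not the paper's data.
import numpy as np

rng = np.random.default_rng(0)
pseudolabels = rng.integers(0, 2, size=1000)     # weak-supervision pseudolabels
group = rng.integers(0, 2, size=1000)            # protected attribute (binary)

rate_g0 = pseudolabels[group == 0].mean()
rate_g1 = pseudolabels[group == 1].mean()
print("demographic parity gap:", abs(rate_g0 - rate_g1))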
+
+ comment: 42 pages +
+
+
+
+
+ + ♻ ☆ StyleTTS: A Style-Based Generative Model for Natural and Diverse + Text-to-Speech Synthesis + + +
+ Text-to-Speech (TTS) has recently seen great progress in synthesizing +high-quality speech owing to the rapid development of parallel TTS systems, but +producing speech with naturalistic prosodic variations, speaking styles and +emotional tones remains challenging. Moreover, since duration and speech are +generated separately, parallel TTS models still have problems finding the best +monotonic alignments that are crucial for naturalistic speech synthesis. Here, +we propose StyleTTS, a style-based generative model for parallel TTS that can +synthesize diverse speech with natural prosody from a reference speech +utterance. With novel Transferable Monotonic Aligner (TMA) and +duration-invariant data augmentation schemes, our method significantly +outperforms state-of-the-art models on both single and multi-speaker datasets +in subjective tests of speech naturalness and speaker similarity. Through +self-supervised learning of the speaking styles, our model can synthesize +speech with the same prosodic and emotional tone as any given reference speech +without the need for explicitly labeling these categories. + +
+
+
+
+
+ + ♻ ☆ StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion + and Adversarial Training with Large Speech Language Models NeurIPS 2023 + + +
+ In this paper, we present StyleTTS 2, a text-to-speech (TTS) model that +leverages style diffusion and adversarial training with large speech language +models (SLMs) to achieve human-level TTS synthesis. StyleTTS 2 differs from its +predecessor by modeling styles as a latent random variable through diffusion +models to generate the most suitable style for the text without requiring +reference speech, achieving efficient latent diffusion while benefiting from +the diverse speech synthesis offered by diffusion models. Furthermore, we +employ large pre-trained SLMs, such as WavLM, as discriminators with our novel +differentiable duration modeling for end-to-end training, resulting in improved +speech naturalness. StyleTTS 2 surpasses human recordings on the single-speaker +LJSpeech dataset and matches it on the multispeaker VCTK dataset as judged by +native English speakers. Moreover, when trained on the LibriTTS dataset, our +model outperforms previous publicly available models for zero-shot speaker +adaptation. This work achieves the first human-level TTS on both single and +multispeaker datasets, showcasing the potential of style diffusion and +adversarial training with large SLMs. The audio demos and source code are +available at https://styletts2.github.io/. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Advancing Bayesian Optimization via Learning Correlated Latent Space + + +
+ Bayesian optimization is a powerful method for optimizing black-box functions +with limited function evaluations. Recent works have shown that optimization in +a latent space through deep generative models such as variational autoencoders +leads to effective and efficient Bayesian optimization for structured or +discrete data. However, as the optimization does not take place in the input +space, it leads to an inherent gap that results in potentially suboptimal +solutions. To alleviate the discrepancy, we propose Correlated latent space +Bayesian Optimization (CoBO), which focuses on learning correlated latent +spaces characterized by a strong correlation between the distances in the +latent space and the distances within the objective function. Specifically, our +method introduces Lipschitz regularization, loss weighting, and trust region +recoordination to minimize the inherent gap around the promising areas. We +demonstrate the effectiveness of our approach on several optimization tasks in +discrete data, such as molecule design and arithmetic expression fitting, and +achieve high performance within a small budget. + +
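As a hedged sketch of the kind of correlation-enforcing regularizer described above, the snippet below penalizes pairs whose objective-value difference exceeds a Lipschitz constant times their latent distance. The constant and tensors are invented, and CoBO's actual losses differ in their details.

# Sketch of a Lipschitz-style regularizer encouraging latent distances to
# track objective differences: penalize |y_i - y_j| > L * ||z_i - z_j||.
import torch

def lipschitz_regularizer(z: torch.Tensor, y: torch.Tensor, lipschitz: float = 1.0) -> torch.Tensor:
    latent_dist = torch.cdist(z, z, p=2)                 # pairwise latent distances
    value_diff = (y[:, None] - y[None, :]).abs()         # pairwise objective gaps
    violation = torch.relu(value_diff - lipschitz * latent_dist)
    return violation.mean()

z = torch.randn(32, 16, requires_grad=True)    # latent codes from an encoder
y = torch.randn(32)                            # black-box objective values
loss = lipschitz_regularizer(z, y)
loss.backward()                                # gradients flow back to the latent codes
print("regularizer value:", float(loss))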
+
+
+
+
+ + ♻ ☆ On Practical Robust Reinforcement Learning: Practical Uncertainty Set + and Double-Agent Algorithm + + +
+ Robust reinforcement learning (RRL) aims at seeking a robust policy to +optimize the worst case performance over an uncertainty set of Markov decision +processes (MDPs). This set contains some perturbed MDPs from a nominal MDP +(N-MDP) that generate samples for training, which reflects some potential +mismatches between training (i.e., N-MDP) and true environments. In this paper +we present an elaborated uncertainty set by excluding some implausible MDPs +from the existing sets. Under this uncertainty set, we develop a sample-based +RRL algorithm (named ARQ-Learning) for tabular setting and characterize its +finite-time error bound. Also, it is proved that ARQ-Learning converges as fast +as the standard Q-Learning and robust Q-Learning while ensuring better +robustness. We introduce an additional pessimistic agent which can tackle the +major bottleneck for the extension of ARQ-Learning into the cases with larger +or continuous state spaces. Incorporating this idea into RL algorithms, we +propose double-agent algorithms for model-free RRL. Via experiments, we +demonstrate the effectiveness of the proposed algorithms. + +
+
+
+
+
+ + ♻ ☆ Battle of the Backbones: A Large-Scale Comparison of Pretrained Models + across Computer Vision Tasks NeurIPS 2023 + + +
+ Neural network based computer vision systems are typically built on a +backbone, a pretrained or randomly initialized feature extractor. Several years +ago, the default option was an ImageNet-trained convolutional neural network. +However, the recent past has seen the emergence of countless backbones +pretrained using various algorithms and datasets. While this abundance of +choice has led to performance increases for a range of systems, it is difficult +for practitioners to make informed decisions about which backbone to choose. +Battle of the Backbones (BoB) makes this choice easier by benchmarking a +diverse suite of pretrained models, including vision-language models, those +trained via self-supervised learning, and the Stable Diffusion backbone, across +a diverse set of computer vision tasks ranging from classification to object +detection to OOD generalization and more. Furthermore, BoB sheds light on +promising directions for the research community to advance computer vision by +illuminating strengths and weakness of existing approaches through a +comprehensive analysis conducted on more than 1500 training runs. While vision +transformers (ViTs) and self-supervised learning (SSL) are increasingly +popular, we find that convolutional neural networks pretrained in a supervised +fashion on large training sets still perform best on most tasks among the +models we consider. Moreover, in apples-to-apples comparisons on the same +architectures and similarly sized pretraining datasets, we find that SSL +backbones are highly competitive, indicating that future works should perform +SSL pretraining with advanced architectures and larger pretraining datasets. We +release the raw results of our experiments along with code that allows +researchers to put their own backbones through the gauntlet here: +https://github.com/hsouri/Battle-of-the-Backbones + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Finding emergence in data: causal emergence inspired dynamics learning + + +
+ Modelling complex dynamical systems in a data-driven manner is challenging +due to the presence of emergent behaviors and properties that cannot be +directly captured by micro-level observational data. Therefore, it is crucial +to develop a model that can effectively capture emergent dynamics at the +macro-level and quantify emergence based on the available data. Drawing +inspiration from the theory of causal emergence, this paper introduces a +machine learning framework aimed at learning macro-dynamics within an emergent +latent space. The framework achieves this by maximizing the effective +information (EI) to obtain a macro-dynamics model with stronger causal effects. +Experimental results on both simulated and real data demonstrate the +effectiveness of the proposed framework. Not only does it successfully capture +emergent patterns, but it also learns the coarse-graining strategy and +quantifies the degree of causal emergence in the data. Furthermore, experiments +conducted on environments different from the training dataset highlight the +superior generalization ability of our model. + +
+
+
+
+
+ + ♻ ☆ LymphoML: An interpretable artificial intelligence-based method + identifies morphologic features that correlate with lymphoma subtype + + +
+ The accurate classification of lymphoma subtypes using hematoxylin and eosin +(H&E)-stained tissue is complicated by the wide range of morphological features +these cancers can exhibit. We present LymphoML - an interpretable machine +learning method that identifies morphologic features that correlate with +lymphoma subtypes. Our method applies steps to process H&E-stained tissue +microarray cores, segment nuclei and cells, compute features encompassing +morphology, texture, and architecture, and train gradient-boosted models to +make diagnostic predictions. LymphoML's interpretable models, developed on a +limited volume of H&E-stained tissue, achieve non-inferior diagnostic accuracy +to pathologists using whole-slide images and outperform black box deep-learning +on a dataset of 670 cases from Guatemala spanning 8 lymphoma subtypes. Using +SHapley Additive exPlanation (SHAP) analysis, we assess the impact of each +feature on model prediction and find that nuclear shape features are most +discriminative for DLBCL (F1-score: 78.7%) and classical Hodgkin lymphoma +(F1-score: 74.5%). Finally, we provide the first demonstration that a model +combining features from H&E-stained tissue with features from a standardized +panel of 6 immunostains results in a similar diagnostic accuracy (85.3%) to a +46-stain panel (86.1%). + +
+
+ comment: To be published in Proceedings of the 3rd Machine Learning for Health + symposium, Proceedings of Machine Learning Research (PMLR) +
+
+
+
+
+ + ♻ ☆ UniMOS: A Universal Framework For Multi-Organ Segmentation Over + Label-Constrained Datasets + + +
+ Machine learning models for medical images can help physicians diagnose and
+manage diseases. However, because medical image annotation requires
+substantial manpower and expertise, and because clinical departments annotate
+images according to their own task orientation, labeled medical images are
+scarce relative to unlabeled data, and many datasets annotate only a single
+organ. In this paper, we present UniMOS, the first universal framework for
+jointly exploiting fully labeled, partially labeled, and unlabeled images.
+Specifically, we construct a Multi-Organ Segmentation (MOS) module over
+fully/partially labeled data as the base network and design a new target
+adaptive loss. Furthermore, we incorporate a semi-supervised training module
+that combines consistency regularization and pseudo-labeling techniques on
+unlabeled data, which significantly improves the segmentation of unlabeled
+data. Experiments show that the framework exhibits excellent performance on
+several medical image segmentation tasks compared to other advanced methods,
+and also significantly improves data utilization and reduces annotation cost.
+Code and models are available at: https://github.com/lw8807001/UniMOS. + +
+
+ comment: Accepted by BIBM2023 +
+
+
+
+
+ + ♻ ☆ Landmark Attention: Random-Access Infinite Context Length for + Transformers NeurIPS 2023 + + +
+ While Transformers have shown remarkable success in natural language +processing, their attention mechanism's large memory requirements have limited +their ability to handle longer contexts. Prior approaches, such as recurrent +memory or retrieval-based augmentation, have either compromised the +random-access flexibility of attention (i.e., the capability to select any +token in the entire context) or relied on separate mechanisms for relevant +context retrieval, which may not be compatible with the model's attention. In +this paper, we present a novel approach that allows access to the complete +context while retaining random-access flexibility, closely resembling running +attention on the entire context. Our method uses a landmark token to represent +each block of the input and trains the attention to use it for selecting +relevant blocks, enabling retrieval of blocks directly through the attention +mechanism instead of by relying on a separate mechanism. Our approach +seamlessly integrates with specialized data structures and the system's memory +hierarchy, enabling processing of arbitrarily long context lengths. We +demonstrate that our method can obtain comparable performance with +Transformer-XL while significantly reducing the number of retrieved tokens in +each step. Finally, we show that fine-tuning LLaMA 7B with our method +successfully extends its context length capacity to over 32k tokens, allowing +for inference at the context lengths of GPT-4. We release the implementation of +landmark attention and the code to reproduce our experiments at +https://github.com/epfml/landmark-attention/. + +
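A heavily simplified sketch of the block-selection idea above: score each block by the query's similarity to that block's landmark vector, keep the top-k blocks, and attend only over their tokens. The landmark here is a stand-in (a mean of block keys rather than a trained landmark token), and the grouped-softmax and training details of the actual method are omitted.

# Simplified sketch of landmark-based block selection for long-context attention.
# Not the paper's implementation; landmark vectors are stand-ins.
import numpy as np

def softmax(x):
    e = np.exp(x - x.max())
    return e / e.sum()

rng = np.random.default_rng(0)
d, block_len, n_blocks, top_k = 64, 16, 32, 4

keys = rng.normal(size=(n_blocks, block_len, d))        # keys for each block
values = rng.normal(size=(n_blocks, block_len, d))
landmarks = keys.mean(axis=1)                           # stand-in landmark per block
query = rng.normal(size=d)

block_scores = landmarks @ query / np.sqrt(d)           # query-to-landmark scores
selected = np.argsort(block_scores)[-top_k:]            # retrieve the top-k blocks

k_sel = keys[selected].reshape(-1, d)                   # attend only within them
v_sel = values[selected].reshape(-1, d)
weights = softmax(k_sel @ query / np.sqrt(d))
output = weights @ v_sel
print("attended over", k_sel.shape[0], "of", n_blocks * block_len, "tokens")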
+
+ comment: Published as a conference paper at NeurIPS 2023 - 37th Conference on + Neural Information Processing Systems +
+
+
+
+
+ + ♻ ☆ SAM-CLIP: Merging Vision Foundation Models towards Semantic and Spatial + Understanding + + +
+ The landscape of publicly available vision foundation models (VFMs), such as +CLIP and Segment Anything Model (SAM), is expanding rapidly. VFMs are endowed +with distinct capabilities stemming from their pre-training objectives. For +instance, CLIP excels in semantic understanding, while SAM specializes in +spatial understanding for segmentation. In this work, we introduce a simple +recipe to efficiently merge VFMs into a unified model that absorbs their +expertise. Our method integrates techniques of multi-task learning, continual +learning, and distillation. Further, it demands significantly less +computational cost compared to traditional multi-task training from scratch, +and it only needs a small fraction of the pre-training datasets that were +initially used to train individual models. By applying our method to SAM and +CLIP, we obtain SAM-CLIP: a unified model that combines the capabilities of SAM +and CLIP into a single vision transformer. Compared with deploying SAM and CLIP +independently, our merged model, SAM-CLIP, reduces storage and compute costs +for inference, making it well-suited for edge device applications. We show that +SAM-CLIP not only retains the foundational strengths of SAM and CLIP, but also +introduces synergistic functionalities, notably in zero-shot semantic +segmentation, where SAM-CLIP establishes new state-of-the-art results on 5 +benchmarks. It outperforms previous models that are specifically designed for +this task by a large margin, including +6.8% and +5.9% mean IoU improvement on +Pascal-VOC and COCO-Stuff datasets, respectively. + +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Multimodal Characterization of Emotion within Multimedia Space + + +
+ Technological advancement and its omnipresent connection have pushed humans +past the boundaries and limitations of a computer screen, physical state, or +geographical location. It has provided a depth of avenues that facilitate +human-computer interaction that was once inconceivable such as audio and body +language detection. Given the complex modularities of emotions, it becomes +vital to study human-computer interaction, as it is the commencement of a +thorough understanding of the emotional state of users and, in the context of +social networks, the producers of multimodal information. This study first +acknowledges the accuracy of classification found within multimodal emotion +detection systems compared to unimodal solutions. Second, it explores the +characterization of multimedia content produced based on their emotions and the +coherence of emotion in different modalities by utilizing deep learning models +to classify emotion across different modalities. + +
+
+ comment: 8 pages, Published in International Conference on Computers and + Computation (COMPUTE 2022), November 03-04, 2022, San Francisco, United + States +
+
+
+
+
+ + ☆ CityScope: Enhanced Localization and Synchronizing AR for Dynamic Urban + Weather Visualization + + +
+ CityScope uses augmented reality (AR) to change our interaction with weather +data. The main goal is to develop real-time 3D weather visualizations, with +Taiwan as the model. It displays live weather data from the Central Weather +Bureau (CWB), projected onto a physical representation of Taiwan's landscape. A +pivotal advancement in our project is the integration of AprilTag with plane +detection technology. This innovative combination significantly enhances the +precision of the virtual visualizations within the physical world. By +accurately aligning AR elements with real-world environments, CityScope +achieves a seamless and realistic amalgamation of weather data and the physical +terrain of Taiwan. This breakthrough in AR technology not only enhances the +accuracy of weather visualizations but also enriches user experience, offering +an immersive and interactive way to understand and engage with meteorological +information. CityScope stands as a testament to the potential of AR in +transforming data visualization and public engagement in meteorology. + +
+
+ comment: 9 pages, 15 figures +
+
+
+
+
+ + ☆ Conditional Modeling Based Automatic Video Summarization + + +
+ The aim of video summarization is to shorten videos automatically while +retaining the key information necessary to convey the overall story. Video +summarization methods mainly rely on visual factors, such as visual +consecutiveness and diversity, which may not be sufficient to fully understand +the content of the video. There are other non-visual factors, such as +interestingness, representativeness, and storyline consistency that should also +be considered for generating high-quality video summaries. Current methods do +not adequately take into account these non-visual factors, resulting in +suboptimal performance. In this work, a new approach to video summarization is +proposed based on insights gained from how humans create ground truth video +summaries. The method utilizes a conditional modeling perspective and +introduces multiple meaningful random variables and joint distributions to +characterize the key components of video summarization. Helper distributions +are employed to improve the training of the model. A conditional attention +module is designed to mitigate potential performance degradation in the +presence of multi-modal input. The proposed video summarization method +incorporates the above innovative design choices that aim to narrow the gap +between human-generated and machine-generated video summaries. Extensive +experiments show that the proposed approach outperforms existing methods and +achieves state-of-the-art performance on commonly used video summarization +datasets. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + arXiv admin note: substantial text overlap with arXiv:2305.00455 +
+
+
+
+
+ + ☆ Video Face Re-Aging: Toward Temporally Consistent Face Re-Aging + + +
+ Video face re-aging deals with altering the apparent age of a person to the +target age in videos. This problem is challenging due to the lack of paired +video datasets maintaining temporal consistency in identity and age. Most +re-aging methods process each image individually without considering the +temporal consistency of videos. While some existing works address the issue of +temporal coherence through video facial attribute manipulation in latent space, +they often fail to deliver satisfactory performance in age transformation. To +tackle the issues, we propose (1) a novel synthetic video dataset that features +subjects across a diverse range of age groups; (2) a baseline architecture +designed to validate the effectiveness of our proposed dataset, and (3) the +development of three novel metrics tailored explicitly for evaluating the +temporal consistency of video re-aging techniques. Our comprehensive +experiments on public datasets, such as VFHQ and CelebV-HQ, show that our +method outperforms the existing approaches in terms of both age transformation +and temporal consistency. + +
+
+ comment: 8 pages, 6 figures, 4 tables +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ +
diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse with TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the chosen theme
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the chosen theme
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`